**Vanilla code created by Peter Nagy, February 2017 <br/>
Growing Sets method implemented by Ömer Kurttekin, April 2020** <br/>
[Nagy's Github][1] <br/>
[Kurttekin's Github][2] <br/><br/> 
[Nagy's Linkedin](https://www.linkedin.com/in/peternagyjob/) <br/>
**Sentiment Analysis:** the process of computationally identifying and categorizing opinions expressed in a piece of text, especially in order to determine whether the writer's attitude towards a particular topic, product, etc. is positive, negative, or neutral.

Dataset (Kaggle): [First GOP Debate Twitter Sentiment][3]

  [1]: https://github.com/nagypeterjob
  [2]: https://github.com/Omerktn
  [3]: https://www.kaggle.com/crowdflower/first-gop-debate-twitter-sentiment

In [0]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import model_from_json
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
import datetime

#from google.colab import drive
#drive.mount('/content/drive')

Only keeping the necessary columns.

In [0]:
# Folder that holds the dataset CSV.
path = "/<data_directory>/"

# Read the debate tweets and keep just the two columns used below:
# the tweet text and its sentiment label.
data = pd.read_csv(path + 'GOP_twitter_sent.csv').loc[:, ['text', 'sentiment']]

Next, I am dropping the 'Neutral' sentiments as my goal was to only differentiate positive and negative tweets. After that, I am filtering the tweets so only valid texts and words remain.  Then, I define the number of max features as 2000 and use Tokenizer to vectorize and convert text into Sequences so the Network can deal with it as input.

In [0]:
# Binary task: discard neutral tweets and keep only the other labels.
# .copy() detaches the filtered frame from the original so the column
# assignments below act on it directly (avoids pandas' SettingWithCopyWarning).
data = data[data.sentiment != "Neutral"].copy()
data['text'] = data['text'].str.lower()
# Strip everything except letters, digits and whitespace. The class must be
# A-Z: the range A-z also spans the ASCII symbols [ \ ] ^ _ ` and would leave
# them in the text.
data['text'] = data['text'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

"""
To balance positive and negative labels, I created a little filter,
it limits the maximum number of label for each label type.
"""

# Cap on how many tweets are kept per label: the first `limit_each_sentiment`
# tweets of each label (in row order) stay, every later one is dropped.
limit_each_sentiment = 2000

# Anything that is not 'Positive' is counted as negative.
is_positive = (data['sentiment'] == 'Positive').values
# 0-based running position of each tweet among the tweets sharing its label.
rank_in_label = data.groupby(is_positive).cumcount().values
over_limit = rank_in_label >= limit_each_sentiment

# Number of tweets kept per label (at most `limit_each_sentiment` each).
poscount = int((is_positive & ~over_limit).sum())
negcount = int((~is_positive & ~over_limit).sum())

# Index labels of the surplus tweets of each label.
posdroplist = data.index[over_limit & is_positive].tolist()
negdroplist = data.index[over_limit & ~is_positive].tolist()

print(negdroplist)
data = data.drop(negdroplist + posdroplist)

# Remove the retweet marker "rt" from every tweet. This is done on the column
# itself rather than inside an iterrows() loop, because iterrows() yields row
# copies and edits made to them are never written back to the DataFrame.
# The word boundaries keep words that merely contain "rt" (e.g. "party") intact.
data['text'] = data['text'].str.replace(r'\brt\b', ' ', regex=True)
    
# Vocabulary size limit handed to the tokenizer (and, further down, used as
# the embedding layer's input dimension).
max_fatures = 2000

# Build the word index from the cleaned tweets; tokens are split on spaces.
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)

# Encode each tweet as a sequence of word indices, then pad the sequences so
# they can be stacked into a single array.
X = pad_sequences(tokenizer.texts_to_sequences(data['text'].values))

Next, I compose the LSTM network. Note that the **embed_dim**, **lstm_out**, **batch_size** and **dropout_x** variables are hyperparameters; their values are somewhat intuitive, and they can and must be experimented with in order to achieve good results. Please also note that I am using softmax as the activation function. The reason is that our network uses categorical cross-entropy as its loss, and softmax is the right output activation for that.

In [0]:
# Hyperparameters: embedding vector size and number of LSTM units.
embed_dim = 128
lstm_out = 196

# Embedding -> spatial dropout -> LSTM -> 2-way softmax classifier.
model = Sequential([
    Embedding(max_fatures, embed_dim, input_length=X.shape[1]),
    SpatialDropout1D(0.4),
    LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
    Dense(2, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the layer table itself and returns None, so it must not be
# wrapped in print() (that would emit an extra "None" line).
model.summary()

Here I split the data into the training and test datasets.

In [0]:
# One-hot encode the sentiment labels: one indicator column per label value.
Y = pd.get_dummies(data['sentiment']).values

# Hold out 33% of the samples for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)

# Show the (features, labels) shapes of the training and test partitions.
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

In [0]:
def get_losses(x_in, y_in, model):
    """Evaluate `model` on every sample separately and collect the results.

    Each sample is passed to ``model.evaluate`` as a one-element slice with
    ``batch_size=1``; the first value of the returned sequence (the loss, for
    a Keras model compiled with metrics) is recorded.

    Returns:
        A list with one entry per sample of `x_in`, in input order.
    """
    def single_sample_loss(pos):
        window = slice(pos, pos + 1)
        scores = model.evaluate(x_in[window], y_in[window], batch_size=1, verbose=0)
        return scores[0]

    return [single_sample_loss(pos) for pos in range(len(x_in))]

In [0]:
def _fit_growing_step(model, x_part, y_part, X_test, Y_test, num_epochs, batch_size):
    """Train on one growing-set step and return the accuracy measured on the test set."""
    model.fit(x_part, y_part, epochs=num_epochs, shuffle=False, batch_size=batch_size, verbose=0)
    print("Instant LEN: {}".format(len(x_part)))

    inst_acc = model.evaluate(X_test, Y_test)[1]
    print("Acc: {}".format(inst_acc))
    return inst_acc


def _loss_ranked_subset(X_train, Y_train, model, count, highest_first):
    """Return the `count` training samples with the lowest (or highest) current loss."""
    order = np.argsort(get_losses(X_train, Y_train, model))
    if highest_first:
        order = order[::-1]
    chosen = order[:count]
    return np.asarray(X_train)[chosen], np.asarray(Y_train)[chosen]


def train_with_growing_sets(X_train, Y_train, X_test, Y_test, model, sets_method=None,
                            data_limit=None, div=16, num_epochs=30, batch_size=128):
    """Train `model` with a growing-sets schedule, printing test accuracy after each step.

    Supported methods (case-insensitive):
      "SPL"   Self Paced Learning: before every step the per-sample losses are
              recomputed and the lowest-loss samples form the training subset.
      "SPLI"  Reversed Self Paced Learning: same, but highest-loss samples first.
      "ROGS"  Random Ordered Growing Datasets: the subset is a growing prefix
              of the training data in its given order.
      "BASE"  Baseline: the whole training set, one epoch at a time.

    For the growing methods the training data is cut into `div` pieces of
    ``len(X_train) // div`` samples; step k trains on k pieces for
    `num_epochs` epochs. For "BASE", `num_epochs` single-epoch fits are run
    and `div` is not used.

    Args:
        X_train, Y_train: Training samples and labels.
        X_test, Y_test: Data used to measure accuracy after every step.
        model: Compiled model offering ``fit`` and ``evaluate``; ``evaluate``
            must return a sequence whose second item is the accuracy.
        sets_method: (str) Name of the growing-sets method, see above.
        data_limit: (int) Use only the first `data_limit` training samples;
            a falsy value means use all of them.
        div: (int) Number of pieces the training data is split into.
        num_epochs: (int) Epochs per growing step ("BASE": total epochs).
        batch_size: (int) Batch size passed to ``model.fit``.

    Returns:
        None. Accuracies, subset sizes and elapsed time are printed. An
        unrecognized `sets_method` prints a message and returns without training.

    Raises:
        ValueError: If a growing method is requested and `div` is smaller than
            1 or larger than the number of training samples in use.
    """
    custom_methods = ["spl", "spli", "rogs", "base"]

    is_str = isinstance(sets_method, str)
    if is_str:
        sets_method = sets_method.lower()
    if not is_str or sets_method not in custom_methods:
        print("Sets method '{}' cannot recognized.".format(sets_method))
        return

    if not data_limit:
        data_limit = len(X_train)
    X_train, Y_train = X_train[:data_limit], Y_train[:data_limit]

    growing = sets_method != "base"
    if growing and not 1 <= div <= len(X_train):
        # Otherwise a piece would hold zero samples (or the division would fail).
        raise ValueError(
            "div must be between 1 and the number of training samples ({}), got {}".format(
                len(X_train), div))

    # `div` is the caller's argument; it must not be read from a notebook global.
    print("Method: {} - Div: {} - Epoch: {}".format(sets_method.upper(), div, num_epochs))
    start_t = datetime.datetime.now().replace(microsecond=0)

    accs, sizes = [], []

    if growing:
        slen = len(X_train) // div  # samples added per step
        for step in range(1, div + 1):
            count = step * slen
            if sets_method == "rogs":
                x_part, y_part = X_train[:count], Y_train[:count]
            else:
                # Losses are re-ranked with the current model state at every step.
                x_part, y_part = _loss_ranked_subset(
                    X_train, Y_train, model, count, highest_first=(sets_method == "spli"))
            accs.append(_fit_growing_step(model, x_part, y_part, X_test, Y_test,
                                          num_epochs, batch_size))
            sizes.append(len(x_part))
    else:
        # Baseline: fit one epoch at a time so accuracy can be logged per epoch.
        for _ in range(num_epochs):
            model.fit(X_train, Y_train, epochs=1, shuffle=True, batch_size=batch_size, verbose=0)

            inst_acc = model.evaluate(X_test, Y_test)[1]
            print("Acc: {}".format(inst_acc))
            accs.append(inst_acc)
            sizes.append(len(X_train))

    print("{}_accs = {}".format(sets_method, accs))

    end_t = datetime.datetime.now().replace(microsecond=0)
    print("{}_sizes = {}".format(sets_method, sizes))
    print("Time elapsed: {}".format(end_t - start_t))

In [0]:
# Settings for this run: epochs per growing step, number of pieces the
# training data is split into, and the mini-batch size.
num_epochs = 10
k_div=10
batch_size = 400

# Reversed self-paced learning ("SPLI") on the first 100 training samples.
train_with_growing_sets(
    X_train, Y_train, X_test, Y_test, model,
    sets_method="SPLI",
    data_limit=100,
    div=k_div,
    num_epochs=num_epochs,
    batch_size=batch_size,
)

In [0]:
# LOAD MODEL
# NOTE(review): this cell reads from the "models" subfolder of `path`, while
# the save cell writes directly into the data directory — confirm which
# location is intended.
name = "model_name"
modelpath = "{}/models/{}".format(path, name)

# Rebuild the architecture from its JSON description...
with open(modelpath + ".json", 'r') as f:
    model = model_from_json(f.read())
# ...then restore the stored weights and compile the model with the same
# loss, optimizer and metric used when it was first built.
model.load_weights(modelpath + ".h5")
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [0]:
# SAVE MODEL
# The model is stored as two files sharing one base path:
# <modelpath>.h5 holds the weights, <modelpath>.json the architecture.
name = "model_name"
modelpath = "/<data_directory>/{}".format(name)
print(modelpath)

model.save_weights(modelpath + ".h5")
with open(modelpath + ".json", 'w') as f:
    f.write(model.to_json())