In [1]:
import pandas as pd

# Column names assigned to the header-less CSV, in file order.
col_names = ['sentiment','id','date','query_string','user','text']
data_path = 'data_engineering_tp1.csv'

# Fixed seed for the shuffle. Later cells slice this frame positionally into
# train/test/dev files, so an unseeded shuffle would produce different splits
# (and different scores) on every run.
shuffle_seed = 42

# .sample(frac=1) returns every row in random order, i.e. it shuffles the data;
# random_state pins the permutation so the notebook is reproducible.
tweet_data = pd.read_csv(
    data_path,
    header=None,
    names=col_names,
    encoding="ISO-8859-1",
).sample(frac=1, random_state=shuffle_seed)
tweet_data = tweet_data[['sentiment', 'text']] # Disregard other columns
print(tweet_data.head())

         sentiment                                               text
1031207          4  playing gears, on msn nd watching OTH talk abo...
640034           0  going out for dinner 2night with family i have...
1057372          4  @Mariurdaneta  Hiii Mari  Good afternoon! How ...
112203           0                              Sore  I should sleep.
297014           0  new moon trailer is beautiful! i dont think i ...


In [2]:
import re

# Characters that survive preprocessing; anything else is dropped.
allowed_chars = ' AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz0123456789~`!@#$%^&*()-=_+[]{}|;:",./<>?'
# Characters that get a space inserted on both sides.
punct = '!?,.@#'
# Maximum length (in characters) of the preprocessed text.
maxlen = 280

def preprocess(text):
    """Normalise a raw tweet string.

    Steps, in order:
      1. Every run of non-whitespace characters starting with 'http' is
         replaced by the literal token 'http'.
      2. Characters not listed in ``allowed_chars`` are removed.
      3. Each character listed in ``punct`` is surrounded by single spaces.
      4. The result is cut to at most ``maxlen`` characters (the cut happens
         after the padding spaces have been added).

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string, at most ``maxlen`` characters long.
    """
    without_urls = re.sub(r'http\S+', 'http', text, flags=re.MULTILINE)

    pieces = []
    for symbol in without_urls:
        if symbol not in allowed_chars:
            continue
        if symbol in punct:
            pieces.append(' ' + symbol + ' ')
        else:
            pieces.append(symbol)

    return ''.join(pieces)[:maxlen]


In [3]:
# Clean every tweet with preprocess() and store the result back in 'text'.
# NOTE: this overwrites the column, and preprocess() pads punctuation with
# spaces, so running this cell a second time pads the text again.
cleaned_text = tweet_data['text'].apply(preprocess)
tweet_data['text'] = cleaned_text

In [4]:
# Turn each sentiment value into a '__label__<value>' string
# (presumably the label prefix the corpus reader expects - confirm against
# the flair ClassificationCorpus documentation).
label_prefix = '__label__'
sentiment_text = tweet_data['sentiment'].astype(str)

# The column is overwritten in place, so guard against re-running the cell:
# values that already carry the prefix are kept as they are instead of
# becoming '__label____label__4'.
already_prefixed = sentiment_text.str.startswith(label_prefix)
tweet_data['sentiment'] = sentiment_text.where(already_prefixed, label_prefix + sentiment_text)

In [5]:
# Show the first rows after cleaning and label prefixing.
preview_rows = tweet_data.head()
print(preview_rows)

          sentiment                                               text
1031207  __label__4  playing gears ,  on msn nd watching OTH talk a...
640034   __label__0  going out for dinner 2night with family i have...
1057372  __label__4   @ Mariurdaneta  Hiii Mari  Good afternoon !  ...
112203   __label__0                            Sore  I should sleep . 
297014   __label__0  new moon trailer is beautiful !  i dont think ...


In [6]:
import os

# Create directory for saving data if it does not already exist.
# makedirs(exist_ok=True) checks and creates in a single call, so there is no
# window between the existence test and the creation.
data_dir = './processed-data'
os.makedirs(data_dir, exist_ok=True)

# Save a percentage of the data (you could also only load a fraction of the data instead)
amount = 0.125

# Row positions where each split ends. Only the first `amount` of the rows is
# used; of that slice the first 80% goes to train, the next 10% to test and the
# last 10% to dev.
n_rows = len(tweet_data)
train_end = int(n_rows*0.8*amount)
test_end = int(n_rows*0.9*amount)
dev_end = int(n_rows*1.0*amount)

# Output file name -> (start, stop) positional row range.
split_bounds = {
    'train.csv': (0, train_end),
    'test.csv': (train_end, test_end),
    'dev.csv': (test_end, dev_end),
}

# Each split is written tab-separated, without index or header row.
for file_name, (start, stop) in split_bounds.items():
    tweet_data.iloc[start:stop].to_csv(os.path.join(data_dir, file_name), sep="\t", index=False, header=False)

In [10]:
# Path is needed for the corpus folder argument below; it was not imported in
# any of the visible cells, which makes this cell fail with NameError on a
# fresh kernel.
from pathlib import Path

from flair.datasets import ClassificationCorpus

# Build the corpus from the three files written to data_dir by the export cell.
corpus: ClassificationCorpus = ClassificationCorpus(Path(data_dir), test_file='test.csv', dev_file='dev.csv', train_file='train.csv')

2020-11-05 23:29:56,944 Reading data from processed-data
2020-11-05 23:29:56,946 Train: processed-data\train.csv
2020-11-05 23:29:56,947 Dev: processed-data\dev.csv
2020-11-05 23:29:56,948 Test: processed-data\test.csv


In [13]:
# Number of entries in the corpus' test split (displayed as the cell result).
test_size = len(corpus.test)
test_size

20000

In [11]:
# Build the label dictionary from the corpus; it is handed to the
# TextClassifier constructor in a later cell.
label_dict = corpus.make_label_dictionary()

2020-11-05 23:30:17,981 Computing label dictionary. Progress:


100%|██████████| 180000/180000 [00:38<00:00, 4691.30it/s]

2020-11-05 23:31:21,640 [b'4', b'0']





In [14]:
from flair.embeddings import WordEmbeddings, FlairEmbeddings

# The 'glove' word embeddings, wrapped in a list because the document
# embedding built in the next cell receives the embeddings as a list.
glove_embedding = WordEmbeddings('glove')
word_embeddings = [glove_embedding]

In [15]:
from flair.embeddings import DocumentRNNEmbeddings

# Settings for the RNN that turns the word embeddings of a text into a single
# document embedding.
rnn_options = {
    'hidden_size': 512,
    'reproject_words': True,
    'reproject_words_dimension': 256,
}
document_embeddings = DocumentRNNEmbeddings(word_embeddings, **rnn_options)

In [16]:
from flair.models import TextClassifier

# Classifier built on the document embeddings, predicting the labels collected
# in label_dict.
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

In [17]:
from flair.trainers import ModelTrainer

# Trainer that pairs the classifier with the corpus; training itself is
# started by trainer.train(...) in the next cell.
trainer = ModelTrainer(classifier, corpus)

In [18]:
# Training hyperparameters, collected in one place so they are easy to tune.
train_options = {
    'learning_rate': 0.1,
    'mini_batch_size': 32,
    'anneal_factor': 0.5,
    'patience': 8,
    'max_epochs': 200,
}

# 'model-saves' is the output location; the prediction cell later loads
# 'model-saves/final-model.pt' from it. The call's return value is left as the
# last expression so the notebook displays it.
trainer.train('model-saves', **train_options)

2020-11-05 23:31:54,243 ----------------------------------------------------------------------------------------------------
2020-11-05 23:31:54,244 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
    )
    (word_reprojection_map): Linear(in_features=100, out_features=256, bias=True)
    (rnn): GRU(256, 512, batch_first=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Linear(in_features=512, out_features=2, bias=True)
  (loss_function): CrossEntropyLoss()
  (beta): 1.0
  (weights): None
  (weight_tensor) None
)"
2020-11-05 23:31:54,245 ----------------------------------------------------------------------------------------------------
2020-11-05 23:31:54,246 Corpus: "Corpus: 160000 train + 20000 dev + 20000 test sentences"
2020-11-05 23:31:54,247 ----------------------------------------------------------------------------------------------------
2020-11-05 2

{'test_score': 0.8028,
 'dev_score_history': [0.6936,
  0.7207,
  0.7517,
  0.7555,
  0.7608,
  0.7689,
  0.756,
  0.7705,
  0.7738,
  0.7576,
  0.7704,
  0.7669,
  0.7834,
  0.7802,
  0.7788,
  0.7762,
  0.7829,
  0.7816,
  0.7687,
  0.7882,
  0.786,
  0.7883,
  0.7852,
  0.7812,
  0.7914,
  0.794,
  0.7895,
  0.7895,
  0.793,
  0.7937,
  0.7924,
  0.7882,
  0.7934,
  0.7962,
  0.7963,
  0.7988,
  0.7966,
  0.7966,
  0.795,
  0.7953,
  0.7968,
  0.7996,
  0.7999,
  0.7987,
  0.8003,
  0.7984,
  0.8003,
  0.7967,
  0.7976,
  0.7992,
  0.8004,
  0.7976,
  0.7965],
 'train_loss_history': [0.624899322962761,
  0.5844964899837971,
  0.5692974395155906,
  0.5582426430583001,
  0.5494447723567486,
  0.5447354593813419,
  0.5392156910091639,
  0.5352061906158924,
  0.5320979223668575,
  0.5293366998761893,
  0.5268012016713619,
  0.5240344648748636,
  0.5212140038222075,
  0.5182830595999957,
  0.5182563773185015,
  0.5167611618697643,
  0.5150579145848752,
  0.5139886351495981,
  0.511726247

In [19]:
from flair.data import Sentence

# Reload the classifier from the file saved during training.
classifier = TextClassifier.load('model-saves/final-model.pt')

# Two hand-written examples, cleaned with the same preprocess() that was
# applied to the training text.
pos_sentence, neg_sentence = (
    Sentence(preprocess(raw_text))
    for raw_text in ('I love Python!', 'Python is the worst!')
)

# predict() is called for its effect on each sentence; the labels are read
# from the sentence objects afterwards.
for sentence in (pos_sentence, neg_sentence):
    classifier.predict(sentence)

print(pos_sentence.labels, neg_sentence.labels)

2020-11-06 14:49:49,932 loading file model-saves/final-model.pt
[4 (0.9676)] [0 (0.7939)]
