In [1]:
import pandas as pd

col_names = ['tweethash', 'sentiment', 'text']
data_path = 'SemEval2017-sentiment_english.csv'
# Fixed seed: without it every run shuffles differently, so the train/test/dev
# files written later (and any score computed from them) cannot be reproduced.
shuffle_seed = 42

# The CSV has no header row, so the column names are supplied explicitly.
# .sample(frac=1) returns every row in random order, i.e. it shuffles the data.
tweet_data = pd.read_csv(
    data_path, header=None, names=col_names, encoding="ISO-8859-1"
).sample(frac=1, random_state=shuffle_seed)
tweet_data = tweet_data[['sentiment', 'text']]  # Disregard other columns
print(tweet_data.head())

               sentiment                                               text
9605    __label__neutral  =@MyNintendoNews @Kurtoise13 may No longer nee...
19115  __label__negative  Amy Schumer sat down with The Hollywood Report...
19208   __label__neutral  Walking around Milan on a Thursday night, #Sto...
8513   __label__negative  @KuffarCoffee I'm talking only about Christian...
5561   __label__positive  my manga phase started in 3rd grade with Narut...


In [2]:
import re

# Characters that survive preprocessing; anything else (apostrophes, non-ASCII
# symbols, tabs, newlines, ...) is dropped.
allowed_chars = ' AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz0123456789~`!@#$%^&*()-=_+[]{}|;:",./<>?'
# Characters that get a space inserted on both sides.
punct = '!?,.@#'
# Maximum length of the returned string.
maxlen = 280

def preprocess(text):
    """Normalise a raw tweet string.

    Steps, in order:
      1. every run of non-whitespace starting with 'http' is replaced by the
         bare token 'http';
      2. characters not listed in ``allowed_chars`` are removed;
      3. each character listed in ``punct`` is wrapped in single spaces;
      4. the result is cut to at most ``maxlen`` characters (the cut happens
         after the padding in step 3).

    Args:
        text: the raw tweet, as a ``str``.

    Returns:
        The cleaned string.
    """
    without_urls = re.sub(r'http\S+', 'http', text, flags=re.MULTILINE)

    pieces = []
    for symbol in without_urls:
        if symbol not in allowed_chars:
            continue
        if symbol in punct:
            pieces.append(' ' + symbol + ' ')
        else:
            pieces.append(symbol)

    return ''.join(pieces)[:maxlen]


In [3]:
# Clean every tweet. NOTE: preprocess() pads punctuation with spaces, so running
# this cell a second time pads the already-padded text again.
tweet_data['text'] = tweet_data['text'].map(preprocess)

In [4]:
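# Disabled: the sentiment column already carries the '__label__' prefix (see the head() output above).
#tweet_data['sentiment'] = '__label__' + tweet_data['sentiment'].astype(str)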

In [5]:
import csv
import os

# Create directory for saving data if it does not already exist
data_dir = './processed-data'
os.makedirs(data_dir, exist_ok=True)  # no-op when the directory is already there

# Save a percentage of the data (you could also only load a fraction of the data instead)
amount = 0.125

# Row offsets of the 80% / 10% / 10% train / test / dev split, taken from the
# first `amount` of the (already shuffled) frame.
n_rows = len(tweet_data)
train_end = int(n_rows * 0.8 * amount)
test_end = int(n_rows * 0.9 * amount)
dev_end = int(n_rows * 1.0 * amount)

# One "<label>\t<text>" line per tweet, no header and no index column.
# QUOTE_NONE stops pandas from wrapping texts that contain '"' in extra quotes
# (and doubling the inner ones), which would make the training text differ from
# what preprocess() produces at prediction time. preprocess() has already
# removed tabs and newlines, so no field needs escaping.
csv_options = dict(sep='\t', index=False, header=False, quoting=csv.QUOTE_NONE)

tweet_data.iloc[:train_end].to_csv(data_dir + '/train.csv', **csv_options)
tweet_data.iloc[train_end:test_end].to_csv(data_dir + '/test.csv', **csv_options)
tweet_data.iloc[test_end:dev_end].to_csv(data_dir + '/dev.csv', **csv_options)


In [6]:
from flair.data_fetcher import NLPTaskDataFetcher
from pathlib import Path

#corpus = NLPTaskDataFetcher.load_classification_corpus(Path(data_dir), test_file='test.csv', dev_file='dev.csv', train_file='train.csv')


from flair.data import Corpus
from flair.datasets import ClassificationCorpus

# data_dir (set in the save cell above) is the folder in which the train, test and dev files reside


# Load the corpus containing training, test and dev data. No file names are
# passed; the log below shows train.csv / dev.csv / test.csv being picked up
# from the folder.
corpus: Corpus = ClassificationCorpus(Path(data_dir))

2020-03-10 11:17:21,119 Reading data from processed-data
2020-03-10 11:17:21,120 Train: processed-data/train.csv
2020-03-10 11:17:21,120 Dev: processed-data/dev.csv
2020-03-10 11:17:21,120 Test: processed-data/test.csv


In [7]:
# Build the label dictionary from the corpus (the log below lists neutral,
# negative and positive); it is passed to TextClassifier as label_dictionary.
label_dict = corpus.make_label_dictionary()

2020-03-10 11:17:21,147 Computing label dictionary. Progress:


100%|██████████| 2063/2063 [00:00<00:00, 3973.30it/s]

2020-03-10 11:17:21,725 [b'neutral', b'negative', b'positive']





In [8]:
from flair.embeddings import WordEmbeddings, FlairEmbeddings

# Token-level embeddings that DocumentRNNEmbeddings stacks together:
# GloVe word vectors plus the forward and backward 'news' Flair language models.
word_embeddings = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]

In [9]:
from flair.embeddings import DocumentRNNEmbeddings

# One embedding per tweet: the stacked token embeddings are re-projected to
# 256 dimensions and fed through an RNN with hidden size 512.
document_embeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

In [10]:
from flair.models import TextClassifier

# Text classifier on top of the document embedding, using the labels
# collected in label_dict.
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=label_dict,
)

In [11]:
from flair.trainers import ModelTrainer

# Bind the classifier to the corpus it is trained and evaluated on.
trainer = ModelTrainer(classifier, corpus)

In [12]:
# Train the classifier. Output is written under 'model-saves'; the prediction
# cell further down loads 'model-saves/best-model.pt' from there.
# The call is the last expression of the cell, so the returned dict
# (test_score, dev_score_history, train_loss_history, ...) is shown as output.
trainer.train('model-saves',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=8,
              max_epochs=40)

2020-03-10 11:17:23,343 ----------------------------------------------------------------------------------------------------
2020-03-10 11:17:23,344 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05, inplace=False)
          (encoder): Embedding(300, 100)
          (rnn): LSTM(100, 2048)
          (decoder): Linear(in_features=2048, out_features=300, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05, inplace=False)
          (encoder): Embedding(300, 100)
          (rnn): LSTM(100, 2048)
          (decoder): Linear(in_features=2048, out_features=300, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=4196, out_features=256, bias=True)
    (rnn): GRU(256, 512, ba

{'test_score': 0.6008,
 'dev_score_history': [0.5233,
  0.5078,
  0.3992,
  0.531,
  0.4574,
  0.5543,
  0.4922,
  0.5233,
  0.4457,
  0.5543,
  0.5426,
  0.5698,
  0.5155,
  0.438,
  0.5543,
  0.4884,
  0.593,
  0.593,
  0.5891,
  0.5543,
  0.5349,
  0.6124,
  0.531,
  0.5853,
  0.4457,
  0.6085,
  0.5853,
  0.469,
  0.5969,
  0.5504,
  0.5388,
  0.531,
  0.5969,
  0.5969,
  0.5736,
  0.6008,
  0.5659,
  0.5775,
  0.5853,
  0.5853],
 'train_loss_history': [1.004712030520806,
  0.9557552704444299,
  0.9311877351540786,
  0.9095997929573059,
  0.8979718501751239,
  0.8706739865816556,
  0.8616885891327491,
  0.8527714087412908,
  0.8264533758163453,
  0.8400163375414335,
  0.8227337342042189,
  0.8095430530034579,
  0.7824475329655868,
  0.8082324954179617,
  0.7910528989938589,
  0.7834637366808378,
  0.7674535742172828,
  0.7642508415075449,
  0.7736767356212323,
  0.7530145425062913,
  0.7562482522084163,
  0.7530900340813856,
  0.7383536586394677,
  0.7297306950275715,
  0.743379745

In [21]:
from flair.data import Sentence
from flair.models import TextClassifier

import re

# Same cleaning rules as used when the training files were written, repeated
# here so this cell can run on its own.
# Characters kept by preprocess(); every other character is dropped.
allowed_chars = ' AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz0123456789~`!@#$%^&*()-=_+[]{}|;:",./<>?'
# Characters that are surrounded with spaces.
punct = '!?,.@#'
# Upper bound on the length of the cleaned text.
maxlen = 280

def preprocess(text):
    """Clean a raw text string.

    Every run of non-whitespace beginning with 'http' collapses to the token
    'http'; characters missing from ``allowed_chars`` are discarded;
    characters in ``punct`` are padded with one space on each side; finally
    the padded text is truncated to ``maxlen`` characters.

    Args:
        text: raw input ``str``.

    Returns:
        The cleaned ``str``.
    """
    collapsed = re.sub(r'http\S+', 'http', text, flags=re.MULTILINE)

    kept = []
    for ch in collapsed:
        if ch in allowed_chars:
            kept.append(' ' + ch + ' ' if ch in punct else ch)

    cleaned = ''.join(kept)
    return cleaned[:maxlen]


# Restore the model saved during training.
classifier = TextClassifier.load('model-saves/best-model.pt')

# Two hand-written examples, cleaned the same way as the training text.
pos_sentence, neg_sentence = (
    Sentence(preprocess(raw_text))
    for raw_text in ('I love Python!', 'Python is the worst!')
)

# predict() attaches the predicted label to each Sentence in place.
for sentence in (pos_sentence, neg_sentence):
    classifier.predict(sentence)

print(pos_sentence.labels, "\n", neg_sentence.labels)


2020-03-10 19:13:04,333 loading file model-saves/best-model.pt
[positive (0.6372696757316589)] 
 [positive (0.38149163126945496)]


In [38]:
# A few more hand-written examples, cleaned with preprocess() before prediction.
random_sentence = Sentence(preprocess("Merry Christmas to all those soldiers out there that couldn't make it home!"))
random_sentence2 = Sentence(preprocess("I love you all ❤"))
random_sentence3 = Sentence(preprocess("I'm bad good'"))

examples = (random_sentence, random_sentence2, random_sentence3)

# predict() attaches the predicted label to each Sentence in place.
for example in examples:
    classifier.predict(example)

# Print only the predicted class name. Label.value gives it directly, instead
# of splitting the "name (score)" text of str(label), which would break if
# that text format changed or a label name contained whitespace.
for example in examples:
    print(example.labels[0].value)

negative
positive
neutral
