In [1]:
import pandas

# Location of the merged annotated-tweet file, relative to the notebook.
DATA_PATH_TWEET = r'./annotated data/merged_tweets.csv'

# The file is read as a CSV without a header row, so the two column names
# are supplied explicitly.
annotated_tweets = pandas.read_csv(
    DATA_PATH_TWEET,
    names=['tweet', 'label'],
    header=None,
)

Remove punctuation

In [2]:
import string

# Translation table mapping every character of string.punctuation to None,
# which makes str.translate() drop those characters.
translate_table = str.maketrans('', '', string.punctuation)


def remove_punctuation(str):
    """Return ``str`` with every ``string.punctuation`` character removed.

    The parameter name shadows the built-in ``str``; it is kept unchanged so
    that existing callers (including keyword callers) keep working.
    """
    cleaned = str.translate(translate_table)
    return cleaned

# Strip punctuation from every tweet and write the result back into the
# 'tweet' column.
tweets_without_punctuation = annotated_tweets['tweet'].apply(remove_punctuation)
annotated_tweets['tweet'] = tweets_without_punctuation

Split data into training and test sets

In [3]:
from sklearn.model_selection import train_test_split

# Inputs are the tweet texts, targets are their labels.
X = annotated_tweets['tweet']
y = annotated_tweets['label']

# 70/30 train/test split; the fixed seed keeps the partition reproducible.
# NOTE(review): the split is not stratified by label — consider stratify=y
# if the label distribution is skewed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
)

Import Twitter tokenizer and set of stop words

In [4]:
import nltk
from nltk.corpus import stopwords

# Tweet-aware tokenizer: tokens are lower-cased (preserve_case=False) and
# long runs of a repeated character are shortened (reduce_len=True).
# The class is taken from its documented public location, nltk.tokenize,
# instead of nltk.casual, which is only reachable as a star-import side effect.
tokenizer = nltk.tokenize.TweetTokenizer(preserve_case=False, reduce_len=True)

# English stop words from the NLTK corpus. On a fresh environment the corpus
# may not be installed yet; fetch it once and retry instead of failing the
# whole notebook run.
try:
    english_stop_words = stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    english_stop_words = stopwords.words('english')

set_stop_words = set(english_stop_words)

Define features

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features over unigrams and bigrams, tokenized with the
# tweet-aware NLTK tokenizer.
# NOTE(review): punctuation is stripped from the tweets before vectorizing,
# so stop words that contain an apostrophe can no longer match — confirm
# whether the stop-word list should be normalized the same way.
count_vect = CountVectorizer(
    lowercase=True,
    ngram_range=(1, 2),
    analyzer='word',
    # scikit-learn validates stop_words as 'english', a list or None, so the
    # set is converted to a list (sorted, to keep the order deterministic).
    stop_words=sorted(set_stop_words),
    tokenizer=tokenizer.tokenize,
    # With a custom tokenizer the default token_pattern is never used;
    # None states that explicitly and avoids the related warning.
    token_pattern=None,
)

In [6]:
from sklearn.feature_extraction.text import TfidfTransformer

# TF-IDF re-weighting step with library-default settings; it is placed after
# the count vectorizer in the pipelines below.
tfidf_transformer = TfidfTransformer()

Create classifier and pipeline

In [7]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# Linear SVM wrapped in an explicit one-vs-rest scheme. The fixed
# random_state keeps the LinearSVC solver reproducible.
classifier = OneVsRestClassifier(
    estimator=LinearSVC(random_state=10),
)

In [8]:
from sklearn.pipeline import Pipeline

# Text -> n-gram counts -> TF-IDF weights -> one-vs-rest linear SVM.
sentiment_pipeline = Pipeline(steps=[
    ('vectorizer', count_vect),
    ('tfidf', tfidf_transformer),
    ('classifier', classifier),
])

Train and predict

In [9]:
# Fit every step of the SVM pipeline on the training tweets.
sentiment_fit = sentiment_pipeline.fit(X=X_train, y=y_train)

# Predict a label for each tweet in the held-out test set.
y_pred = sentiment_fit.predict(X=X_test)

Score the results

In [10]:
from sklearn.metrics import classification_report

# Per-label precision, recall, F1 and support for the SVM pipeline.
print(classification_report(y_true=y_test, y_pred=y_pred))

             precision    recall  f1-score   support

      anger       0.94      0.82      0.87       368
       fear       0.86      0.72      0.78       236
        joy       0.81      0.83      0.82      1875
    sadness       0.78      0.79      0.79      1838

avg / total       0.81      0.81      0.81      4317



In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from sklearn.base import clone

# Logistic-regression classifier with library-default hyperparameters.
classifier_log = LogisticRegression()

# Pipeline.fit() fits its steps in place, so passing count_vect and
# tfidf_transformer directly would make this pipeline and sentiment_pipeline
# share (and overwrite) the same fitted objects. Unfitted clones with the
# same parameters give this pipeline its own independent state.
sentiment_pipeline_log_regr = Pipeline([
    ('vectorizer', clone(count_vect)),
    ('tfidf', clone(tfidf_transformer)),
    ('classifier', classifier_log),
])

In [12]:
from sklearn.metrics import accuracy_score

# Fit every step of the logistic-regression pipeline on the training tweets.
sentiment_fit_reg = sentiment_pipeline_log_regr.fit(X=X_train, y=y_train)

# Predict a label for each tweet in the held-out test set.
y_pred_reg = sentiment_fit_reg.predict(X=X_test)
    

In [13]:
from sklearn.metrics import classification_report

# Per-label precision, recall, F1 and support for the logistic-regression pipeline.
print(classification_report(y_true=y_test, y_pred=y_pred_reg))

             precision    recall  f1-score   support

      anger       0.95      0.33      0.49       368
       fear       0.98      0.19      0.31       236
        joy       0.75      0.82      0.78      1875
    sadness       0.70      0.80      0.75      1838

avg / total       0.76      0.74      0.72      4317

