In [1]:
import pandas

# Path of the annotated-tweet CSV, relative to the working directory.
DATA_PATH_TWEET = r'./annotated data/cleaned_annotated_tweets.csv'

# header=None makes pandas treat the first row as data; the two columns are
# therefore named explicitly.
annotated_tweets = pandas.read_csv(
    DATA_PATH_TWEET,
    names=['tweet', 'label'],
    header=None,
)

In [2]:
import string

# Translation table mapping every ASCII punctuation code point to None,
# which makes str.translate() delete those characters.
translate_table = str.maketrans('', '', string.punctuation)


def remove_punctuation(str):
    """Return the given text with all ``string.punctuation`` characters removed.

    The parameter is named ``str`` (kept for backward compatibility), so the
    builtin ``str`` type is shadowed inside this function body.
    """
    stripped = str.translate(translate_table)
    return stripped

# Strip punctuation from every tweet and write the result back into the
# same column of the frame.
tweets_without_punctuation = annotated_tweets['tweet'].apply(remove_punctuation)
annotated_tweets['tweet'] = tweets_without_punctuation

In [3]:
from sklearn.model_selection import train_test_split

# Features are the tweet texts, targets are their labels.
X = annotated_tweets['tweet']
y = annotated_tweets['label']

# 70/30 train/test split; the fixed random_state keeps it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
)

In [4]:
import nltk
from nltk.corpus import stopwords

# Tweet tokenizer configured with preserve_case=False and reduce_len=True.
tokenizer = nltk.tokenize.TweetTokenizer(reduce_len=True, preserve_case=False)

# NLTK's English stop words plus the extra token 'user'
# (presumably the placeholder left by anonymised @mentions - TODO confirm).
# NOTE(review): the tweets have all punctuation stripped before this point,
# so any stop word that contains an apostrophe can likely never match a
# token - confirm whether that is intended.
set_stop_words = {*stopwords.words('english'), 'user'}

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over unigrams and bigrams, tokenised by the tweet
# tokenizer rather than scikit-learn's default regular expression.
count_vect = CountVectorizer(lowercase=True,
                             ngram_range=(1, 2),
                             analyzer='word',
                             # scikit-learn documents only 'english', a list
                             # or None for stop_words; releases that validate
                             # parameters reject a set at fit time. A sorted
                             # list is accepted and is deterministic.
                             stop_words=sorted(set_stop_words),
                             tokenizer=tokenizer.tokenize,
                             # The regexp is ignored once a tokenizer is
                             # supplied; None avoids the "token_pattern will
                             # not be used" warning.
                             token_pattern=None)

In [6]:
from sklearn.feature_extraction.text import TfidfTransformer

# TF-IDF re-weighting of the raw counts; the arguments below are the
# library defaults, written out so the weighting scheme is visible.
tfidf_transformer = TfidfTransformer(
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
)

In [7]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# Linear SVM with a fixed seed, wrapped in a one-vs-rest multiclass strategy.
base_svm = LinearSVC(random_state=10)
classifier = OneVsRestClassifier(base_svm)

In [8]:
from sklearn.pipeline import Pipeline

# Ordered processing stages: raw text -> n-gram counts -> TF-IDF -> classifier.
pipeline_steps = [
    ('vectorizer', count_vect),
    ('tfidf', tfidf_transformer),
    ('classifier', classifier),
]
sentiment_pipeline = Pipeline(steps=pipeline_steps)

In [9]:
# Train the whole pipeline on the training split only.
sentiment_fit = sentiment_pipeline.fit(X=X_train, y=y_train)

# Predict labels for the held-out tweets.
y_pred = sentiment_fit.predict(X=X_test)

In [10]:
from sklearn.metrics import classification_report

# Per-label precision / recall / F1 on the held-out split.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

             precision    recall  f1-score   support

      anger       0.83      0.65      0.73       307
       fear       0.86      0.66      0.75       237
        joy       0.79      0.80      0.79      1868
    sadness       0.73      0.77      0.75      1836

avg / total       0.77      0.77      0.77      4248

