### Sentiment Analysis - GloVe100

In this notebook, we work with the `GloVe100` embedding representations of tweets, in combination with other simple features that are engineered from the raw tweet string.

In [50]:
import pandas as pd
import numpy as np

import ast

from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.preprocessing import MinMaxScaler
from math import floor, sqrt

### Data Loading & Preprocessing

We do the usual data loading and preprocessing, we apply a min-max scaling of the data to bring all feature values to a $[-1, 1]$ range. 

In [35]:
def load_data(df_csv) -> pd.DataFrame:
    # loads df_glove data
    df_glove = []

    for i in range(df_csv.shape[0]):
        glove = ast.literal_eval(df_csv['tweet'].iloc[i])
        df_glove.append(glove)
        
    return pd.DataFrame(df_glove, columns=["f"+str(i) for i in range(1, 101)]), df_csv['sentiment']

df_from_csv = pd.read_csv("train_glove.csv")
X, y = load_data(df_from_csv)

In [52]:
stop_words = set(stopwords.words('english')) # we create this as a global variable
    

def num_chars(tweet: str):
    return len(tweet)


def num_words(tweet: str):
    return len(tweet.split())


def num_cap_words(tweet: str):
    return sum(map(str.isupper, tweet.split()))


def num_hashtags(tweet: str): 
    return tweet.count('#')


def num_mentions(tweet: str):
    return tweet.count('@')


def num_stopwords(tweet: str):  
    word_tokens = word_tokenize(tweet)
    stopwords_tweet = [w for w in word_tokens if w in stop_words]
    return len(stopwords_tweet)

def num_punctuations(tweet: str):
    punctuations='!"$%&\'()*+,-./:;<=>?[\]^_`{|}~'
    count = 0
    for char in tweet:
        if char in punctuations: count += 1
    return count


engineered_features = [num_chars, num_words, num_cap_words, num_hashtags, num_mentions, 
                       num_stopwords, num_punctuations]

In [71]:
def load_raw_features(filename: str):
    df_raw = pd.read_csv(filename)
    
    for feature in engineered_features:         
        df_raw[feature.__name__] = df_raw["tweet"].apply(lambda x: feature(x))
    
    df_raw['avg_wordlength'] = df_raw['num_chars']/df_raw['num_words']    
    df_raw.drop(['sentiment', 'tweet_id', 'tweet'], axis=1, inplace=True)
    return df_raw


def preprocess(df: pd.DataFrame):
    # Preprocessing applied a min-max scaling of raw engineered features
    min_max_scaler = MinMaxScaler()
    scaled = min_max_scaler.fit_transform(df)
    return pd.DataFrame(scaled, columns = [feature.__name__ for feature in engineered_features]+['avg_wordlength'])


X_raw    = load_raw_features("train_full.csv")
X_raw_scaled = preprocess(X_raw)

In [73]:
# create full data set by merging orginal X with raw features
assert X_raw.shape[0] == X.shape[0]

X_full        = pd.merge(X, X_raw, left_index=True, right_index=True)
X_full_scaled = pd.merge(X, X_raw_scaled, left_index=True, right_index=True)

### Models

We now fit the datasets to a variety of traditional statistical learning algorithms

In [74]:
from sklearn.linear_model import LogisticRegression

l2_logistic_raw = LogisticRegression(penalty='l2', random_state=0, max_iter=10000).fit(X_full, y)
l1_logistic_raw = LogisticRegression(penalty='none',random_state=0, max_iter=10000).fit(X_full, y)

l2_logistic_std = LogisticRegression(penalty='l2', random_state=0, max_iter=10000).fit(X_full_scaled, y)
l1_logistic_std = LogisticRegression(penalty='none',random_state=0, max_iter=10000).fit(X_full_scaled, y)

In [76]:
from sklearn.naive_bayes import GaussianNB

gaussian_nb_raw = GaussianNB().fit(X_full, y)
gaussian_nb_std = GaussianNB().fit(X_full_scaled, y)

In [77]:
from sklearn.neighbors import KNeighborsClassifier

L2_knn_raw = KNeighborsClassifier(n_neighbors=floor(sqrt(X.shape[0]))).fit(X_full, y)
L2_knn_std = KNeighborsClassifier(n_neighbors=floor(sqrt(X.shape[0]))).fit(X_full_scaled, y)

In [78]:
from sklearn.ensemble import RandomForestClassifier

RF_raw = RandomForestClassifier(max_depth=None, random_state=0).fit(X_full, y)
RF_std = RandomForestClassifier(max_depth=None, random_state=0).fit(X_full_scaled, y)

### Model Evaluation

We now evaluate model performance

In [37]:
def transformation_pipeline(X: pd.DataFrame):
    return load_data(X)

In [81]:
TrainingSet = pd.read_csv('dev_glove.csv')
X_test, y_test = transformation_pipeline(TrainingSet)

X_test_raw = load_raw_features('dev_full.csv')
X_test_raw_scaled = preprocess(X_test_raw)


X_test_full = pd.merge(X_test, X_test_raw, left_index=True, right_index=True)
X_test_full_scaled = pd.merge(X_test, X_test_raw_scaled, left_index=True, right_index=True)
print(f"{X_test.shape[0]} testing instances, {X_test_full.shape[1]} features")

19906 testing instances, 108 features


In [84]:
from sklearn.metrics import f1_score, accuracy_score

l2_lr_pred     = l1_logistic_raw.predict(X_test_full)
l2_lr_pred_std = l1_logistic_std.predict(X_test_full_scaled)

print(f"Raw Logistic Regression Accuary score : {accuracy_score(l2_lr_pred, y_test)}")
print(f"Raw Logistic Regression f1 score      : {f1_score(l2_lr_pred, y_test, average='micro')}")
print(f"Std Logistic Regression Accuary score : {accuracy_score(l2_lr_pred_std, y_test)}")
print(f"Std Logistic Regression f1 score      : {f1_score(l2_lr_pred_std, y_test, average='micro')}\n")


nb_pred     = gaussian_nb_raw.predict(X_test_full)
nb_pred_std = gaussian_nb_std.predict(X_test_full_scaled)
print(f"Raw Gaussian Naive Bayes Accuary score   : {accuracy_score(nb_pred, y_test)}")
print(f"Raw Gaussian Naive Bayes f1 score        : {f1_score(nb_pred, y_test, average='micro')}")
print(f"Std Gaussian Naive Bayes Accuary score   : {accuracy_score(nb_pred_std, y_test)}")
print(f"Std Gaussian Naive Bayes f1 score        : {f1_score(nb_pred_std, y_test, average='micro')}\n")


knn_pred     = L2_knn_raw.predict(X_test_full)
knn_pred_std = L2_knn_std.predict(X_test_full_scaled)
print(f"Raw Gaussian Naive Bayes Accuary score   : {accuracy_score(knn_pred, y_test)}")
print(f"Raw Gaussian Naive Bayes f1 score        : {f1_score(knn_pred, y_test, average='micro')}")
print(f"Std Gaussian Naive Bayes Accuary score   : {accuracy_score(knn_pred_std, y_test)}")
print(f"Std Gaussian Naive Bayes f1 score        : {f1_score(knn_pred_std, y_test, average='micro')}\n")


RF_raw_pred = RF_raw.predict(X_test_full)
RF_std_pred = RF_std.predict(X_test_full_scaled)
print(f"Raw RF Accuary score   : {accuracy_score(RF_raw_pred, y_test)}")
print(f"Raw RF f1 score        : {f1_score(RF_raw_pred, y_test, average='micro')}")
print(f"Std RF Accuary score   : {accuracy_score(RF_std_pred, y_test)}")
print(f"Std RF f1 score        : {f1_score(RF_std_pred, y_test, average='micro')}")

Raw Logistic Regression Accuary score : 0.6962222445493821
Raw Logistic Regression f1 score      : 0.6962222445493821
Std Logistic Regression Accuary score : 0.6888877725308952
Std Logistic Regression f1 score      : 0.6888877725308952

Raw Gaussian Naive Bayes Accuary score   : 0.4594092233497438
Raw Gaussian Naive Bayes f1 score        : 0.4594092233497438
Std Gaussian Naive Bayes Accuary score   : 0.46046418165377273
Std Gaussian Naive Bayes f1 score        : 0.4604641816537728

Raw Gaussian Naive Bayes Accuary score   : 0.6218225660604842
Raw Gaussian Naive Bayes f1 score        : 0.6218225660604842
Std Gaussian Naive Bayes Accuary score   : 0.613935496835125
Std Gaussian Naive Bayes f1 score        : 0.613935496835125

Raw RF Accuary score   : 0.682507786597006
Raw RF f1 score        : 0.682507786597006
Std RF Accuary score   : 0.680548578318095
Std RF f1 score        : 0.680548578318095
