### Install Dependencies

In [1]:
!pip install kaggle contractions



### Import Dependencies

In [2]:
import os
from getpass import getpass

# SECURITY: Kaggle credentials must never be stored in the notebook. The API key
# that used to be hardcoded here has been published with the file and should be
# revoked in the Kaggle account settings.
# The Kaggle client reads KAGGLE_USERNAME / KAGGLE_KEY from the environment or
# falls back to ~/.kaggle/kaggle.json; only prompt when neither is available.
# This has to run before the kaggle package is imported below.
_kaggle_json = os.path.join(os.path.expanduser('~'), '.kaggle', 'kaggle.json')
if not os.path.exists(_kaggle_json):
    for _credential in ('KAGGLE_USERNAME', 'KAGGLE_KEY'):
        if not os.environ.get(_credential):
            # getpass keeps the typed value out of the cell output.
            os.environ[_credential] = getpass(_credential + ': ')

import re
import contractions
import numpy as np
import pandas as pd

import nltk
# Fetch the NLTK resources used below; a resource that is already present is
# reported as up-to-date and not downloaded again.
nltk.download('wordnet')
# NOTE(review): no punkt-based tokenizer is called in the visible cells
# (preprocess() uses str.split) — confirm this download is still needed.
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance, reused by preprocess() for every token.
lemmatizer = WordNetLemmatizer()

from kaggle.api.kaggle_api_extended import KaggleApi
from zipfile import ZipFile



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Connect to Kaggle API and download dataset

In [0]:
# Authenticate against the Kaggle API with the credentials configured earlier.
api = KaggleApi()
api.authenticate()

# Download and unpack the competition training set only when it is not already
# on disk, so re-running the cell does not hit the network again.
if not os.path.exists('train.csv'):
    api.competition_download_file('twitter-sentiment-analysis2', 'train.csv')
    # The context manager closes the archive even when extraction raises.
    with ZipFile('train.csv.zip', 'r') as zf:
        zf.extractall('./')
    os.remove('train.csv.zip')

# Read the training set and discard the ItemID column in a single expression,
# so `data` is never observable in a half-prepared state.
data = (
    pd.read_csv('train.csv', delimiter=',', encoding='latin-1')
    .drop(columns='ItemID')
)

### Text Preprocessing / Splitting

In [0]:
# Lower-case every tweet up front (preprocess() lower-cases its input as well).
data = data.assign(SentimentText=data['SentimentText'].str.lower())

In [0]:
# Emoticon -> sentiment word. Keys are lower-case; matching is case-insensitive,
# so ':P', ':O' and ':S' are covered by ':p', ':o' and ':s'.
_EMOJI_WORDS = {
    ':)': 'happyface',
    '(:': 'happyface',
    ':p': 'happyface',
    ';p': 'happyface',
    ':]': 'happyface',
    '[:': 'happyface',
    ':o': 'surpriseface',
    ':(': 'sadface',
    '):': 'sadface',
    ":'(": 'sadface',
    ':s': 'sadface',
    ':\\': 'sadface',
    ':[': 'sadface',
    ']:': 'sadface',
}


def _emoji_alternative(emoticon):
    # Regex fragment for one emoticon. Emoticons ending in a letter must not be
    # followed by a word character, so that "note:please" is left untouched.
    fragment = re.escape(emoticon)
    if emoticon[-1].isalpha():
        fragment += r'(?!\w)'
    return fragment


# Longest emoticons first so that ":'(" is tried before any two-character form.
_EMOJI_PATTERN = re.compile(
    '|'.join(_emoji_alternative(e) for e in sorted(_EMOJI_WORDS, key=len, reverse=True)),
    flags=re.IGNORECASE,
)


def convert_emojis(sentence):
    """Replace known text emoticons with sentiment words.

    Mapping:
        :)  (:  :p  :P  ;p  :]  [:          -> happyface
        :o  :O                              -> surpriseface
        :(  ):  :'(  :S  :\\  :[  ]:         -> sadface

    Each replacement word is padded with one space on both sides so it cannot
    fuse with neighbouring text; surplus whitespace is left for the caller to
    collapse. The function has to run before punctuation is stripped, since
    the emoticons consist of punctuation characters.

    Parameters
    ----------
    sentence : str
        Text that may contain emoticons.

    Returns
    -------
    str
        The text with every known emoticon replaced by its sentiment word.
    """
    return _EMOJI_PATTERN.sub(
        lambda match: ' ' + _EMOJI_WORDS[match.group(0).lower()] + ' ',
        sentence,
    )


def preprocess(sentence):
    """Normalise one raw tweet into a string of lemmatised tokens.

    Steps, in order: lower-casing, contraction/slang expansion, removal of
    links and @usernames, replacement of punctuation, underscores and digits
    by spaces, removal of stand-alone single letters, and lemmatisation of
    every remaining token with the shared ``lemmatizer``.

    Parameters
    ----------
    sentence : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (empty string when no
        token survives).
    """
    # Convert to lower case
    sentence = sentence.lower()

    # Replace contractions (slang abbreviations included)
    sentence = contractions.fix(sentence, slang=True)

    # Remove links
    sentence = re.sub(r'(http|https|www)\S+', ' ', sentence)

    # Remove usernames
    sentence = re.sub(r'@\w+', ' ', sentence)

    # Remove non-word characters
    sentence = re.sub(r'\W', ' ', sentence)

    # Remove underscores (they count as word characters, so \W keeps them)
    sentence = re.sub(r'[-_]', ' ', sentence)

    # Remove numbers
    sentence = re.sub(r'[0-9]', ' ', sentence)

    # Remove all single characters. Word boundaries are used instead of
    # surrounding whitespace: the whitespace-delimited pattern could not match
    # a letter at the very start or end of the text, and skipped every second
    # letter in a run such as "a b c" because matches consumed the shared space.
    sentence = re.sub(r'\b[a-zA-Z]\b', ' ', sentence)

    # Lemmatization; split() followed by join() also collapses repeated
    # whitespace, so no separate whitespace pass is needed.
    tokens = [lemmatizer.lemmatize(word) for word in sentence.split()]
    return ' '.join(tokens)

In [0]:
# Clean every tweet; the function is passed directly, no lambda wrapper needed.
data['SentimentText'] = data['SentimentText'].apply(preprocess)

In [0]:
### Pull features and labels out of the frame before splitting
# x: cleaned tweet texts, y: sentiment labels (the frame's underlying value arrays).
x = data['SentimentText'].values
y = data['Sentiment'].values

In [0]:
from sklearn.model_selection import train_test_split

# Stratified 70/30 split. The fixed random_state makes the shuffle, and with it
# every score computed downstream, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, stratify=y, shuffle=True, random_state=42
)

### Features 
* TF
* TF-IDF

### Classifiers
* Dummy
* Logistic Regression
* Logistic Regression SGD
* Naive Bayes
* KNN
* SVM 
* MLP
* Random Forest

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC as SVM

### Dummy Classifier (Input/Tuning is Irrelevant) ✔

In [0]:
# Majority-class baseline: always predicts the most frequent training label.
base = DummyClassifier(strategy='most_frequent')

# The baseline does not look at its features, so all-zero placeholders shaped
# like the label arrays stand in for the real inputs.
placeholder_train = np.zeros_like(y_train)
placeholder_test = np.zeros_like(y_test)
base.fit(placeholder_train, y_train)

# Accuracy on the training split
predictions = base.predict(placeholder_train)
score = accuracy_score(y_train, predictions)
print("train accuracy: %.2f%%" % (score*100))

# Accuracy on the held-out split
predictions_test = base.predict(placeholder_test)
score = accuracy_score(y_test, predictions_test)
print("test accuracy: %.2f%%" % (score*100))

# Cross-tabulate true vs. predicted test labels; the table is the cell output.
print("Test data confusion matrix")
y_true = pd.Series(y_test, name='True')
y_pred = pd.Series(predictions_test, name='Predicted')
pd.crosstab(y_true, y_pred)

### Logistic Regression

In [0]:
# Bag of n-grams -> TF-IDF -> truncated SVD -> class-balanced logistic regression.
pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=10, max_features=5000, stop_words=stopwords.words('english'))),
    ('tfidf', TfidfTransformer()),
    # TruncatedSVD uses a randomized solver by default; the fixed seed keeps the
    # grid-search scores repeatable from run to run.
    ('dim_reduction', TruncatedSVD(random_state=42)),
    ('clf', LogisticRegression(class_weight='balanced'))
])

# Hyper-parameter grid, addressed as <step name>__<parameter>.
parameters = {
    'vect__max_df': (0.6, 0.7),
    'vect__ngram_range': ((1, 2), (1, 3)),
    'dim_reduction__n_components': (100, 500, 1000),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__C': (1.0, 0.1, 0.01),
}

# Exhaustive 3-fold search scored by F1, using all available cores.
clf = GridSearchCV(pipeline, parameters, scoring='f1', cv=3, n_jobs=-1)
clf.fit(x_train, y_train)
print("Best Score: ", clf.best_score_)
print("Best Params: ", clf.best_params_)

In [0]:
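# Results of an earlier grid search run. That run tuned vect__min_df and did not
# tune dim_reduction__n_components, so it does not correspond to the grid above.
# Best Score:  0.762361919024294
# Best Params:  {'clf__C': 1.0, 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__max_df': 0.7, 'vect__min_df': 0.0, 'vect__ngram_range': (1, 3)}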

### Logistic Regression SGD

In [0]:
# Bag of n-grams -> TF-IDF -> truncated SVD -> logistic regression trained with SGD.
pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=10, max_features=5000, stop_words=stopwords.words('english'))),
    ('tfidf', TfidfTransformer()),
    # Both the randomized SVD solver and SGD's sample shuffling draw random
    # numbers; fixed seeds keep the grid-search scores repeatable.
    ('dim_reduction', TruncatedSVD(random_state=42)),
    # NOTE(review): scikit-learn 1.1 renamed this loss to 'log_loss' and 1.3
    # removed the 'log' spelling — switch when running on a newer release.
    ('clf', SGDClassifier(loss="log", max_iter=1000, class_weight='balanced', random_state=42))
])

# Hyper-parameter grid, addressed as <step name>__<parameter>.
parameters = {
    'vect__max_df': (0.6, 0.7),
    'vect__ngram_range': ((1, 2), (1, 3)),
    'dim_reduction__n_components': (100, 500, 1000),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__penalty': ('l1', 'l2'),
    'clf__alpha': (0.01, 0.001, 0.0001)
}

# Exhaustive 3-fold search scored by F1, using all available cores.
clf = GridSearchCV(pipeline, parameters, scoring='f1', cv=3, n_jobs=-1)
clf.fit(x_train, y_train)
print("Best Score: ", clf.best_score_)
print("Best Params: ", clf.best_params_)

In [0]:
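# Results of an earlier grid search run. The 'elasticnet' penalty, vect__max_df
# of 1.0 and vect__min_df are not part of the grid above, so these numbers do
# not correspond to the current search.
# Best Score:  0.7605481382673948
# Best Params:  {'clf__alpha': 0.0001, 'clf__penalty': 'elasticnet', 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__max_df': 1.0, 'vect__min_df': 0.0, 'vect__ngram_range': (1, 2)}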

### Naive Bayes

In [0]:
# Bag of n-grams -> TF-IDF -> multinomial Naive Bayes.
# No TruncatedSVD step here: MultinomialNB only accepts non-negative features,
# and SVD components contain negative values, so every fit would raise
# "Negative values in data passed to MultinomialNB". The sparse TF-IDF matrix
# is fed to the classifier directly instead.
pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=10, max_features=5000, stop_words=stopwords.words('english'))),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(fit_prior=True))
])

# Hyper-parameter grid, addressed as <step name>__<parameter>.
parameters = {
    'vect__max_df': (0.6, 0.7),
    'vect__ngram_range': ((1, 2), (1, 3)),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (1.0, 0.1, 0.01)
}

# Exhaustive 3-fold search scored by F1, using all available cores.
clf = GridSearchCV(pipeline, parameters, scoring='f1', cv=3, n_jobs=-1)
clf.fit(x_train, y_train)
print("Best Score: ", clf.best_score_)
print("Best Params: ", clf.best_params_)

### KNN

In [0]:
# Bag of n-grams -> TF-IDF -> truncated SVD -> k-nearest-neighbours.
pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=10, max_features=5000, stop_words=stopwords.words('english'))),
    ('tfidf', TfidfTransformer()),
    # TruncatedSVD uses a randomized solver by default; the fixed seed keeps the
    # grid-search scores repeatable from run to run.
    ('dim_reduction', TruncatedSVD(random_state=42)),
    ('clf', KNeighborsClassifier())
])

# Hyper-parameter grid, addressed as <step name>__<parameter>.
parameters = {
    'vect__max_df': (0.6, 0.7),
    'vect__ngram_range': ((1, 2), (1, 3)),
    'dim_reduction__n_components': (100, 500, 1000),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__n_neighbors': (3, 5, 7)
}

# Exhaustive 3-fold search scored by F1, using all available cores.
clf = GridSearchCV(pipeline, parameters, scoring='f1', cv=3, n_jobs=-1)
clf.fit(x_train, y_train)
print("Best Score: ", clf.best_score_)
print("Best Params: ", clf.best_params_)

### SVM

In [0]:
# Bag of n-grams -> TF-IDF -> truncated SVD -> support vector classifier.
pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=10, max_features=5000, stop_words=stopwords.words('english'))),
    ('tfidf', TfidfTransformer()),
    # TruncatedSVD uses a randomized solver by default; the fixed seed keeps the
    # grid-search scores repeatable from run to run.
    ('dim_reduction', TruncatedSVD(random_state=42)),
    ('clf', SVM())
])

# Hyper-parameter grid, addressed as <step name>__<parameter>.
parameters = {
    'vect__max_df': (0.6, 0.7),
    'vect__ngram_range': ((1, 2), (1, 3)),
    'tfidf__use_idf': (True, False),
    'dim_reduction__n_components': (100, 500, 1000),
    'tfidf__norm': ('l1', 'l2'),
    'clf__C': (1.0, 0.1, 0.01),
    'clf__kernel': ('linear', 'sigmoid')
}

# Exhaustive 3-fold search scored by F1, using all available cores.
clf = GridSearchCV(pipeline, parameters, scoring='f1', cv=3, n_jobs=-1)
clf.fit(x_train, y_train)
print("Best Score: ", clf.best_score_)
print("Best Params: ", clf.best_params_)