In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1. Load dataset and preprocess text

In [3]:
df = pd.read_csv('Sentiment Analysis Dataset.csv')

In [4]:
import re
import spacy
from spacy.lang.en import English

#### Removing punctuation
# function to remove special characters
def remove_special_characters(text):
    # define the pattern to keep
    pat = r'[^a-zA-z!?\'\s]' 
    return re.sub(pat, '', text)


#### Lemmatization
# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load('en_core_web_sm')

# function to lemmatize words
def get_lem(text):
    text = nlp(text)


    text = [word.lemma_.lower() if word.lemma_ != '-PRON-' else word.text for word in text]
    text_without_space = [word for word in text if not word.isspace()]

    return ' '.join(text_without_space)

# combine above two functions
def cleanup(text):
    text = remove_special_characters(text)
    text = get_lem(text)

    return text

### If there is no 'Text.csv', normalize text in dataset and save into 'Text.csv'

In [None]:
for i in range(int(len(df)/5000)):
    if (i+1)*5000 < len(df):
        df['Text_normalized'][i*5000:(i+1)*5000] = df['SentimentText'][i*5000:(i+1)*5000].apply(cleanup).values
        print('Finish {} texts'.format((i+1)*5000))
    else: 
        df['Text_normalized'][i*5000:] = df['SentimentText'][i*5000:].apply(cleanup).values
        print('It is done!')
df.to_csv('Text.csv')

### If 'Text.csv' exists, load csv:

In [5]:
df_text = pd.read_csv('Text.csv')

text_data = df_text.loc[:10000,'Text_normalized'].values # features for training and testing
text_label = df.loc[:10000,'Sentiment'].values # labels for training and testing
# on my computer when large amount of dataset is loaded, the computer crashed

In [6]:
from sklearn.model_selection import train_test_split

# split features and labels into two datasets, namely training dataset and test dataset
X_train, X_test, y_train, y_test = train_test_split(text_data, text_label, test_size=0.1, random_state=1)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
X_train[0]

(9000,)
(9000,)
(1001,)
(1001,)


'a baby fall flat on his face and start bawl because of i forget that he be wobbley !'

# 2. Convert text into feature vectors

In [7]:
# apply 'bag of words' method to convert a sentence into a feature vector
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train) # learning the vocabulary dictionary and return a matrix
X_train_counts.shape

(9000, 16007)

In [8]:
# apply term frequency(TF) times inverse document frequency(IDF) to avoid imbalanced weights od words in longer sentence from shorted sentence.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(9000, 16007)

# 3. Build models

### We use two simple classification models implemented in scikit-learn
- Navie Bayes
- Support Vector Machine

## 3.1 Naive Bayes 'GaussianNB()'

In [18]:
#training
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

clf_nb = make_pipeline(StandardScaler(), GaussianNB())

clf_nb.fit(X_train_tfidf.toarray(), y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('gaussiannb', GaussianNB(priors=None, var_smoothing=1e-09))],
         verbose=False)

In [19]:
# Prediction of GaussianNB
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
predicted_nb = clf_nb.predict(X_test_tfidf.toarray())
np.mean(predicted_nb == y_test)

0.6493506493506493

### Naive Bayes gives 64.9 % accuracy when tested on 1001 samples

## 3.2 Support Vector Machine

In [12]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
# we are using linear SVM
clf_sgd = make_pipeline(StandardScaler(), SGDClassifier(loss='hinge', max_iter=500, tol=1e-3))
clf_sgd.fit(X_train_tfidf.toarray(), y_train)
predicted_sgd = clf_sgd.predict(X_test_tfidf.toarray())
np.mean(predicted_sgd == y_test)

0.6823176823176823

### In contrast to NB, SGD classifer obtains a higher accurancy of 68.2 % on 1001 samples


# 4. Evaluation of performance

## 4.1 Naive Bayes Performance

In [20]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted_nb, target_names=['Positive','Negative']))

              precision    recall  f1-score   support

    Positive       0.65      0.86      0.74       585
    Negative       0.64      0.35      0.45       416

    accuracy                           0.65      1001
   macro avg       0.65      0.61      0.60      1001
weighted avg       0.65      0.65      0.62      1001



In [21]:
metrics.confusion_matrix(y_test, predicted_nb)

array([[505,  80],
       [271, 145]])

## 4.2 SVM Performance

In [16]:
print(metrics.classification_report(y_test, predicted_sgd, target_names=['Positive','Negative']))

              precision    recall  f1-score   support

    Positive       0.66      0.93      0.77       585
    Negative       0.78      0.33      0.46       416

    accuracy                           0.68      1001
   macro avg       0.72      0.63      0.62      1001
weighted avg       0.71      0.68      0.65      1001



In [17]:
metrics.confusion_matrix(y_test, predicted_sgd)

array([[545,  40],
       [278, 138]])

### From the perspective of f1 score, f1 score of SGDClassifier is bigger than the other and closer to 1, which also indicates the latter has a better performance

# 5 Parameter tuning

### When it comes to model performance, it also depends on the parameter used to build the models, like penality in SGDClassifier. The penality have an impact on the training process and can lead to better generalization. 

In [24]:
from sklearn.model_selection import GridSearchCV
# we use grid search to select best parameters
parameters = {
    'alpha': (1e-2, 1e-3),
    'loss': ('hinge', 'log')
}
gs_clf = GridSearchCV(clf_sgd, parameters, cv=5, n_jobs=-1) 
gs_clf = gs_clf.fit(X_train_tfidf[:5000], y_train[:400]) # return best score and best paramters

# 6 Next steps

## - the very first step shoud be making use of full dataset. 

I assume that memory on my computer is very limited, as the sign 'Your disk is almost full' keeps showing up. The dataset can not be processed in memory

## - Emsemble models. 

As the performance of the simple classification model is not good and accurancy is low, we definitely take a little complext models into account. When it comes to complexity, one way to improve the performance is to train many simple models and put them together. In this way, the variance of each model is likely to be reduced.

## - Neural networks. 

There is no doubt that we should use neural network to improve the performance, Like LSTM. In this case, logistic function can be used as loss function and Adam with L2 can be used as optimization method. The performance must be promising.

## - Pretrained models. 

Obvioudly, there are models trained specifically for NLP tasks. In spacy or NLTK, people can load these pretrained models to do classification and finetune the parameters based on dataset we have.

## - Production. 

Once we have a model which is trained well, we might consider to give others the access to the model. In this case, as far as I know, building a api, like by using FastAPI, will be a good starting point.

## - Monitoring.

Building a model can not be a final point. We should make use of the performance and feedback from people who are using the model to improve the model. 