## Sentiment Analysis on Twitter Data

### Import the movies.txt

The dataset is obtained from the [University of Michigan Kaggle Competition](https://www.kaggle.com/c/si650winter11/data).

In [9]:
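# Import libraries necessary for this project
# NOTE(review): the `stop_words` module imported below is not used by any cell shown here,
# and `sklearn.feature_extraction.stop_words` was deprecated in newer scikit-learn releases
# (the import can fail there) — confirm and drop it if it is not needed.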
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

In [2]:
# Load the tab-separated file. It has no header row, so the two columns are
# named explicitly: the label ("Sentiment") first, then the text ("Review").
# NOTE(review): with pandas' default quoting, a stray '"' inside a review can
# merge several lines into one row — confirm the row count against the raw file.
data = pd.read_csv(
    "movies.txt",
    sep="\t",
    header=None,
    names=["Sentiment", "Review"],
)

In [3]:
# Sanity check: display (rows, columns) of the loaded frame; two columns are
# expected because the load cell names exactly "Sentiment" and "Review".
data.shape

(6918, 2)

In [4]:
# Turn the raw review text into a bag-of-words count matrix. Stop words are
# kept in this first pass. The cell displays the (documents, vocabulary) shape.
# NOTE(review): the vocabulary is learned from every row, including rows that
# are later held out as the test split — consider fitting on training rows only.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=data["Review"])
X_train_counts.shape

(6918, 2132)

#### Neural Network output without removing the stop words

In [5]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_counts, data["Sentiment"], test_size=0.2, random_state=111
)

In [6]:
# Small multilayer perceptron: hidden layers of size 5 and 2, with an L2
# regularization term (alpha) of 1e-5. The seed fixes the weight initialization.
# The fitted estimator is the cell's displayed value.
clf = MLPClassifier(
    hidden_layer_sizes=(5, 2),
    alpha=1e-5,
    random_state=1,
)
clf.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [8]:
# Accuracy on the training split, i.e. the same rows the model was fitted on.
y_pred = clf.predict(X_train)
print(accuracy_score(y_true=y_train, y_pred=y_pred))

1.0


#### Since the training-set accuracy is perfect (1.0), we check for overfitting with cross-validation

In [None]:
# 5-fold cross-validation on the training split to check for overfitting.
# Prints the mean fold accuracy and twice the standard deviation across folds.
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), 2 * scores.std()))

#### Neural Network output after removing stop words

In [13]:
# Rebuild the bag-of-words matrix, this time with the vectorizer's built-in
# English stop-word list removed. This rebinds `count_vect` and `X_train_counts`.
# The cell displays the new (documents, vocabulary) shape.
count_vect = CountVectorizer(stop_words="english")
X_train_counts = count_vect.fit_transform(raw_documents=data["Review"])
X_train_counts.shape

(6918, 1921)

In [14]:
# The feature matrix changed, so the train/test split is rebuilt from it:
# 20% of the rows are held out, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_counts, data["Sentiment"], test_size=0.2, random_state=1234
)

In [16]:
# Same small multilayer perceptron (hidden layers of size 5 and 2, L2 term
# alpha = 1e-5), here allowed up to 1000 training iterations.
# The fitted estimator is the cell's displayed value.
clf = MLPClassifier(
    hidden_layer_sizes=(5, 2),
    alpha=1e-5,
    max_iter=1000,
    random_state=1,
)
clf.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [18]:
# Accuracy on the training split, i.e. the same rows the model was fitted on.
y_pred = clf.predict(X_train)
print(accuracy_score(y_true=y_train, y_pred=y_pred))

1.0


In [19]:
# 5-fold cross-validation on the training split.
# Prints the mean fold accuracy and twice the standard deviation across folds.
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), 2 * scores.std()))

Accuracy: 0.99 (+/- 0.00)


In [20]:
# Accuracy on the held-out test split. `y_pred` now holds the test predictions,
# which the cells below reuse.
y_pred = clf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.990606936416185


In [21]:
# Confusion matrix for the test-split predictions.
# Rows are the true labels and columns are the predicted labels.
# `confusion_matrix` is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
confusion_matrix(y_test, y_pred)

array([[577,  13],
       [  0, 794]])

In [22]:
# Per-class precision, recall, F1 and support for the test-split predictions.
# `classification_report` is already imported in the imports cell at the top of
# the notebook, so it is not re-imported here.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       1.00      0.98      0.99       590
          1       0.98      1.00      0.99       794

avg / total       0.99      0.99      0.99      1384

