## Sentiment Analysis on Twitter Tweets ##

In [1]:
# Import necessary libraries.
import pandas as pd
import re
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split

import utils

In [2]:
# Load the Sentiment140 training CSV (1.6 million tweets) into a DataFrame.
# The file is read with header=None, so the column names are supplied here.
df = pd.read_csv(
    './sentiment140/training.1600000.processed.noemoticon.csv',
    encoding='latin',
    header=None,
    names=["polarity", "id", "date", "flag", "user", "tweet"],
)
df.head()

Unnamed: 0,polarity,id,date,flag,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
# Keep only the label ("polarity") and the text ("tweet"); the remaining
# metadata columns are dropped. `df` itself is left untouched.
tweet_dataset = df.drop(columns=["id", "flag", "date", "user"])
tweet_dataset.head()

Unnamed: 0,polarity,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [4]:
# The raw labels are 0 and 4; remap 4 -> 1 so the two classes are {0, 1}.
tweet_dataset["polarity"] = tweet_dataset["polarity"].replace(4, 1)
# Show the class balance after the remap.
tweet_dataset["polarity"].value_counts()

1    800000
0    800000
Name: polarity, dtype: int64

In [5]:
# Normalise every tweet with the project helper. In the displayed sample,
# @mentions come back as AT_USER and links as URL.
# NOTE(review): this overwrites the 'tweet' column, so re-running the cell
# feeds already-processed text through the helper again.
processed_tweets = tweet_dataset['tweet'].apply(utils.preprocess_tweet)
tweet_dataset['tweet'] = processed_tweets
tweet_dataset.head()


Unnamed: 0,polarity,tweet
0,0,"AT_USER URL - Awww, that's a bummer. You shoul..."
1,0,is upset that he can't update his Facebook by ...
2,0,AT_USER I dived many times for the ball. Manag...
3,0,my whole body feels itchy and like its on fire
4,0,"AT_USER no, it's not behaving at all. i'm mad...."


In [6]:
# Turn the preprocessed tweets into a feature matrix via the project helper.
# NOTE(review): the vectorisation scheme lives in utils.extract_features -
# confirm there what the columns represent.
features = utils.extract_features(tweet_dataset)
feature_shape = features.shape
print(" Shape of the Feature Vector :: ", feature_shape)

 Shape of the Feature Vector ::  (1600000, 286175)


In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    tweet_dataset["polarity"],
    test_size=0.2,
    random_state=42,
)

In [8]:
# 'bayes'
# Build the classifier through the project helper and fit it on the
# training split.
bayes_model = utils.init_classifier("bayes", None)
bayes_model.fit(X_train, y_train)

# Score against the label Series directly. The previous `y_train[:, None]`
# was multi-dimensional indexing on a pandas Series, which is deprecated in
# pandas 1.x and raises in pandas 2.0; the scorer only needs 1-D labels, so
# the reported accuracy is unchanged.
print('bayes train acc :: ', bayes_model.score(X_train, y_train))
print('bayes test acc :: ', bayes_model.score(X_test, y_test))

bayes train acc ::  0.7977984375
bayes test acc ::  0.763603125


In [10]:
# 'logistic'
# Build the classifier through the project helper and fit it on the
# training split.
# NOTE(review): the second argument (1.) is presumably the regularisation
# strength - confirm against utils.init_classifier.
# NOTE(review): the recorded run of this cell emitted a "TOTAL NO. of
# ITERATIONS REACHED LIMIT" convergence warning; consider raising max_iter
# (or scaling the features) where the estimator is constructed.
lr_model = utils.init_classifier("logistic", 1.)
lr_model.fit(X_train, y_train)

# Score against the label Series directly. The previous `y_train[:, None]`
# was multi-dimensional indexing on a pandas Series, which is deprecated in
# pandas 1.x and raises in pandas 2.0; the scorer only needs 1-D labels, so
# the reported accuracy is unchanged.
print('Logistic Regression train acc :: ', lr_model.score(X_train, y_train))
print('Logistic Regression test acc :: ', lr_model.score(X_test, y_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression train acc ::  0.79734296875
Logistic Regression test acc ::  0.77945
