[View in Colaboratory](https://colab.research.google.com/github/SalmaAhmed95/Twitter-Sentiment-Classification-Using-Distant-Supervision/blob/master/TwitterSentimentAnalysis_Neural_Networks_work.ipynb)

In [1]:
# This cell installs PyDrive, imports the libraries used below, authenticates with Google Drive,
# and prints the title and ID of every file found in the project folder.
!pip install -U -q PyDrive

import re
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from google.colab import auth
from oauth2client.client import GoogleCredentials
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

try:
    # scikit-learn >= 0.18 provides train_test_split in model_selection
    from sklearn.model_selection import train_test_split
except ImportError:
    # older scikit-learn releases only have the original location
    from sklearn.cross_validation import train_test_split

# 1. Authenticate the user and build the PyDrive client from the default application credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# 2. List every file whose parent is the project folder.
folder_query = {'q': "'1UHoxe2J0_QVgWD1P4BdF94CVALiHn3S5' in parents"}
file_list = drive.ListFile(folder_query).GetList()

# 3. Print the title and id of each file (nothing is downloaded here).
for drive_file in file_list:
  print('title: %s, id: %s' % (drive_file['title'], drive_file['id']))
  



title: TwitterSentimentAnalysis.ipynb, id: 1A9doDERyDk5qz9Hy0LykCXeQdqKvmlMn
title: clean_Tweets_1600000Tweet.csv, id: 1bFJu26UFNgkYU0u0hANXAfvfz3-L-2Ry
title: clean_Tweets_1.6millionTweet.csv, id: 1auKc_BBHVk-FQJkOIyLrlCS-ivRW3fIx
title: training.1600000.processed.noemoticon.csv, id: 1XdARv07vjTvdAPHgwFrMobo-ivCaO8U-
title: Arabic Dataset Collection, id: 1sBc-KW9HQ6pf5C9u-jiSYTt6CvkQP9Kl00GQOMmlvc0
title: Research Implementation plan, id: 1DSyAZ2xmklZEys_xtWf1x_YUUj6WQN21QSuzQmzN9B4
title: Twitter Sentiment Classification using Distant Supervision, id: 1g2M3uMgD9wainP6iiNZps6H3x8HXzeClr36QAdT5X5k


In [2]:
# Fetch the cleaned tweets CSV from Drive by its file id and write it to a local file
clean_data = drive.CreateFile({'id': '1bFJu26UFNgkYU0u0hANXAfvfz3-L-2Ry'})
clean_data.GetContentFile('clean_Tweets_1600000Tweet')

# Load the local copy into a pandas dataframe.
# Only columns 1 and 2 are read because column 0 is the row number; adjust usecols if the
# row number is ever removed from the csv file.
df_clean = pd.read_csv(
    'clean_Tweets_1600000Tweet',
    usecols=[1, 2],
    encoding='latin-1',
)

df_clean

Unnamed: 0,text,target
0,"- aww, that's a bummer. you shoulda got dav...",0
1,is upset that he can't update his facebook by ...,0
2,i dived many times for the ball. managed to s...,0
3,my whole body feels itchy and like its on fire,0
4,"no, it's not behaving at all. i'm mad. why am...",0
5,not the whole crew,0
6,need a hug,0
7,"hey long time no see! yes.. rains a bit ,onl...",0
8,nope they didn't have it,0
9,que me muera ?,0


In [3]:
# Relabel target 4 as 1 so the labels are 0 (negative) and 1 (positive)
positive_rows = df_clean['target'] == 4
df_clean.loc[positive_rows, 'target'] = 1
df_clean

Unnamed: 0,text,target
0,"- aww, that's a bummer. you shoulda got dav...",0
1,is upset that he can't update his facebook by ...,0
2,i dived many times for the ball. managed to s...,0
3,my whole body feels itchy and like its on fire,0
4,"no, it's not behaving at all. i'm mad. why am...",0
5,not the whole crew,0
6,need a hug,0
7,"hey long time no see! yes.. rains a bit ,onl...",0
8,nope they didn't have it,0
9,que me muera ?,0


In [4]:
# Inspect the loaded frame for null entries (none were found), then trim surrounding
# whitespace from every tweet and drop the tweets whose text ends up empty
df_clean.info()
df_clean['text'] = df_clean['text'].str.strip()
empty_text_index = df_clean.index[df_clean.text == '']
df_clean.drop(empty_text_index, inplace=True)
df_clean

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
text      1600000 non-null object
target    1600000 non-null int64
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


Unnamed: 0,text,target
0,"- aww, that's a bummer. you shoulda got david...",0
1,is upset that he can't update his facebook by ...,0
2,i dived many times for the ball. managed to sa...,0
3,my whole body feels itchy and like its on fire,0
4,"no, it's not behaving at all. i'm mad. why am ...",0
5,not the whole crew,0
6,need a hug,0
7,"hey long time no see! yes.. rains a bit ,only...",0
8,nope they didn't have it,0
9,que me muera ?,0


In [5]:
# Split the cleaned tweets into train / validation / test sets
x = df_clean.text
y = df_clean.target

SEED = 2000

# First hold out 2% of the data; the remaining 98% is the training set
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(
    x, y, test_size=.02, random_state=SEED)
# Then halve the held-out part into the validation set and the test set
x_validation, x_test, y_validation, y_test = train_test_split(
    x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)


def _size_and_class_shares(texts, labels):
    """Return (entry count, % of entries labelled 0, % of entries labelled 1) for one split."""
    total = len(texts)
    negative_share = (len(texts[labels == 0]) / (total*1.))*100
    positive_share = (len(texts[labels == 1]) / (total*1.))*100
    return total, negative_share, positive_share


print ("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(*_size_and_class_shares(x_train, y_train)))

print ("Validation set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(*_size_and_class_shares(x_validation, y_validation)))

print ("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(*_size_and_class_shares(x_test, y_test)))


Train set has total 1565232 entries with 49.98% negative, 50.02% positive
Validation set has total 15972 entries with 50.83% negative, 49.17% positive
Test set has total 15972 entries with 50.08% negative, 49.92% positive


In [0]:
# Some setup for matplotlib: apply the 'fivethirtyeight' style sheet
plt.style.use('fivethirtyeight')

# Draw figures inline in the notebook and request the 'retina' figure format
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [7]:
# From previous results, the tfidf vectorizer produced the best validation accuracy with 100k features
# and n-grams of length 1 to 3 (ngram_range=(1, 3)), so those settings are used here
tvec1 = TfidfVectorizer(max_features=100000,ngram_range=(1, 3))
# Fit the vectorizer on the training tweets only
tvec1.fit(x_train)



TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=100000, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [0]:
# Convert the training tweets into tf-idf features using the fitted vectorizer
x_train_tfidf = tvec1.transform(x_train)

In [0]:
# Convert the validation tweets into tf-idf features using the same fitted vectorizer
x_validation_tfidf = tvec1.transform(x_validation)

In [10]:
# Training 
%%time
clf = LogisticRegression()
clf.fit(x_train_tfidf, y_train)

CPU times: user 1min 29s, sys: 199 ms, total: 1min 29s
Wall time: 1min 29s


In [11]:
# Accuracy of the logistic regression baseline on the validation set
clf.score(x_validation_tfidf, y_validation)

0.8273228149261207

In [12]:
# Accuracy of the logistic regression baseline on the training set it was fitted on
clf.score(x_train_tfidf, y_train)

0.8418234485367025

In [14]:
# Test set accuracy: convert the test tweets with the fitted vectorizer, then score the baseline on them
x_test_tfidf = tvec1.transform(x_test)
clf.score(x_test_tfidf, y_test)

0.8265715001252192

In [15]:
# Using Keras for Neural Network Model
# Seed NumPy's global random generator; batch_generator_shuffle uses np.random.shuffle for its ordering.
# NOTE(review): this presumably does not seed the TensorFlow backend as well - confirm if exact
# reproducibility of the training runs is required.
seed = 7
np.random.seed(seed)
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence




Using TensorFlow backend.


In [0]:
# A function which generates iterable generator object, so that it can be fed to NN model.
def batch_generator_shuffle(X_data, y_data, batch_size):
    """Yield shuffled ``(X_batch, y_batch)`` mini-batches forever.

    The sample order is shuffled with ``np.random.shuffle`` once at the start and again
    after every complete pass over the data. Each pass yields
    ``ceil(n_samples / batch_size)`` batches; only the last batch of a pass may hold
    fewer than ``batch_size`` samples, and no batch is ever empty.

    Parameters
    ----------
    X_data : sparse matrix supporting ``X_data[rows, :]`` and ``.toarray()``
        Feature matrix with one row per sample.
    y_data : pandas Series or array-like indexable by an integer array
        Labels, one per row of ``X_data``; selected by position.
    batch_size : int
        Maximum number of samples per batch.

    Yields
    ------
    tuple
        ``(X_batch, y_batch)`` where ``X_batch`` is the dense array for the selected
        rows and ``y_batch`` holds the labels at the same positions.

    Raises
    ------
    ValueError
        If ``X_data`` and ``y_data`` do not contain the same number of samples.
    """
    samples_per_epoch = X_data.shape[0]
    if np.shape(y_data)[0] != samples_per_epoch:
        raise ValueError(
            "X_data and y_data must have the same number of samples, got %d and %d"
            % (samples_per_epoch, np.shape(y_data)[0]))
    # Ceiling division: a trailing partial batch still counts as one batch. The previous
    # float ratio combined with a strict '>' test produced an extra, empty batch whenever
    # the sample count was an exact multiple of batch_size.
    number_of_batches = -(-samples_per_epoch // batch_size)
    index = np.arange(samples_per_epoch)
    np.random.shuffle(index)
    counter = 0
    while True:
        index_batch = index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X_data[index_batch,:].toarray()
        # Select labels by position so they line up with the rows taken from X_data,
        # whatever index labels a pandas Series happens to carry.
        if hasattr(y_data, 'iloc'):
            y_batch = y_data.iloc[index_batch]
        else:
            y_batch = y_data[index_batch]
        counter += 1
        yield X_batch, y_batch
        if counter >= number_of_batches:
            # A full pass is done: reshuffle and start the next pass.
            np.random.shuffle(index)
            counter = 0


In [62]:
# Training tfidf vector with a NN of 100k features, using 20% dropout, 64 hidden layers, relu activation function, adam optimizer and binary cross-entropy loss function
%%time
model_s_1 = Sequential()
model_s_1.add(Dense(64, activation='relu', input_dim=100000))
model_s_1.add(Dropout(0.2))
model_s_1.add(Dense(1, activation='sigmoid'))
model_s_1.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model_s_1.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_train, 32),
                    epochs=5, validation_data=(x_validation_tfidf, y_validation),
                    steps_per_epoch=x_train_tfidf.shape[0]/32)


Epoch 1/5
Epoch 2/5
Epoch 3/5

In [17]:
# Increase the hidden layer from 64 to 128 units and observe the effect on the NN model.
# NOTE(review): unlike model_s_1, this model has no Dropout layer and trains for 2 epochs
# instead of 5, so the comparison changes more than the layer width - confirm this is intended.
batch_size = 32  # used for both the batch generator and the steps_per_epoch computation

model_s_2 = Sequential()
model_s_2.add(Dense(128, activation='relu', input_dim=100000))
model_s_2.add(Dense(1, activation='sigmoid'))
model_s_2.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# steps_per_epoch must be a whole number of batches; round up so a trailing partial batch
# is still part of the epoch.
model_s_2.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_train, batch_size),
                    epochs=2, validation_data=(x_validation_tfidf, y_validation),
                    steps_per_epoch=int(np.ceil(x_train_tfidf.shape[0] / batch_size)))

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fcff81eb240>