In [None]:
# Import all required packages

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

from sklearn.utils import shuffle

import re
import nltk
nltk.download('stopwords')
import time
import warnings
warnings.filterwarnings("ignore")
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

import keras 
from keras.models import Sequential, Model 
from keras import layers
from keras.layers import Dense, Dropout, Input, Embedding

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the raw tweet CSV; header=None makes pandas label the six columns 0-5.
data = pd.read_csv(
    "/content/training.1600000.processed.noemoticon.csv",
    header=None,
    encoding="latin1",
)
print(data.shape)
data.head(5)

(1600000, 6)


Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Class balance of the raw data: column 0 holds the sentiment label of each tweet.
print(data[0].value_counts())
print("total ", data.shape[0])

0    800000
4    800000
Name: 0, dtype: int64
total  1600000


In [None]:
# Data Preprocess Function

# Code-point ranges (emoji, pictographs, symbols) stripped from every tweet.
_EMOJI_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+", flags=re.UNICODE)

# Curly quotes, plus the literal texts "u2018" / "u2019" / "u002"
# (presumably leftovers of escaped code points -- TODO confirm), become "'".
_QUOTE_PATTERN = re.compile(u"(\u2018|\u2019|u2018|u2019|u002)")

# Raw strings keep the patterns identical while avoiding invalid "\(" escapes.
_URL_PATTERN = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
_TAGUSER_PATTERN = re.compile(r'@(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
_NON_LETTER_PATTERN = re.compile('[^a-zA-Z]')

# Apostrophe look-alikes that are normalised to a plain "'".
_APOSTROPHE_VARIANTS = ("’", "‘", "´", "`")

_CONTRACTION_MAPPING = {
    "ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because",
    "could've": "could have", "couldn't": "could not", "didn't": "did not",
    "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not",
    "haven't": "have not", "he'd": "he would", "he'll": "he will", "he's": "he is",
    "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
    "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have",
    "I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have",
    "i'll": "i will", "i'll've": "i will have", "i'm": "i am", "i've": "i have",
    "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will",
    "it'll've": "it will have", "it's": "it is", "let's": "let us", "ma'am": "madam",
    "mayn't": "may not", "might've": "might have", "mightn't": "might not",
    "mightn't've": "might not have", "must've": "must have", "mustn't": "must not",
    "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have",
    "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have", "she'll": "she will",
    "she'll've": "she will have", "she's": "she is", "should've": "should have",
    "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
    "so's": "so as", "this's": "this is", "that'd": "that would",
    "that'd've": "that would have", "that's": "that is", "there'd": "there would",
    "there'd've": "there would have", "there's": "there is", "here's": "here is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
    "they'll've": "they will have", "they're": "they are", "they've": "they have",
    "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have",
    "we'll": "we will", "we'll've": "we will have", "we're": "we are",
    "we've": "we have", "weren't": "were not", "what'll": "what will",
    "what'll've": "what will have", "what're": "what are", "what's": "what is",
    "what've": "what have", "when's": "when is", "when've": "when have",
    "where'd": "where did", "where's": "where is", "where've": "where have",
    "who'll": "who will", "who'll've": "who will have", "who's": "who is",
    "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have",
    "won't": "will not", "won't've": "will not have", "would've": "would have",
    "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
    "y'all'd": "you all would", "y'all'd've": "you all would have",
    "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would",
    "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
    "you're": "you are", "you've": "you have",
}

# The text is lower-cased before expansion, so keys/values are lower-cased too
# (this folds the "I'd"/"i'd" duplicates). Longest keys go first so that e.g.
# "mightn't've" is expanded as a whole instead of being cut short by "mightn't".
_CONTRACTION_ITEMS = sorted(
    {key.lower(): value.lower() for key, value in _CONTRACTION_MAPPING.items()}.items(),
    key=lambda item: len(item[0]),
    reverse=True,
)


def _stem_resources():
    """Return ``(stemmer, stop_words)``, built on first use and cached afterwards."""
    if not hasattr(_stem_resources, "_cached"):
        stop_words = set(stopwords.words('english'))
        # "not" carries sentiment (negation), so it must survive stop-word removal.
        stop_words.discard('not')
        _stem_resources._cached = (PorterStemmer(), stop_words)
    return _stem_resources._cached


def dataPreprocess(text):
    """Clean one raw tweet and return it as a string of space-separated stems.

    Steps, in order: strip emoji/pictographs, normalise apostrophe variants,
    remove URLs and @mentions, lower-case, expand English contractions, drop
    purely numeric tokens and tokens of two characters or fewer, replace every
    non-letter with a space, remove English stop words (keeping "not") and
    Porter-stem what is left.

    Args:
        text (str): the raw tweet.

    Returns:
        str: the cleaned tweet; may be empty if nothing survives the filters.
    """
    text = _EMOJI_PATTERN.sub('', text)
    text = _QUOTE_PATTERN.sub("'", text)
    text = _URL_PATTERN.sub('', text)
    text = _TAGUSER_PATTERN.sub('', text)

    for variant in _APOSTROPHE_VARIANTS:
        text = text.replace(variant, "'")

    # Lower-case once, up front, rather than once per contraction key.
    text = text.lower()
    for contraction, expansion in _CONTRACTION_ITEMS:
        text = text.replace(contraction, expansion)

    # This filter runs before punctuation is stripped, so it sees raw tokens.
    text = ' '.join(w for w in text.split() if not w.isdigit() and len(w) > 2)
    text = _NON_LETTER_PATTERN.sub(' ', text)

    stemmer, stop_words = _stem_resources()
    return ' '.join(stemmer.stem(word) for word in text.split() if word not in stop_words)

In [None]:
# Sanity-check the cleaner on a single raw tweet containing a mention, short tokens and a digit.
print(dataPreprocess(text='@misstoriblack cool , i have no tweet apps  for my razr 2'))

cool tweet app razr


In [None]:
# Shuffle reproducibly (fixed seed), then keep the first 700,000 rows as the working sample.
# The slice starts at position 0: a [1:700000] slice would skip the first shuffled row
# and keep only 699,999 tweets.
inputData = shuffle(data, random_state=42).iloc[:700000]

In [None]:
# Class balance of the sampled subset (column 0 is the sentiment label).
inputData.loc[:, 0].value_counts()

4    350003
0    349996
Name: 0, dtype: int64

In [None]:
# Data preprocessing: clean every tweet (column 5) in place with dataPreprocess.
inputData[5] = inputData[5].apply(dataPreprocess)

In [None]:
# Persist the cleaned sample so the slow preprocessing step need not be repeated.
# index=False omits the row index; the column labels are still written as a header line.
inputData.to_csv(
    path_or_buf="/content/preprocessed_data_semeval700000.csv",
    index=False,
)

In [None]:
dataset_cols = ["target", "ids", "date", "flag", "user", "text"]

# The saved CSV starts with a header line ("0,1,2,3,4,5"). header=0 makes pandas
# consume that line and replace it with dataset_cols; without it the header is
# parsed as a data row (target 0, text "5") and leaks into training.
preprocessed_data = pd.read_csv(
    '/content/preprocessed_data_semeval700000.csv',
    names=dataset_cols,
    header=0,
)

# Remove null values from Dataframe (tweets whose text became empty are read back as NaN)
preprocessed_data = preprocessed_data.dropna()

print(preprocessed_data)

        target         ids                          date      flag  \
0            0           1                             2         3   
1            0  1467998485  Mon Apr 06 23:11:14 PDT 2009  NO_QUERY   
2            0  2300048954  Tue Jun 23 13:40:11 PDT 2009  NO_QUERY   
3            0  1993474027  Mon Jun 01 10:26:07 PDT 2009  NO_QUERY   
4            0  2256550904  Sat Jun 20 12:56:51 PDT 2009  NO_QUERY   
...        ...         ...                           ...       ...   
699995       0  1975106381  Sat May 30 14:27:40 PDT 2009  NO_QUERY   
699996       4  1974413652  Sat May 30 13:05:16 PDT 2009  NO_QUERY   
699997       0  2252096148  Sat Jun 20 05:26:36 PDT 2009  NO_QUERY   
699998       4  1970196461  Sat May 30 02:51:14 PDT 2009  NO_QUERY   
699999       0  2263008538  Sat Jun 20 23:44:17 PDT 2009  NO_QUERY   

                   user                                               text  
0                     4                                                  5  
1    

In [None]:
# Build the label vector y: encode the sentiment targets as consecutive integer class ids.

le = LabelEncoder()
y = le.fit_transform(preprocessed_data['target'])

In [None]:
# Hold out 15% of the samples for evaluation; the fixed seed keeps the split reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    preprocessed_data['text'],
    y,
    test_size=0.15,
    random_state=0,
)

In [None]:
# Vectorise the cleaned text with TF-IDF, capped at 600 features.
# The vocabulary is learned on the training split only and reused for the test split;
# both matrices are converted to dense arrays.

tfidf = TfidfVectorizer(max_features=600)
X_train_tf, X_test_tf = (
    tfidf.fit_transform(X_train).toarray(),
    tfidf.transform(X_test).toarray(),
)

In [None]:
# Shapes of the train/test feature matrices and label vectors, in that order.
tuple(arr.shape for arr in (X_train_tf, X_test_tf, y_train, y_test))

((591695, 600), (104417, 600), (591695,), (104417,))

In [None]:
# Multinomial Naive Bayes: fit on the training features, then predict the held-out set.

from sklearn.naive_bayes import MultinomialNB

MNB = MultinomialNB()
MNB.fit(X_train_tf, y_train)
y_pred = MNB.predict(X_test_tf)

In [None]:
# Evaluate the MultinomialNB predictions: overall accuracy, confusion matrix, per-class report.
MNBscore = accuracy_score(y_test, y_pred)
print('Total Accuracy {:04.2f}'.format(MNBscore * 100), '%', sep='')
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Total Accuracy 72.83%
Confusion Matrix:
 [[38834 13525]
 [14843 37215]]
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.74      0.73     52359
           1       0.73      0.71      0.72     52058

    accuracy                           0.73    104417
   macro avg       0.73      0.73      0.73    104417
weighted avg       0.73      0.73      0.73    104417



In [None]:
# Complement Naive Bayes: fit on the training features, then predict the held-out set.

from sklearn.naive_bayes import ComplementNB

CNB = ComplementNB()
CNB.fit(X_train_tf, y_train)
y_pred = CNB.predict(X_test_tf)

In [None]:
# Evaluate the ComplementNB predictions: overall accuracy, confusion matrix, per-class report.
CNBscore = accuracy_score(y_test, y_pred)
print('Total Accuracy {:04.2f}'.format(CNBscore * 100), '%', sep='')
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Total Accuracy 72.83%
Confusion Matrix:
 [[38827 13532]
 [14836 37222]]
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.74      0.73     52359
           1       0.73      0.72      0.72     52058

    accuracy                           0.73    104417
   macro avg       0.73      0.73      0.73    104417
weighted avg       0.73      0.73      0.73    104417



In [None]:
# Bernoulli Naive Bayes: fit on the training features, then predict the held-out set.

from sklearn.naive_bayes import BernoulliNB

BNB = BernoulliNB()
BNB.fit(X_train_tf, y_train)
y_pred = BNB.predict(X_test_tf)

In [None]:
# Evaluate the BernoulliNB predictions: overall accuracy, confusion matrix, per-class report.
BNBscore = accuracy_score(y_test, y_pred)
print('Total Accuracy {:04.2f}'.format(BNBscore * 100), '%', sep='')
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Total Accuracy 73.82%
Confusion Matrix:
 [[37257 15102]
 [12236 39822]]
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.71      0.73     52359
           1       0.73      0.76      0.74     52058

    accuracy                           0.74    104417
   macro avg       0.74      0.74      0.74    104417
weighted avg       0.74      0.74      0.74    104417



In [None]:
import matplotlib.image  as mpimg


def plot_training_history(history):
    """Plot per-epoch training vs. validation accuracy and loss on two figures.

    Args:
        history: object whose ``history`` attribute is a mapping with the keys
            'accuracy', 'val_accuracy', 'loss' and 'val_loss', each holding one
            value per epoch (e.g. what Keras ``model.fit`` returns -- no such
            call is visible in this notebook).
    """
    metrics = history.history
    epochs = range(len(metrics['accuracy']))

    _, acc_ax = plt.subplots()
    acc_ax.plot(epochs, metrics['accuracy'], 'r')
    acc_ax.plot(epochs, metrics['val_accuracy'], 'b')
    acc_ax.set_title('Training and validation accuracy')
    acc_ax.set_xlabel("Epochs")
    acc_ax.set_ylabel("Accuracy")
    acc_ax.legend(["Accuracy", "Validation Accuracy"])

    _, loss_ax = plt.subplots()
    loss_ax.plot(epochs, metrics['loss'], 'r')
    loss_ax.plot(epochs, metrics['val_loss'], 'b')
    loss_ax.set_title('Training and validation loss')
    loss_ax.set_xlabel("Epochs")
    loss_ax.set_ylabel("Loss")
    loss_ax.legend(["Loss", "Validation Loss"])

    # Render both figures; no extra empty figure is opened afterwards.
    plt.show()


# None of the notebook's cells trains a model that yields `history`, so on a fresh
# kernel an unguarded reference raises NameError and aborts "Run All".
if 'history' in globals():
    plot_training_history(history)
else:
    print("Skipping training curves: no `history` object has been defined.")

In [None]:
# Gaussian Naive Bayes: fit on the training features, then predict the held-out set.

from sklearn.naive_bayes import GaussianNB

GNB = GaussianNB()
GNB.fit(X_train_tf, y_train)
y_pred = GNB.predict(X_test_tf)

In [None]:
# Evaluate the GaussianNB predictions: overall accuracy, confusion matrix, per-class report.
GNBscore = accuracy_score(y_test, y_pred)
print('Total Accuracy {:04.2f}'.format(GNBscore * 100), '%', sep='')
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Total Accuracy 70.31%
Confusion Matrix:
 [[33842 18517]
 [12480 39578]]
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.65      0.69     52359
           1       0.68      0.76      0.72     52058

    accuracy                           0.70    104417
   macro avg       0.71      0.70      0.70    104417
weighted avg       0.71      0.70      0.70    104417



In [None]:
# Side-by-side accuracy (in percent) of the four Naive Bayes variants.

print('\n'.join(
    template.format(score * 100) + '%'
    for template, score in (
        ('MultinomialNB {:04.2f}', MNBscore),
        ('ComplementNB {:04.2f}', CNBscore),
        ('BernoulliNB {:04.2f}', BNBscore),
        ('GaussianNB {:04.2f}', GNBscore),
    )
))


MultinomialNB 72.83%
ComplementNB 72.83%
BernoulliNB 73.82%
GaussianNB 70.31%
