# Importing the necessary Libraries

In [1]:

import numpy as np
import pandas as pd

import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import nltk
from nltk.tokenize import word_tokenize

import warnings
warnings.filterwarnings('ignore')

# Reading the Datasets

### Change `test_path` below to point to the test set you want to evaluate on

In [2]:
# Locations of the training tweets and of the set the models are scored on.
train_path, test_path = "twitter-training-data.txt", "twitter-dev-data.txt"

In [3]:
# Training set: tab-separated, read without a header row.
train_df = pd.read_table(train_path, header=None)

In [4]:
# Evaluation set: tab-separated, read without a header row.
test_df = pd.read_table(test_path, header=None)

In [5]:
# train_df = train_df.drop(columns=[0])

In [6]:
# Map the positional column labels 0, 1, 2 to readable names.
train_df = train_df.rename(columns=dict(enumerate(['ID', 'Sentiment', 'Tweets'])))

In [7]:
# Map the positional column labels 0, 1, 2 to readable names.
test_df = test_df.rename(columns=dict(enumerate(['ID', 'Sentiment', 'Tweets'])))

In [8]:
# Preview the first five training rows.
train_df.head(5)

Unnamed: 0,ID,Sentiment,Tweets
0,335104872099066692,positive,Felt privileged to play Foo Fighters songs on ...
1,796528524030124618,positive,@AaqibAfzaal Pakistan may be an Islamic countr...
2,760964834217238632,positive,Happy Birthday to the coolest golfer in Bali! ...
3,147713180324524046,negative,@SimpplyA TMILLS is going to Tucson! But the 2...
4,732302280474120023,negative,Hmmmmm where are the #BlackLivesMatter when ma...


# Preprocessing Data (Cleaning tweets)

In [9]:
import re


def cleanup_text(texts):
    '''
    Normalise raw tweets into lower-case, letters-only text for vectorising.

    Each tweet goes through these steps, in order:

    1. ``&quot;...&quot;`` pairs are unwrapped and ``&amp;`` is dropped.
    2. Emoticons are replaced by ``TOKEMOTICON``.
    3. The text is lower-cased (the emoticon placeholder is restored to
       upper case afterwards).
    4. URLs, @mentions, #hashtags and pound amounts (a pound sign followed
       by digits) become ``TOKURL``, ``TOKMENTION``, ``TOKTAG`` and
       ``TOKPOUND``.
    5. Every run of characters other than ASCII letters (punctuation,
       digits, newlines, repeated spaces) collapses to a single space.

    Parameters
    ----------
    texts : iterable of str
        Raw tweets.

    Returns
    -------
    list of str
        One cleaned string per input tweet, in the same order.
    '''
    # Compile once, outside the loop: the patterns do not depend on the tweet.
    quoted_re = re.compile(r"&quot;(.*?)&quot;")
    # The third alternative's character class lists a literal backslash and a
    # literal "D" (for emoticons such as "D:"). Spelling it "\\\D" instead
    # puts the regex class \D (any non-digit) inside the brackets, which turns
    # ordinary text such as "Q: " into an emoticon.
    emoticon_re = re.compile(
        r"(^| )(\:\w+\:|\<[\/\\]?3|[\(\)\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)"
    )
    url_re = re.compile(
        r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
    )
    mention_re = re.compile(r"@[\w]+")
    hashtag_re = re.compile(r"#[\w]+")
    pound_re = re.compile(r"£\d+")
    non_letters_re = re.compile(r"[^a-zA-Z]+")

    cleaned_text = []
    for text in texts:
        # remove ugly &quot and &amp
        text = quoted_re.sub(r"\g<1>", text)
        text = text.replace("&amp;", "")

        # replace emoticon (before lower-casing: the pattern is case-sensitive)
        text = emoticon_re.sub(r"\g<1>TOKEMOTICON", text)

        text = text.lower()
        text = text.replace("tokemoticon", "TOKEMOTICON")

        # replace url, mention, hashtag and pound amounts with placeholders
        text = url_re.sub("TOKURL", text)
        text = mention_re.sub("TOKMENTION", text)
        text = hashtag_re.sub("TOKTAG", text)
        text = pound_re.sub("TOKPOUND", text)

        # Drop punctuation, digits and newlines and collapse whitespace in a
        # single pass, so blanked-out digits cannot leave runs of spaces.
        text = non_letters_re.sub(" ", text)

        cleaned_text.append(text)
    return cleaned_text

In [10]:
## Encoding Positive, Neutral and Negative to 1,0,2 respectively
# The codes are stored as the strings "1", "0" and "2", not as integers.
train_df['Sentiment'] = train_df['Sentiment'].replace(
    {"positive": "1", "negative": "2", "neutral": "0"}
)

In [11]:
# Store the cleaned tweet text in a new `clean` column.
train_df = train_df.assign(clean=cleanup_text(train_df['Tweets']))

In [12]:
# Display the frame, which now holds the encoded labels and the `clean` column.
train_df

Unnamed: 0,ID,Sentiment,Tweets,clean
0,335104872099066692,1,Felt privileged to play Foo Fighters songs on ...,felt privileged to play foo fighters songs on ...
1,796528524030124618,1,@AaqibAfzaal Pakistan may be an Islamic countr...,TOKMENTION pakistan may be an islamic country ...
2,760964834217238632,1,Happy Birthday to the coolest golfer in Bali! ...,happy birthday to the coolest golfer in bali T...
3,147713180324524046,2,@SimpplyA TMILLS is going to Tucson! But the 2...,TOKMENTION tmills is going to tucson but the ...
4,732302280474120023,2,Hmmmmm where are the #BlackLivesMatter when ma...,hmmmmm where are the TOKTAG when matters like ...
...,...,...,...,...
45021,660374218263817235,0,Sunday Cinema | Paul McCartney &amp; David Gil...,sunday cinema paul mccartney david gilmour pau...
45022,739323365061217061,0,"14-Aug The day of independence, The day of S...",aug the day of independence the day of sacr...
45023,681369726697754114,1,"9 September has arrived, which means Apple's n...",september has arrived which means apple s ne...
45024,922217029064536808,1,So I'll see you all tomorrow for some fun fill...,so i ll see you all tomorrow for some fun fill...


In [13]:
# Encode the test labels with the same string codes used for the training set.
test_df['Sentiment'] = test_df['Sentiment'].replace(
    {"positive": "1", "negative": "2", "neutral": "0"}
)

# Applying the TF-IDF Vectorizer

In [14]:

from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorizer settings: min_df / max_df bound how rare or common a term may be,
# and sublinear_tf switches on sublinear term-frequency scaling.
tfidf_options = {
    "min_df": 5,
    "max_df": 0.8,
    "sublinear_tf": True,
    "use_idf": True,
}
cv = TfidfVectorizer(**tfidf_options)

In [15]:
# Fit the vocabulary on the cleaned training tweets, then convert the result
# to a dense array.
train_tfidf = cv.fit_transform(train_df['clean'])
X = train_tfidf.toarray()

In [16]:
# Training targets: the encoded sentiment labels.
y = train_df.loc[:, 'Sentiment']

In [17]:
# Display the dense TF-IDF training matrix.
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
# Clean the test tweets and project them onto the vocabulary already fitted
# on the training set (transform only, no re-fitting).
test_clean = cleanup_text(test_df['Tweets'])
x_test = cv.transform(test_clean).toarray()

In [19]:
# Ground-truth labels for the test set.
y_test = test_df.loc[:, 'Sentiment']

# Multinomial Naive Bayes Classifier

In [20]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier, constructed with no arguments.
clf=MultinomialNB()

In [21]:
# Train on the TF-IDF features X and the encoded sentiment labels y.
clf.fit(X,y)

MultinomialNB()

In [22]:
# Predict a sentiment label for every test tweet.
y_pred = clf.predict(x_test)

In [23]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Share of test tweets whose predicted label matches the true one, as a percentage.
nb_accuracy_pct = round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
print("Accuracy for Multinomial Naive Bayes Classifier is: ", nb_accuracy_pct, '%')

Accuracy for Multinomial Naive Bayes Classifier is:  62.75 %


# Logistic Regression Classifier (Maximum Entropy)

In [24]:
from sklearn.linear_model import LogisticRegression #Maximum Entropy

In [25]:
# Build the classifier with a fixed seed, then train it on the TF-IDF features.
# NOTE(review): warnings are silenced at the top of the notebook, so any
# convergence warning raised during this fit would not be shown - confirm the
# solver converges.
clf = LogisticRegression(random_state=0)
clf.fit(X, y)

# Predict a sentiment label for every test tweet.
y_pred = clf.predict(x_test)


In [26]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Share of test tweets whose predicted label matches the true one, as a percentage.
logreg_accuracy_pct = round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
print("Accuracy for Logistic Regression Classifier is: ", logreg_accuracy_pct, '%')

Accuracy for Logistic Regression Classifier is:  65.8 %


# Gaussian Naive Bayes Model

In [27]:
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB

# Create the Gaussian Naive Bayes classifier and train it in one step
# (fit returns the fitted estimator itself).
gnb = GaussianNB().fit(X, y)

# Predict a sentiment label for every test tweet.
y_pred = gnb.predict(x_test)

In [28]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Share of test tweets whose predicted label matches the true one, as a percentage.
gnb_accuracy_pct = round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
print("Accuracy for Gaussian Naive Bayes Classifier is: ", gnb_accuracy_pct, '%')

Accuracy for Gaussian Naive Bayes Classifier is:  31.6 %
