# Twitter Sentiment Analysis Report Notebook

## Loading the Sentiment140 dataset containing 1.6M Tweets

In [1]:
import pandas as pd
df = pd.read_csv("training.1600000.processed.noemoticon.csv", header= None)

#### Understanding the data

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   0       1600000 non-null  int64 
 1   1       1600000 non-null  int64 
 2   2       1600000 non-null  object
 3   3       1600000 non-null  object
 4   4       1600000 non-null  object
 5   5       1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


#### Hence, no missing value is present in the data.

In [3]:
#Number of total tweets in the dataset
len(df)

1600000

In [4]:
#Number of positive tweets in the dataset
len(df[df[0]==0])

800000

In [5]:
#Number of negative tweets in the dataset
len(df[df[0]==4])

800000

This dataset is balanced into two categories- positive and negative. 
No neutral labelling is present in this dataset. So, We need to prepare the dataset according to our objective

#### Renaming Columns

In [6]:
df.columns=['target','user_id','date','query','user_name','tweet']
df

Unnamed: 0,target,user_id,date,query,user_name,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


#### Dropping All other columns except the tweet 

In [7]:
df=df.drop(['target','user_id','date','query','user_name'],axis=1)
df

Unnamed: 0,tweet
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,is upset that he can't update his Facebook by ...
2,@Kenichan I dived many times for the ball. Man...
3,my whole body feels itchy and like its on fire
4,"@nationwideclass no, it's not behaving at all...."
...,...
1599995,Just woke up. Having no school is the best fee...
1599996,TheWDB.com - Very cool to hear old Walt interv...
1599997,Are you ready for your MoJo Makeover? Ask me f...
1599998,Happy 38th Birthday to my boo of alll time!!! ...


#### Cleaning the Text data

In [8]:
import re
from string import punctuation

In [9]:
emoji_pattern = re.compile("["
         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
         u"\U0001F680-\U0001F6FF"  # transport & map symbols
         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
         u"\U00002702-\U000027B0"
         u"\U000024C2-\U0001F251"
         "]+", flags=re.UNICODE)

def replace_emojis(t):
    emoji_happy = ["\U0001F600","\U0001F601", "\U0001F602","\U0001F603","\U0001F604","\U0001F605",
                       "\U0001F606","\U0001F607","\U0001F609","\U0001F60A", "\U0001F642","\U0001F643","\U0001F923",r"\U0001F970","\U0001F60D", r"\U0001F929","\U0001F618","\U0001F617",
                       r"\U000263A", "\U0001F61A", "\U0001F619", r"\U0001F972", "\U0001F60B", "\U0001F61B", "\U0001F61C", r"\U0001F92A",
                       "\U0001F61D", "\U0001F911", "\U0001F917", r"\U0001F92D", r"\U0001F92B","\U0001F914","\U0001F910", r"\U0001F928", "\U0001F610", "\U0001F611",
                       "\U0001F636", "\U0001F60F","\U0001F612", "\U0001F644","\U0001F62C","\U0001F925","\U0001F60C","\U0001F614","\U0001F62A",
                       "\U0001F924","\U0001F634", "\U0001F920", r"\U0001F973", r"\U0001F978","\U0001F60E","\U0001F913", r"\U0001F9D0"]
    emoji_sad = ["\U0001F637","\U0001F912","\U0001F915","\U0001F922", r"\U0001F92E","\U0001F927", r"\U0001F975", r"\U0001F976", r"\U0001F974",
                       "\U0001F635", r"\U0001F92F", "\U0001F615","\U0001F61F","\U0001F641", r"\U0002639","\U0001F62E","\U0001F62F","\U0001F632",
                       "\U0001F633", r"\U0001F97A","\U0001F626","\U0001F627","\U0001F628","\U0001F630","\U0001F625","\U0001F622","\U0001F62D",
                       "\U0001F631","\U0001F616","\U0001F623"	,"\U0001F61E","\U0001F613","\U0001F629","\U0001F62B", r"\U0001F971",
                       "\U0001F624","\U0001F621","\U0001F620", r"\U0001F92C","\U0001F608","\U0001F47F","\U0001F480", r"\U0002620"]

    words = t.split()
    reformed = []
    for w in words:
        if w in emoji_happy:
            reformed.append("happy")
        elif w in emoji_sad:
            reformed.append("sad") 
        else:
            reformed.append(w)
    t = " ".join(reformed)
    return t


def replace_smileys(t):
    emoticons_happy = set([':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}', ':D',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)', '<3'])

    emoticons_sad = set([':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';('])  

    words = t.split()
    reformed = []
    for w in words:
        if w in emoticons_happy:
            reformed.append("happy")
        elif w in emoticons_sad:
            reformed.append("sad") 
        else:
            reformed.append(w)
    t = " ".join(reformed)
    return t

def replace_contractions(t):
    cont = {"aren't" : 'are not', "can't" : 'cannot', "couln't": 'could not', "didn't": 'did not', "doesn't" : 'does not',
  "hadn't": 'had not', "haven't": 'have not', "he's" : 'he is', "she's" : 'she is', "he'll" : "he will", 
  "she'll" : 'she will',"he'd": "he would", "she'd":"she would", "here's" : "here is", 
   "i'm" : 'i am', "i've"	: "i have", "i'll" : "i will", "i'd" : "i would", "isn't": "is not", 
   "it's" : "it is", "it'll": "it will", "mustn't" : "must not", "shouldn't" : "should not", "that's" : "that is", 
   "there's" : "there is", "they're" : "they are", "they've" : "they have", "they'll" : "they will",
   "they'd" : "they would", "wasn't" : "was not", "we're": "we are", "we've":"we have", "we'll": "we will", 
   "we'd" : "we would", "weren't" : "were not", "what's" : "what is", "where's" : "where is", "who's": "who is",
   "who'll" :"who will", "won't":"will not", "wouldn't" : "would not", "you're": "you are", "you've":"you have",
   "you'll" : "you will", "you'd" : "you would", "mayn't" : "may not"}
    words = t.split()
    reformed = []
    for w in words:
        if w in cont:
            reformed.append(cont[w])
        else:
            reformed.append(w)
    t = " ".join(reformed)
    return t  

def remove_single_letter_words(t):
    words = t.split()
    reformed = []
    for w in words:
        if len(w) > 1:
            reformed.append(w)
    t = " ".join(reformed)
    return t  

def cleantweet(t):
    # replace handwritten emojis with their feeling associated
    t = replace_smileys(t)
    
    # convert to lowercase
    t = t.lower() 
    
    # replace short forms used in english  with their actual words
    t = replace_contractions(t) 
    
    # replace unicode emojis with their feeling a@ashchanchlanissociated
    t = replace_emojis(t) 
    
    # remove emojis other than smiley emojis
    t = emoji_pattern.sub(r'', t) 
    
    # remove NON- ASCII characters
    t = re.sub('\\\\u[0-9A-Fa-f]{4}','', t) 
    
    # remove numbers # re.sub("\d+", "", t)
    t = re.sub("[0-9]", "", t)
    
    # remove '#'
    t = re.sub('#', '', t) 
    
    # remove '@'
    t = re.sub('@[A-Za-z0–9]+', '', t) 
    
    # remove usernames
    t = re.sub('@[^\s]+', '', t) 
    
    # remove retweet 'RT'
    t = re.sub('RT[\s]+', '', t)
    
    # remove links (URLs/ links)
    t = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '', t) 
    
    # remove punctuations
    t = re.sub('[!"$%&\'()*+,-./:@;<=>?[\\]^_`{|}~]', '', t) 
    
    t = t.replace('\\\\', '')
    t = t.replace('\\', '')
    
    # removes single letter words
    t = remove_single_letter_words(t)
    
    return t

In [10]:
df['text'] = df['tweet'].apply(cleantweet)
df

Unnamed: 0,tweet,text
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww that is bummer you shoulda got david carr...
1,is upset that he can't update his Facebook by ...,is upset that he cannot update his facebook by...
2,@Kenichan I dived many times for the ball. Man...,dived many times for the ball managed to save ...
3,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,"@nationwideclass no, it's not behaving at all....",no it is not behaving at all am mad why am her...
...,...,...
1599995,Just woke up. Having no school is the best fee...,just woke up having no school is the best feel...
1599996,TheWDB.com - Very cool to hear old Walt interv...,thewdbcom very cool to hear old walt interviews
1599997,Are you ready for your MoJo Makeover? Ask me f...,are you ready for your mojo makeover ask me fo...
1599998,Happy 38th Birthday to my boo of alll time!!! ...,happy th birthday to my boo of alll time tupac...


#### Preprocessing the text by tokenizing the tweet , then removing all known stopwords, then stemming and lemmatizing

In [11]:
import nltk
nltk.download('wordnet')
nltk.download('punkt')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mainak\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mainak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Storing all English stopwords

In [12]:
stop_words = set(stopwords.words('english'))

In [13]:
def preprocess_tweet(tweet):
    
    #remove stopwords by tokenizing
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [ word for word in tweet_tokens if word not in stop_words]
    
    #Stemming 
    ps = PorterStemmer()
    stemmed_words = [ ps.stem(w) for w in filtered_words]
    
    #Lemmatizing
    lemmatizer = WordNetLemmatizer()
    lemma_words = [ lemmatizer.lemmatize(w, pos='a') for w in stemmed_words]
    
    return " ".join(lemma_words)

In [14]:
df['text'] = df['text'].apply(preprocess_tweet)
df

Unnamed: 0,tweet,text
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww bummer shoulda got david carr third day
1,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,@Kenichan I dived many times for the ball. Man...,dive mani time ball manag save rest go bound
3,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,"@nationwideclass no, it's not behaving at all....",behav mad see
...,...,...
1599995,Just woke up. Having no school is the best fee...,woke school best feel ever
1599996,TheWDB.com - Very cool to hear old Walt interv...,thewdbcom cool hear old walt interview
1599997,Are you ready for your MoJo Makeover? Ask me f...,readi mojo makeov ask detail
1599998,Happy 38th Birthday to my boo of alll time!!! ...,happi th birthday boo alll time tupac amaru sh...


#### Preparing the Dataset using TextBlob library

In [15]:
from textblob import TextBlob
df[['polarity', 'subjectivity']] = df['text'].apply(lambda Text: pd.Series(TextBlob(Text).sentiment))
df

Unnamed: 0,tweet,text,polarity,subjectivity
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww bummer shoulda got david carr third day,0.200,0.450
1,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...,0.000,0.000
2,@Kenichan I dived many times for the ball. Man...,dive mani time ball manag save rest go bound,0.000,0.000
3,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire,0.200,0.400
4,"@nationwideclass no, it's not behaving at all....",behav mad see,-0.625,1.000
...,...,...,...,...
1599995,Just woke up. Having no school is the best fee...,woke school best feel ever,1.000,0.300
1599996,TheWDB.com - Very cool to hear old Walt interv...,thewdbcom cool hear old walt interview,0.225,0.425
1599997,Are you ready for your MoJo Makeover? Ask me f...,readi mojo makeov ask detail,0.000,0.000
1599998,Happy 38th Birthday to my boo of alll time!!! ...,happi th birthday boo alll time tupac amaru sh...,0.000,0.000


In [16]:
def labelling(pol):
    if pol>=0.1:
        return 4
    elif pol<=-0.1:
        return 0
    else:
        return 2

In [17]:
df['sentiment']=df['polarity'].apply(labelling)
df

Unnamed: 0,tweet,text,polarity,subjectivity,sentiment
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww bummer shoulda got david carr third day,0.200,0.450,4
1,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...,0.000,0.000,2
2,@Kenichan I dived many times for the ball. Man...,dive mani time ball manag save rest go bound,0.000,0.000,2
3,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire,0.200,0.400,4
4,"@nationwideclass no, it's not behaving at all....",behav mad see,-0.625,1.000,0
...,...,...,...,...,...
1599995,Just woke up. Having no school is the best fee...,woke school best feel ever,1.000,0.300,4
1599996,TheWDB.com - Very cool to hear old Walt interv...,thewdbcom cool hear old walt interview,0.225,0.425,4
1599997,Are you ready for your MoJo Makeover? Ask me f...,readi mojo makeov ask detail,0.000,0.000,2
1599998,Happy 38th Birthday to my boo of alll time!!! ...,happi th birthday boo alll time tupac amaru sh...,0.000,0.000,2


In [18]:
# Number of Positive Tweets
len(df[df['sentiment']==4])

539818

In [19]:
# Number of Neutral Tweets
len(df[df['sentiment']==2])

845226

In [20]:
# Number of Negative Tweets
len(df[df['sentiment']==0])

214956

In [21]:
y = df['sentiment']
print(y.shape)

(1600000,)


#### Vectorization of text data using Tf-idf Vectorizer

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer()
X = vec.fit_transform(df['text'])

#### Splitting the data into train and test

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3,random_state=41,shuffle=True,stratify=y)

In [24]:
import dask.array as da
classes = da.unique(y).compute()
classes

array([0, 2, 4], dtype=int64)

### Model Training & Evaluation

#### SGD Classifier

In [25]:
from sklearn.linear_model import SGDClassifier
from dask_ml.wrappers import Incremental
est = SGDClassifier(loss='log', penalty='l2', tol=0e-3)
inc = Incremental(est, scoring='accuracy')
for _ in range(10):
    inc.partial_fit(X_train, y_train, classes=classes)
    print('Score:', inc.score(X_test, y_test))

Score: 0.8284354166666666
Score: 0.8272416666666667
Score: 0.8283875
Score: 0.82815
Score: 0.8281729166666667
Score: 0.8279416666666667
Score: 0.8280916666666667
Score: 0.8280958333333334
Score: 0.82811875
Score: 0.8281229166666667


#### Multinomial Naive-Bayes Algorithm

In [26]:
from sklearn.naive_bayes import MultinomialNB
from dask_ml.wrappers import Incremental
mnb = MultinomialNB()
inc2 = Incremental(mnb, scoring='accuracy')
for _ in range(10):
    inc2.partial_fit(X_train, y_train, classes=classes)
    print('Score:', inc2.score(X_test, y_test))

Score: 0.7365145833333333
Score: 0.7486979166666666
Score: 0.753425
Score: 0.755725
Score: 0.7569895833333333
Score: 0.7579145833333333
Score: 0.7585354166666667
Score: 0.7590020833333333
Score: 0.75928125
Score: 0.7595041666666666


#### Bernoulli Naive-Bayes Algorithm

In [27]:
from sklearn.naive_bayes import BernoulliNB
from dask_ml.wrappers import Incremental
bnb = BernoulliNB()
inc3 = Incremental(bnb, scoring='accuracy')
for _ in range(10):
    inc3.partial_fit(X_train, y_train, classes=classes)
    print('Score:', inc3.score(X_test, y_test))

Score: 0.82799375
Score: 0.8483145833333333
Score: 0.8530666666666666
Score: 0.85436875
Score: 0.8544395833333334
Score: 0.8542041666666667
Score: 0.8537125
Score: 0.85314375
Score: 0.85259375
Score: 0.8522354166666667


#### Perceptron Model

In [28]:
from sklearn.linear_model import Perceptron
from dask_ml.wrappers import Incremental
per = Perceptron()
inc4 = Incremental(per, scoring='accuracy')
for _ in range(10):
    inc4.partial_fit(X_train, y_train, classes=classes)
    print('Score:', inc4.score(X_test, y_test))

Score: 0.94745
Score: 0.9484541666666667
Score: 0.9499958333333334
Score: 0.95155625
Score: 0.9510979166666667
Score: 0.9514583333333333
Score: 0.9529291666666667
Score: 0.95285625
Score: 0.9527020833333333
Score: 0.9520270833333333


### Incremental learning on Perceptron Model gives 95.20% accuracy

#### Training SVM Model on 1 Lakh Data

In [29]:
df2 = df[:100000]

In [30]:
vec2 = TfidfVectorizer()
X = vec2.fit_transform(df2['text'])

In [31]:
y= df2['sentiment']

In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3,random_state=41,shuffle=True,stratify=y)

In [33]:
from sklearn import svm
# classify using support vector classifier
svm = svm.SVC(kernel = 'linear', probability=True)

# fit the SVC model based on the given training data
svm.fit(X_train,y_train)

SVC(kernel='linear', probability=True)

In [34]:
y_pred = svm.predict(X_test)

In [35]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9554


### Support Vector classifier gives 95.54% Accuracy

### Exporting Models 

In [36]:
import pickle

#### Exporting Support Vector Classifier Model & Vectorizer

In [37]:
pickle.dump(svm, open('posneg_lac.pkl', 'wb'))
pickle.dump(vec2, open('vec_lac.pkl', 'wb'))

In [38]:
pickle.dump(inc4, open('posneg_full.pkl', 'wb'))
pickle.dump(vec, open('vec_full.pkl', 'wb'))