# TWITTER SENTIMENT ANALYSIS
#### This notebook walks through the basics of sentiment analysis using Tweets from the EURO 2020 final.

In [None]:
# import the pandas library to read the dataset
import pandas as pd

In [None]:
!pip install tweepy
!pip install snscrape

Collecting snscrape
  Downloading snscrape-0.3.4-py3-none-any.whl (35 kB)
Installing collected packages: snscrape
Successfully installed snscrape-0.3.4


In [None]:
# Mount Google Drive in the Colab runtime; the labelled CSV is read from
# /content/drive/MyDrive further down.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
 # importing libraries and packages
import snscrape.modules.twitter as sntwitter
import pandas

# Number of tweets you want to scrape
MAX_USER_TWEETS = 1000

# Creating list to append tweet data
tweets_list1 = []

# Using TwitterSearchScraper to scrape data and append tweets to list.
# A plain for-loop is kept on purpose: the next cell inspects the leftover
# loop variable `tweet`.
user_scraper = sntwitter.TwitterSearchScraper('from:jack')  # declare a username
for i, tweet in enumerate(user_scraper.get_items()):
    # `i` starts at 0, so stop as soon as MAX_USER_TWEETS items were stored
    # (the previous `i > 1000` check collected 1001 tweets).
    if i >= MAX_USER_TWEETS:
        break
    tweets_list1.append([tweet.date, tweet.id, tweet.content])  # declare the attributes to be returned

# Creating a dataframe from the tweets list above and dropping rows with
# missing values. Chaining dropna (instead of inplace=True) builds the final
# frame in a single assignment.
tweets_df1 = pd.DataFrame(
    tweets_list1, columns=['Datetime', 'Tweet Id', 'Text']
).dropna(axis=0)

In [None]:
# Display the loop variable left over from the scraping loop, i.e. the last
# item that loop fetched.
# NOTE(review): assumes the scraping cell directly above was the last cell to assign `tweet`.
tweet

In [None]:
import snscrape.modules.twitter as sntwitter
import pandas

# Number of search results you want to scrape
MAX_SEARCH_TWEETS = 5000

# Creating list to append tweet data to
tweets_list2 = []

# Using TwitterSearchScraper to scrape data and append tweets to list
search_scraper = sntwitter.TwitterSearchScraper('COVID Vaccine since:2021-01-01 until:2021-05-31')
for i, tweet in enumerate(search_scraper.get_items()):
    # `i` starts at 0, so stop as soon as MAX_SEARCH_TWEETS items were stored
    # (the previous `i > 5000` check collected 5001 tweets).
    if i >= MAX_SEARCH_TWEETS:
        break
    tweets_list2.append([tweet.date, tweet.id, tweet.content, tweet.user.username])

# Creating a dataframe from the tweets list above
tweets_df2 = pd.DataFrame(tweets_list2, columns=['Datetime', 'Tweet Id', 'Text', 'Username'])

In [None]:
# Read the dataset and display the first 5 rows.
# The file is decoded as ISO-8859-1: according to the original note, some text in
# the Tweet column has characters like ™, ®, © that could otherwise be turned
# into unwanted characters.
import pandas as pd

labelled_csv_path = "/content/drive/MyDrive/Python Updated Scripts EURO.csv"
df = pd.read_csv(labelled_csv_path, encoding="ISO-8859-1")
df.head(5)

Unnamed: 0,Datetime,Like count,Quoted Tweet,Reply,Retweet count,Retweeted,Username,Tweet,Sentiment,Tweet id
0,7/12/2021 23:55,2.0,No,No,0.0,No,Ryandavies_13,It is my birthday today<U+0001F601> #birthday ...,1.0,1.41e+18
1,7/12/2021 23:55,1.0,No,No,0.0,No,TheTycoon2,"So #EuroFinal, a penalty shootout and #England...",1.0,1.41e+18
2,7/12/2021 23:41,0.0,No,No,0.0,No,Beatlebun,guess who lost the penalties yesterday #EURO20...,1.0,1.41e+18
3,7/12/2021 23:49,0.0,No,Yes,0.0,No,cancelracismnow,@jadjaya @chiellini The number of times I said...,1.0,1.41e+18
4,7/12/2021 23:58,0.0,No,Yes,0.0,No,JRF1875,@GaryLineker @England You're a bit late with t...,0.0,1.41e+18


In [None]:
# Keep the first 999 rows (positions 0-998) and preview the last five of them
df = df.head(999)
df.tail()

Unnamed: 0,Datetime,Like count,Quoted Tweet,Reply,Retweet count,Retweeted,Username,Tweet,Sentiment,Tweet id
994,7/12/2021 19:38,0.0,No,No,0.0,No,n00byz,@Savills made it on #channel4news #Euro2...,1.0,1.41e+18
995,7/12/2021 19:37,31.0,Yes,No,10.0,No,teh_jimzor,"""If you abuse anyone on social media, you're n...",1.0,1.41e+18
996,7/12/2021 19:39,1.0,No,No,1.0,No,callumowennn17,#EURO2020 #ITA vs #ENG - twitter reacts to #Eu...,1.0,1.41e+18
997,7/12/2021 19:37,3.0,Yes,No,0.0,No,LisaS_1981,"Saka, just keep your head high and please igno...",2.0,1.41e+18
998,7/12/2021 19:37,8.0,Yes,No,7.0,No,fionaboothHT,I shared your tweet at the end of Collective W...,2.0,1.41e+18


## Data preprocessing

The columns needed for this sentiment analysis are "Tweet" and "Sentiment".

The Sentiment column was labelled by hand in Excel so that the model can learn the sentiment associated with each tweet:

- 0 - Negative
- 1 - Neutral
- 2 - Positive

In [None]:
# Model input (raw tweet text) and target (sentiment label) taken from the labelled frame
x, y = df["Tweet"], df["Sentiment"]

The next step is to remove punctuation, hashtag symbols, and stopwords such as "a", "the", and "an" that do not affect the meaning of the tweets.


In [None]:
import nltk

In [None]:
# Fetch NLTK's stop-word corpus, then import the preprocessing helpers:
# the stop-word corpus reader and the Porter stemmer.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Create the Porter stemmer and load the English stop-word list;
# both are applied to the tweets in the cleaning step that follows.
stemmer = PorterStemmer()
stop_words = stopwords.words('english')

In [None]:
# Next, we normalise every tweet: keep letters only, drop isolated single
# letters, lower-case, remove English stop words and stem what is left.

import re

# A set gives constant-time membership tests; the stop-word list would be
# scanned from the start for every single word.
stop_word_set = set(stop_words)


def clean_tweet(text):
    """Return the cleaned, stemmed form of one tweet as a space-joined string.

    Args:
        text: Raw tweet text. Any non-string value (for example NaN from an
            empty CSV cell) is treated as an empty tweet instead of raising
            a TypeError inside ``re.sub``.

    Returns:
        The lower-cased, stop-word-free, stemmed words joined by single spaces
        (an empty string when nothing is left).
    """
    if not isinstance(text, str):
        return ''
    letters_only = re.sub('[^a-zA-Z]', ' ', text)  # every non-letter becomes a space
    # Drops a single letter that has whitespace on both sides. A lone letter at
    # the very start or end of the text is not matched by this pattern.
    without_single_letters = re.sub(r'\s+[a-zA-Z]\s+', ' ', letters_only)
    words = without_single_letters.lower().split()  # lower-case, then split into words
    # Remove stop words, stem the remaining words and join them into a sentence
    return ' '.join(stemmer.stem(word) for word in words if word not in stop_word_set)


# One cleaned string per tweet, in the same order as `x`
cleaned_data = [clean_tweet(text) for text in x]

In [None]:
# Preview a few cleaned tweets instead of dumping the whole list into the output
cleaned_data[:5]

## Bag of Words
Bag of words is a simplified text representation used in natural language processing. It creates a matrix in which each row represents a sentence (here, a tweet) and each word has its own column holding its frequency in that sentence.


In [None]:
# CountVectorizer converts the list of cleaned tweets into a bag of words.
# max_features=200 means only the 200 most frequent terms become columns.
# stop_words lists terms that appear frequently in the dataset but carry no
# sentiment. The tweets were already stemmed and PorterStemmer turns "https"
# into "http", so "http" has to be listed for link tokens to be dropped;
# "https" is kept so unstemmed input is still covered.

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=200, stop_words=["https", "http", "euro", "final", "england"])
bag_of_words = cv.fit_transform(cleaned_data).toarray()

In [None]:
# Print the count matrix built by the vectorizer (one row per cleaned tweet, one column per kept term)
print(bag_of_words)

## Training the Model
A Multinomial Naive Bayes classifier is trained using the input (`bag_of_words`, built from x = df["Tweet"]) and the output (y = df["Sentiment"]).

In [None]:
# Create an untrained Multinomial Naive Bayes classifier; it is fitted in the next cell
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

In [None]:
# Split the dataset into train and test, then train the model

from sklearn.model_selection import train_test_split

# test size is 30% of the data. random_state fixes the shuffle so the split,
# and therefore the scores reported afterwards, are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    bag_of_words, y, test_size=0.3, random_state=42
)
model.fit(X_train, y_train)

MultinomialNB()

In [None]:
# Evaluate on the held-out tweets: classification_report lists precision,
# recall and f1 score per class together with the overall accuracy.
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
cf = classification_report(y_true=y_test, y_pred=y_pred)
print(cf)

              precision    recall  f1-score   support

         0.0       0.42      0.29      0.34        38
         1.0       0.79      0.86      0.82       207
         2.0       0.51      0.45      0.48        55

    accuracy                           0.71       300
   macro avg       0.57      0.53      0.55       300
weighted avg       0.69      0.71      0.70       300



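The overall accuracy of the model on the held-out test set is 71% in the run shown above (an earlier run gave 68%; the figure depends on how the data is split into train and test sets).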

In [None]:
import pickle

# save the model to disk; the `with` block closes the file even if dump() fails
filename = 'finalized_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# The model was trained on the columns produced by the fitted vectorizer `cv`,
# so the same vectorizer is needed to encode new tweets. Save it alongside the
# model (in a separate file, so the model file keeps its existing content).
vectorizer_filename = 'finalized_vectorizer.pkl'
with open(vectorizer_filename, 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)

In [None]:
#read the data
# Extract the text of the scraped tweets that should be scored
x = tweets_df1["Text"]

import nltk

# import libraries to help with preprocessing
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer

# English stop-word list and Porter stemmer, the same settings used for the training tweets
stop_words=stopwords.words('english')
stemmer=PorterStemmer()

# Next, we clean the scraped tweets exactly like the training tweets

import re
cleaned_data=[]  # a list to store all cleaned tweets
for i in range(len(x)):  # iterates through every tweet

    tweet=re.sub('[^a-zA-Z]', ' ', x.iloc[i])  # replaces every non-letter with a space
    tweet=re.sub(r'\s+[a-zA-Z]\s+', ' ', tweet)  # removes single letters surrounded by whitespace
    tweet=tweet.lower().split()  # lower-cases the text and splits it into words

    tweet=[stemmer.stem(word) for word in tweet if (word not in stop_words)]  # removes stop words and stems the rest
    tweet=' '.join(tweet)  # joins the words to make a sentence
    cleaned_data.append(tweet) # appends all individual sentences to form a list


# Encode the new tweets with the vectorizer that was fitted on the training
# tweets (`cv`). Creating and fitting a new CountVectorizer here would learn a
# different vocabulary from the scraped tweets, so each column would stand for
# a different word than the one the model was trained on (and the number of
# columns could differ), which makes the predictions meaningless.
bag_of_words = cv.transform(cleaned_data).toarray()


y_pred = model.predict(bag_of_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Store the predicted labels next to the scraped tweets as a new "Sentiment" column
# (y_pred is the model output for these tweets, one label per row in row order)
tweets_df1["Sentiment"] = y_pred

In [None]:
# Distinct sentiment labels that were predicted for the scraped tweets
tweets_df1["Sentiment"].unique()

array([1., 0., 2.])