## Text Pre-processing

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import re
import time
import string
import warnings
import glob
import os

# for all NLP related operations on text
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from nltk.classify import NaiveBayesClassifier
from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# To mock web-browser and scrap tweets
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# To consume Twitter's API
import tweepy
from tweepy import OAuthHandler 

# To identify the sentiment of text
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from textblob.np_extractors import ConllExtractor

# ignoring all the warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# downloading stopwords corpus
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')
nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('conll2000')
nltk.download('brown')
nltk.download('stopwords')
stopwords = set(stopwords.words("english"))

# for showing all the plots inline
%matplotlib inline

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Merge all .csv files

In [2]:
from pathlib import Path
path = 'raw data'
file_list = os.listdir('raw data')

full_path = []

for f in file_list:
    full_path.append(os.path.join(path,f))
    
full_path

['raw data/The Open_2022-01-01_2022-12-31.csv',
 'raw data/EFL Play-Off_2022-01-01_2022-12-31.csv',
 'raw data/World Athletics Championships_2022-01-01_2022-12-31.csv',
 'raw data/EFL League_2022-01-01_2022-12-31.csv',
 'raw data/Rugby World Cup_2022-01-01_2022-12-31.csv',
 "raw data/women's six nations_2022-01-01_2022-12-31.csv",
 'raw data/Super Bowl_2022-01-01_2022-12-31.csv',
 'raw data/World Cup_2022-01-01_2022-12-31.csv',
 'raw data/Invictus Games_2022-01-01_2022-12-31.csv',
 "raw data/Women's EURO_2022-01-01_2022-12-31.csv",
 'raw data/Winter Paralympics_2022-05-12_2022-12-31.csv',
 'raw data/Rainbow Laces_2022-01-01_2022-12-31.csv']

In [3]:
excl_list = []
for file in full_path:
    if file.endswith('.csv'):
        df = pd.read_csv(file)
        df['source'] = file
        excl_list.append(df)
        

# concatenate all DataFrames in the list
# into a single DataFrame, returns new DataFrame.
excl_merged = pd.concat(excl_list, ignore_index=True)
 
# exports the dataframe into excel file
# with specified name.
excl_merged.to_csv('combine_test.csv', index=False)

In [4]:
tweets_df = pd.read_csv('combine_test.csv')

# Get Event Name
tweets_df['source'] = tweets_df['source'].replace(regex=r"\_.*",value="").str.replace('raw data/','')
tweets_df['source'].unique()

array(['The Open', 'EFL Play-Off', 'World Athletics Championships',
       'EFL League', 'Rugby World Cup', "women's six nations",
       'Super Bowl', 'World Cup', 'Invictus Games', "Women's EURO",
       'Winter Paralympics', 'Rainbow Laces'], dtype=object)

## Data Cleaning

In [5]:
tweets_df['tidy_tweets'] = tweets_df['Embedded_text'].str.replace("[^a-zA-Z# ]", "")

  tweets_df['tidy_tweets'] = tweets_df['Embedded_text'].str.replace("[^a-zA-Z# ]", "")


In [6]:
tweets_df = tweets_df[tweets_df['tidy_tweets']!='']
tweets_df.head(2)

Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL,source,tidy_tweets
0,Let's Role,@LetsRoleRPG,2022-01-01T16:30:00.000Z,"Let's Role\n@LetsRoleRPG\n·\nJan 2, 2022",We wish you a Happy New Year \nD-15 before the...,🥂,1,4,27,[],https://twitter.com/LetsRoleRPG/status/1477316...,The Open,We wish you a Happy New Year D before the open...
1,EdTrailblazer,@EdTrailblazer,2022-01-01T19:09:00.000Z,"EdTrailblazer\n@EdTrailblazer\n·\nJan 2, 2022",If you’re a secretary working at school in Ont...,,5,24,184,[],https://twitter.com/EdTrailblazer/status/14773...,The Open,If youre a secretary working at school in Onta...


In [7]:
tweets_df = tweets_df.astype(object).replace(np.nan, '0')
## tweets_df.to_csv('cleaned.csv')

In [8]:
def remove_pattern(text, pattern_regex):
    r = re.findall(pattern_regex, text)
    for i in r:
        text = re.sub(i, '', text)
    return text 

In [9]:
tweets_df['tidy_tweets'] = np.vectorize(remove_pattern)(tweets_df['tidy_tweets'], "@[\w]*: | *RT*")
tweets_df.tail(2)

Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL,source,tidy_tweets
28106,Mistress Of Spooky And Spice,@LauraHuntley79,2022-12-30T00:40:53.000Z,Mistress Of Spooky And Spice\n@LauraHuntley79\...,She waits for her\nWith a bouquet of love.\nHa...,🖤 🔥 🔥 🖤,3,10,43,[],https://twitter.com/LauraHuntley79/status/1608...,Rainbow Laces,She waits for herWith a bouquet of loveHalf pr...
28107,Lexi Powell,@chey134,2022-12-30T17:48:31.000Z,"Lexi Powell\n@chey134\n·\nDec 31, 2022",kholran: It’s finally happened. After almost a...,0,0,0,0,[],https://twitter.com/chey134/status/16088827384...,Rainbow Laces,kholran Its finally happened After almost a de...


In [10]:
cleaned_tweets = []

for index, row in tweets_df.iterrows():
    # Here we are filtering out all the words that contains link
    words_without_links = [word for word in row.tidy_tweets.split() if 'http' not in word]
    cleaned_tweets.append(' '.join(words_without_links))

tweets_df['tidy_tweets'] = cleaned_tweets

In [11]:
tweets_df = tweets_df.drop_duplicates(subset=['tidy_tweets'], keep=False)

In [12]:
tweets_df = tweets_df.reset_index(drop=True)
tweets_df.head(2)

Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL,source,tidy_tweets
0,Let's Role,@LetsRoleRPG,2022-01-01T16:30:00.000Z,"Let's Role\n@LetsRoleRPG\n·\nJan 2, 2022",We wish you a Happy New Year \nD-15 before the...,🥂,1,4,27,[],https://twitter.com/LetsRoleRPG/status/1477316...,The Open,We wish you a Happy New Year D before the open...
1,EdTrailblazer,@EdTrailblazer,2022-01-01T19:09:00.000Z,"EdTrailblazer\n@EdTrailblazer\n·\nJan 2, 2022",If you’re a secretary working at school in Ont...,0,5,24,184,[],https://twitter.com/EdTrailblazer/status/14773...,The Open,If youre a secretary working at school in Onta...


In [13]:
stopwords_set = set(stopwords)
cleaned_tweets = []

for index, row in tweets_df.iterrows():
    
    # filerting out all the stopwords 
    words_without_stopwords = [word for word in row.tidy_tweets.split() if not word in stopwords_set and '#' not in word.lower()]
    
    # finally creating tweets list of tuples containing stopwords(list) and sentimentType 
    cleaned_tweets.append(' '.join(words_without_stopwords))
    
tweets_df['tidy_tweets'] = cleaned_tweets
tweets_df.head(2)

Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL,source,tidy_tweets
0,Let's Role,@LetsRoleRPG,2022-01-01T16:30:00.000Z,"Let's Role\n@LetsRoleRPG\n·\nJan 2, 2022",We wish you a Happy New Year \nD-15 before the...,🥂,1,4,27,[],https://twitter.com/LetsRoleRPG/status/1477316...,The Open,We wish Happy New Year D open beta
1,EdTrailblazer,@EdTrailblazer,2022-01-01T19:09:00.000Z,"EdTrailblazer\n@EdTrailblazer\n·\nJan 2, 2022",If you’re a secretary working at school in Ont...,0,5,24,184,[],https://twitter.com/EdTrailblazer/status/14773...,The Open,If youre secretary working school Ontario prob...


In [14]:
tokenized_tweet = tweets_df['tidy_tweets'].apply(lambda x: x.split())
tokenized_tweet.head()

0          [We, wish, Happy, New, Year, D, open, beta]
1    [If, youre, secretary, working, school, Ontari...
2    [AIZONAAZ, decide, fate, year, electing, new, ...
3    [Of, course, friends, know, alreadyBut, good, ...
4    [Why, taking, long, people, wake, understand, ...
Name: tidy_tweets, dtype: object

In [15]:
word_lemmatizer = WordNetLemmatizer()

tokenized_tweet = tokenized_tweet.apply(lambda x: [word_lemmatizer.lemmatize(i) for i in x])
tokenized_tweet.head()

0          [We, wish, Happy, New, Year, D, open, beta]
1    [If, youre, secretary, working, school, Ontari...
2    [AIZONAAZ, decide, fate, year, electing, new, ...
3    [Of, course, friend, know, alreadyBut, good, s...
4    [Why, taking, long, people, wake, understand, ...
Name: tidy_tweets, dtype: object

In [16]:
for i, tokens in enumerate(tokenized_tweet):
    tokenized_tweet[i] = ' '.join(tokens)

tweets_df['tidy_tweets'] = tokenized_tweet

In [17]:
tweets_df['Comments']

0         1
1         5
2         3
3        45
4        75
         ..
25448     0
25449     2
25450     0
25451     3
25452     0
Name: Comments, Length: 25453, dtype: object

In [18]:
tweets_df['Timestamp'] = tweets_df['Timestamp'].astype('datetime64[ns]')
tweets_df['Likes'] = tweets_df['Likes'].str.replace(',','').str.replace('K','').astype('float')
tweets_df['Comments'] = tweets_df['Comments'].str.replace(',','').str.replace('K','').astype('float')
tweets_df['Retweets'] = tweets_df['Retweets'].str.replace(',','').str.replace('K','').astype('float')
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25453 entries, 0 to 25452
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   UserScreenName  25453 non-null  object        
 1   UserName        25453 non-null  object        
 2   Timestamp       25453 non-null  datetime64[ns]
 3   Text            25453 non-null  object        
 4   Embedded_text   25453 non-null  object        
 5   Emojis          25453 non-null  object        
 6   Comments        25453 non-null  float64       
 7   Likes           25453 non-null  float64       
 8   Retweets        25453 non-null  float64       
 9   Image link      25453 non-null  object        
 10  Tweet URL       25453 non-null  object        
 11  source          25453 non-null  object        
 12  tidy_tweets     25453 non-null  object        
dtypes: datetime64[ns](1), float64(3), object(9)
memory usage: 2.5+ MB


## Get Sentiment Score
reference for the defination of SIA score: https://analyticsindiamag.com/sentiment-analysis-made-easy-using-vader/

In [19]:
def fetch_sentiment_using_SIA(text):
    sid = SentimentIntensityAnalyzer()
    polarity_scores = sid.polarity_scores(text)
    return 'neg' if polarity_scores['neg'] > polarity_scores['pos'] else 'pos'

def fetch_sentiment_score(text):
    sid = SentimentIntensityAnalyzer()
    polarity_scores = sid.polarity_scores(text)
    return polarity_scores

In [20]:
sentiments_using_SIA = tweets_df.tidy_tweets.apply(lambda tweet: fetch_sentiment_using_SIA(tweet))
tweets_df['sentiment'] = sentiments_using_SIA
pd.DataFrame(sentiments_using_SIA.value_counts())

Unnamed: 0,tidy_tweets
pos,21746
neg,3707


In [21]:
tweets_df['sentiment_score'] = tweets_df.tidy_tweets.apply(lambda tweet: fetch_sentiment_score(tweet))

In [22]:
tweets_df["sentiment_score"].apply(pd.Series)
tweets_df.head(2)

Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL,source,tidy_tweets,sentiment,sentiment_score
0,Let's Role,@LetsRoleRPG,2022-01-01 16:30:00,"Let's Role\n@LetsRoleRPG\n·\nJan 2, 2022",We wish you a Happy New Year \nD-15 before the...,🥂,1.0,4.0,27.0,[],https://twitter.com/LetsRoleRPG/status/1477316...,The Open,We wish Happy New Year D open beta,pos,"{'neg': 0.0, 'neu': 0.439, 'pos': 0.561, 'comp..."
1,EdTrailblazer,@EdTrailblazer,2022-01-01 19:09:00,"EdTrailblazer\n@EdTrailblazer\n·\nJan 2, 2022",If you’re a secretary working at school in Ont...,0,5.0,24.0,184.0,[],https://twitter.com/EdTrailblazer/status/14773...,The Open,If youre secretary working school Ontario prob...,neg,"{'neg': 0.167, 'neu': 0.669, 'pos': 0.164, 'co..."


In [65]:
# Working in progress for codes here
# tweets_df['sentiment'].iloc[]

# # tweets_df.sentiment.apply(lambda tweet: sen_pos(tweet))

In [23]:
tweets_df = pd.concat([tweets_df, tweets_df["sentiment_score"].apply(pd.Series)], axis=1)
tweets_df = tweets_df.drop(columns="sentiment_score")

In [24]:
tweets_df.head(2)

Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL,source,tidy_tweets,sentiment,neg,neu,pos,compound
0,Let's Role,@LetsRoleRPG,2022-01-01 16:30:00,"Let's Role\n@LetsRoleRPG\n·\nJan 2, 2022",We wish you a Happy New Year \nD-15 before the...,🥂,1.0,4.0,27.0,[],https://twitter.com/LetsRoleRPG/status/1477316...,The Open,We wish Happy New Year D open beta,pos,0.0,0.439,0.561,0.7506
1,EdTrailblazer,@EdTrailblazer,2022-01-01 19:09:00,"EdTrailblazer\n@EdTrailblazer\n·\nJan 2, 2022",If you’re a secretary working at school in Ont...,0,5.0,24.0,184.0,[],https://twitter.com/EdTrailblazer/status/14773...,The Open,If youre secretary working school Ontario prob...,neg,0.167,0.669,0.164,-0.0258


## Pivot Table

In [25]:
tweets_df.to_csv('sentiment_analysis_test.csv')
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25453 entries, 0 to 25452
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   UserScreenName  25453 non-null  object        
 1   UserName        25453 non-null  object        
 2   Timestamp       25453 non-null  datetime64[ns]
 3   Text            25453 non-null  object        
 4   Embedded_text   25453 non-null  object        
 5   Emojis          25453 non-null  object        
 6   Comments        25453 non-null  float64       
 7   Likes           25453 non-null  float64       
 8   Retweets        25453 non-null  float64       
 9   Image link      25453 non-null  object        
 10  Tweet URL       25453 non-null  object        
 11  source          25453 non-null  object        
 12  tidy_tweets     25453 non-null  object        
 13  sentiment       25453 non-null  object        
 14  neg             25453 non-null  float64       
 15  ne

In [32]:
dfp = tweets_df.pivot_table(index = ['source'], 
                      values = ['tidy_tweets','Likes','Retweets','Comments','compound','sentiment'],
                      aggfunc = {'tidy_tweets':'count',
                                  'Likes':sum,
                                 'Retweets':sum,
                                 'Comments':sum,
                                 'compound':np.mean,
                           "sentiment": 'count'})
dfp['compound'] = dfp['compound'].round(2)

In [33]:
dfp.columns = [x.capitalize() for x in dfp.columns]
dfp

Unnamed: 0_level_0,Comments,Likes,Retweets,Compound,Sentiment,Tidy_tweets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
EFL League,11018.0,39075.0,279646.7,0.26,4199,4199
EFL Play-Off,8129.0,10401.0,47262.0,0.38,801,801
Invictus Games,23864.0,123539.0,561015.6,0.26,4503,4503
Rainbow Laces,17337.0,17665.0,120921.0,0.37,2524,2524
Rugby World Cup,2701.0,12315.0,61932.9,0.28,363,363
Super Bowl,43034.0,243891.2,734563.1,0.58,359,359
The Open,42345.0,249248.1,722767.7,0.09,365,365
Winter Paralympics,60.0,339.0,1250.0,0.34,208,208
Women's EURO,7057.0,38159.0,212308.7,0.34,3989,3989
World Athletics Championships,1363.0,11005.0,48315.0,0.64,356,356


In [34]:
dfp['Comments'] = dfp['Comments']/dfp['Tidy_tweets']
dfp['Likes'] = dfp['Likes']/dfp['Tidy_tweets']
dfp['Retweets'] = dfp['Retweets']/dfp['Tidy_tweets']

In [35]:
dfp

Unnamed: 0_level_0,Comments,Likes,Retweets,Compound,Sentiment,Tidy_tweets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
EFL League,2.623958,9.305787,66.598404,0.26,4199,4199
EFL Play-Off,10.148564,12.985019,59.003745,0.38,801,801
Invictus Games,5.299578,27.434821,124.587075,0.26,4503,4503
Rainbow Laces,6.868859,6.998811,47.908479,0.37,2524,2524
Rugby World Cup,7.440771,33.92562,170.61405,0.28,363,363
Super Bowl,119.871866,679.362674,2046.136769,0.58,359,359
The Open,116.013699,682.871507,1980.185479,0.09,365,365
Winter Paralympics,0.288462,1.629808,6.009615,0.34,208,208
Women's EURO,1.769115,9.566057,53.22354,0.34,3989,3989
World Athletics Championships,3.828652,30.912921,135.716292,0.64,356,356


In [36]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

dfp[['Comments','Likes','Retweets','Tidy_tweets']] = scaler.fit_transform(dfp[['Comments','Likes','Retweets','Tidy_tweets']])
dfp

Unnamed: 0_level_0,Comments,Likes,Retweets,Compound,Sentiment,Tidy_tweets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
EFL League,0.01953,0.011268,0.029699,0.26,4199,0.655767
EFL Play-Off,0.082454,0.016668,0.025976,0.38,801,0.097437
Invictus Games,0.041905,0.037879,0.058123,0.26,4503,0.705718
Rainbow Laces,0.055028,0.007881,0.020537,0.37,2524,0.380546
Rugby World Cup,0.05981,0.047407,0.080683,0.28,363,0.025468
Super Bowl,1.0,0.994849,1.0,0.58,359,0.024811
The Open,0.967737,1.0,0.967673,0.09,365,0.025797
Winter Paralympics,0.0,0.0,0.0,0.34,208,0.0
Women's EURO,0.012382,0.01165,0.023143,0.34,3989,0.621262
World Athletics Championships,0.029604,0.042985,0.063578,0.64,356,0.024318


In [37]:
dfp.to_csv('sentiment_analysis_score.csv')