# TEXT PRE-PROCESSING FOR SENTIMENT ANALYSIS
## IMPORTING MODULES


In [None]:
# Data handling
import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt 
import re
import string

# NLTK: tokenizers, corpora, stemmer/lemmatizer and the VADER sentiment analyzer
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# NOTE(review): star import; the names it brings in are not visible from this file.
from nltk.sentiment.util import *
# Fetch the NLTK data used further down: the stop-word corpus read through
# stopwords.words('english') and the lexicon behind SentimentIntensityAnalyzer.
nltk.download('stopwords')
nltk.download('vader_lexicon')


from collections import Counter

# Plotting libraries
# NOTE(review): pyplot is imported a second time here under the same alias `plt`.
from matplotlib import pyplot as plt
from matplotlib import ticker
import seaborn as sns
import plotly.express as px

# Apply seaborn's "darkgrid" style to the plots drawn after this point
sns.set(style="darkgrid")

## DATA PRE-PROCESSING

In [None]:
# For academic use the big data dataframe was written to csv then read by Pandas
# TODO: replace the placeholder below with the path to that CSV file.
# (The previous inline "#FILEPATH HERE#" placeholder commented out the closing
# parenthesis of read_csv, so the cell could not even be parsed.)
DATA_FILEPATH = 'FILEPATH HERE'
df = pd.read_csv(DATA_FILEPATH)

# Print the shape explicitly: a notebook cell only displays its last
# expression, so the preview below would otherwise hide it (or vice versa).
print(df.shape)
df.head()

##### Select only Needed columns

In [None]:
# Columns kept for the analysis: tweet author, timestamp and tweet body
needed_columns = ['user_name', 'date', 'text']

# Narrow the frame down to those columns only
df = df.loc[:, needed_columns]
df

##### Change the type of some columns


In [None]:
# Swap each user name for the integer code of its category
# (every distinct name gets its own code)
df['user_name'] = df['user_name'].astype('category').cat.codes

# Parse the timestamps and keep only the calendar date, dropping the time part
df['date'] = pd.to_datetime(df['date']).dt.date

# Let displayed text columns show up to 200 characters before truncating
pd.set_option('display.max_colwidth', 200)
df

##### Removing URLS from tweets


In [None]:
# Work on the tweet text column on its own
texts = df.text
# Matches anything starting with "http" (so both http:// and https:// links)
# as well as bare "www." links. Case-insensitive because this step runs
# before the tweets are lower-cased.
_URL_PATTERN = re.compile(r'http\S+|\bwww\.\S+', flags=re.IGNORECASE)


def remove_url(x):
    """Return ``x`` as a string with every URL removed.

    The value is coerced with ``str`` first, so non-string cells do not
    raise. Only the link itself is deleted; surrounding whitespace is left
    as it was.
    """
    return _URL_PATTERN.sub('', str(x))
# Strip the links from every tweet, then print the first two rows
# before and after so the effect can be compared
texts_lr = texts.map(remove_url)
print(texts.head(2))
print(texts_lr.head(2))

##### Converting all tweets to lowercase


In [None]:
def to_lower(x):
    """Return the lower-cased form of the string ``x``."""
    return x.lower()
# Lower-case every URL-free tweet and preview the first two results
texts_lr_lc = texts_lr.map(to_lower)
texts_lr_lc.head(2)

##### Removing punctuations


In [None]:
# Translation table that deletes every character in string.punctuation.
# Built once here rather than on each call, since it never changes.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_puncs(x):
    """Return the string ``x`` with all ``string.punctuation`` characters removed."""
    return x.translate(_PUNCTUATION_TABLE)
# Drop punctuation from every lower-cased tweet and preview the first two results
texts_lr_lc_np = texts_lr_lc.map(remove_puncs)
texts_lr_lc_np.head(2)


##### Removing stopwords


In [None]:
# Extra pandemic-related terms to drop on top of NLTK's English stop words
more_words = ['covid', '#coronavirus', '#coronavirusoutbreak', '#coronavirusPandemic', '#covid19', '#covid_19', '#epitwitter', '#ihavecorona', 'amp', 'coronavirus', 'covid19']
stop_words = set(stopwords.words('english'))
stop_words.update(more_words)

# The tweets were already lower-cased and stripped of punctuation above, so
# the stop words are passed through the same two helpers. Without this,
# entries such as '#covid19', '#coronavirusPandemic' or "don't" could never
# equal a cleaned token and would silently stay in the text.
# list(...) takes a snapshot so the set is not modified while being iterated.
stop_words.update(remove_puncs(to_lower(word)) for word in list(stop_words))


def remove_words(x):
    """Return ``x`` without the whitespace-separated tokens found in ``stop_words``.

    The remaining tokens are re-joined with single spaces.
    """
    return ' '.join(word for word in x.split() if word not in stop_words)


texts_lr_lc_np_ns = texts_lr_lc_np.apply(remove_words)
texts_lr_lc_np_ns

##### Put the cleaned text in main data


In [None]:
# Overwrite the raw tweets in the main frame with their fully cleaned versions
df = df.assign(text=texts_lr_lc_np_ns)


## GENERATING POLARITY


##### Getting the polarity scores for each tweet


In [None]:
# One analyzer instance is shared by all tweets
sid = SentimentIntensityAnalyzer()


def ps(x):
    """Return the analyzer's polarity scores for the text ``x``."""
    return sid.polarity_scores(x)


# Score every cleaned tweet; each element is the result returned by ps
sentiment_scores = df['text'].apply(ps)
sentiment_scores

In [None]:
# Turn the per-tweet score results into a table with one row per tweet
score_records = sentiment_scores.tolist()
sentiment_df = pd.DataFrame(score_records)
sentiment_df

##### Labelling the scores based on the compound polarity value 


In [None]:
def labelize(x):
    """Map a compound polarity value to a sentiment label.

    Exactly zero gives 'Neutral', any value greater than zero gives
    'Positive', and everything else gives 'Negative'.
    """
    if x == 0:
        return 'Neutral'
    if x > 0:
        return 'Positive'
    return 'Negative'
# Derive a text label for every row from its compound score
sentiment_df['label'] = sentiment_df['compound'].map(labelize)
sentiment_df


##### Join the two dataframes


In [None]:
# Attach both sentiment columns in a single join. Joining `label` and
# `compound` in two separate statements that each start from `df` keeps only
# the column joined last, so `data.label` would not exist afterwards.
# NOTE(review): join aligns rows on the index; this assumes `df` still has the
# same 0..n-1 row index as `sentiment_df` -- confirm if rows get filtered upstream.
data = df.join(sentiment_df[['compound', 'label']])
# final dataframe has user_name, date, text, compound (sentiment score) and label (sentiment label)
data

##### Plotting the sentiment score counts 

In [None]:
# Count the tweets per sentiment label. reset_index must be *called*; without
# the parentheses counts_df would hold the bound method instead of a DataFrame.
counts_df = data.label.value_counts().reset_index()
counts_df

In [None]:
#If your ML notebook is independent from this one, export the final dataframe to CSV format:
#df.to_csv (r'\...\exported_dataframe.csv', index = False, header=True)
#The ML notebook will work with this notebook as its base