In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline


# Read the five yearly tweet exports (paths are relative to the working directory).
endsars2017 = pd.read_csv('endsars_2017.csv')
endsars2018 = pd.read_csv('endsars_2018.csv')
endsars2019 = pd.read_csv('endsars_2019.csv')
endsars2020 = pd.read_csv('endsars_2020.csv')
endsars2021 = pd.read_csv('endsars_2021.csv')

# Report how many rows each yearly file contributed.
print('endsars2017: ', endsars2017.shape[0],
      '\nendsars2018: ', endsars2018.shape[0],
      '\nendsars2019: ', endsars2019.shape[0],
      '\nendsars2020: ', endsars2020.shape[0],
      '\nendsars2021: ', endsars2021.shape[0])

# Stack the yearly frames row-wise and renumber the index from 0.
yearly_frames = [endsars2017, endsars2018, endsars2019, endsars2020, endsars2021]
data = pd.concat(yearly_frames, axis=0, ignore_index=True)

data.head(3)

### Date Engineering

# Cut the calendar parts out of the textual 'date' column by character position:
# characters 5-6 -> Month, 8-9 -> Day, 0-3 -> Year.
# NOTE(review): this presumes values shaped like 'YYYY-MM-DD...' - confirm against the CSVs.
date_text = data['date'].str
data['Month'] = date_text.slice(5, 7)
data['Day'] = date_text.slice(8, 10)
data['Year'] = date_text.slice(0, 4)

data.info()

# Reassemble the parts as 'MM/DD/YYYY', the layout that findDay parses.
data['full_date'] = data['Month'] + '/' + data['Day'] + '/' + data['Year']

import datetime
import calendar

def findDay(date):
    """Return the weekday name for a date string.

    Parameters
    ----------
    date : str
        A date written as 'MM/DD/YYYY' (parsed with the '%m/%d/%Y' format).

    Returns
    -------
    str
        The entry of ``calendar.day_name`` for that date, e.g. 'Monday'.

    Raises
    ------
    ValueError
        Raised by ``strptime`` when ``date`` does not match the format.
    """
    parsed = datetime.datetime.strptime(date, '%m/%d/%Y')
    weekday_index = parsed.weekday()  # index into calendar.day_name
    return calendar.day_name[weekday_index]


# Weekday name for every row, derived from the 'MM/DD/YYYY' string in full_date.
data['dayofdate'] = data['full_date'].apply(findDay)

# Swap the two-digit month codes for month names.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on data['Month']: that chained in-place form acts
# on a temporary Series under pandas Copy-on-Write, so the frame would stay
# unchanged (and the warning about it is silenced by the notebook-wide
# warnings filter).
data['Month'] = data['Month'].replace({'01':'January',
                                       '02':'February',
                                       '03':'March',
                                       '04':'April','05':'May','06':'June','07':'July',
                                       '08':'August','09':'September','10':'October','11':'November','12':'December'})

data.head(3)

### Removing Unwanted Columns

data.columns

# Keep only the text column and the derived date fields.
kept_columns = ['content', 'Month', 'Day', 'Year', 'full_date', 'dayofdate']
df = pd.DataFrame(data, columns=kept_columns)

# Persist the trimmed frame, without the index column, for the next stage.
df.to_csv('eda2.csv', index=False)

### Text Cleaning

In [None]:
df = pd.read_csv('eda2.csv')
df.head(3)

# From here on the text column is called 'tweets' instead of 'content'.
df = df.rename(columns={'content': 'tweets'})

df.head(3)

import tweepy
from textblob import TextBlob

import tweepy
from textblob import TextBlob
from wordcloud import WordCloud
import re
import matplotlib.pyplot as plt

# Select matplotlib's 'fivethirtyeight' style sheet for figures drawn after this point.
plt.style.use('fivethirtyeight')

def text_cleaning(text):
    """Strip Twitter-specific noise from a single tweet.

    Removes @mentions, '#' symbols (the tag word itself is kept), the
    standalone retweet marker 'RT' followed by whitespace, and http/https
    links. Surrounding whitespace is left as it is.

    Parameters
    ----------
    text : str or any
        The raw tweet. Non-string values (for example a missing value such
        as NaN or None) are returned unchanged.

    Returns
    -------
    str or any
        The cleaned tweet, or the input itself when it is not a string.
    """
    # Missing values pass through untouched so a later dropna() can remove
    # them; re.sub would raise TypeError on a non-string.
    if not isinstance(text, str):
        return text

    text = re.sub(r'@[A-Za-z0-9_]+', '', text) # Removing all @mentions (handles may contain '_')
    text = re.sub(r'#', '', text) # Removing all # symbols
    text = re.sub(r'\bRT\b\s+', '', text) # Removing RT only as a whole word, so 'SMART phone' stays intact
    text = re.sub(r'https?:\/\/\S+', '', text) # removing all hyperlinks

    return text

# Clean every tweet element-wise with text_cleaning.
df['tweets'] = df['tweets'].map(text_cleaning)

### Subjectivity and Polarity

# Discard every row that holds a missing value in any column.
df = df.dropna()

def get_subjectivity(text):
    """Return the subjectivity component of TextBlob's sentiment for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

def get_polarity(text):
    """Return the polarity component of TextBlob's sentiment for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

# Score each tweet with the two TextBlob helpers.
tweet_text = df['tweets']
df['subjectivity'] = tweet_text.map(get_subjectivity)
df['polarity'] = tweet_text.map(get_polarity)

# Remove incomplete rows first, then exact duplicate rows.
df = df.dropna().drop_duplicates()

df

def check_sentiment(score):
    """Map a polarity score to a sentiment label.

    Returns 'Negative' when ``score`` is below zero, 'Neutral' when it equals
    zero, and 'Positive' in every other case.
    """
    return 'Negative' if score < 0 else ('Neutral' if score == 0 else 'Positive')
    
# Label every tweet from its polarity score via check_sentiment.
df['Sentiment'] = df['polarity'].map(check_sentiment)

df



import plotly as py
import cufflinks as cf

from plotly.offline import iplot

# Initialise plotly's offline notebook mode (py is the plotly package imported above).
py.offline.init_notebook_mode(connected=True)
# Put cufflinks into offline mode as well.
cf.go_offline()

# Save the scored and labelled tweets, without the index column.
df.to_csv('endsars_sentiment.csv',index=False)

### Data Import

df = pd.read_csv('endsars_sentiment.csv')
df.head(3)

df.isnull().sum()

# Drop rows whose tweet text is missing after the reload.
# NOTE(review): presumably these are tweets that were cleaned down to an empty
# string and came back from the CSV as NaN - confirm against the counts above.
df.dropna(subset = ['tweets'], inplace = True)

df.isnull().sum()

# One long string holding every tweet, separated by single spaces.
all_tweets = ' '.join(df['tweets'].tolist())

# Show only the total length and the first 500 characters; echoing the whole
# corpus would flood the notebook output and bloat the saved file.
len(all_tweets), all_tweets[:500]

### FEATURE ENGINEERING

# Character count of each tweet.
df['review_len'] = df['tweets'].map(len)
# Number of whitespace-separated words in each tweet.
df['word_len'] = df['tweets'].map(lambda tweet: len(tweet.split()))


#function to calculate the average word length (characters per word) of a tweet
def avg_num_word(x):
    """Return the mean number of characters per whitespace-separated word.

    Parameters
    ----------
    x : str
        The text to measure; it is split on whitespace with ``str.split()``.

    Returns
    -------
    float
        Total characters across all words divided by the number of words,
        or 0.0 when ``x`` contains no words (empty or whitespace-only).
    """
    words = x.split()
    # Guard the empty case explicitly instead of padding the divisor with a
    # small constant, which skewed every result slightly downward
    # (e.g. 1.5 became 1.49999..., changing how it rounds).
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

# Mean word length per tweet, passed through Python's round() and stored as int.
mean_word_len = df['tweets'].map(avg_num_word)
df['avg_word_len'] = mean_word_len.map(lambda value: int(round(value, 0)))

df

# Save the frame with the engineered length features, without the index column.
df.to_csv('engineered_data.csv', index=False)

In [None]:
### Distribution of Sentiment Polarity

df = pd.read_csv('engineered_data.csv')

# Overview chart of the whole frame.
# NOTE(review): this passes every column, including the tweet text, to iplot -
# confirm that a whole-frame chart is really wanted here.
df.iplot()

# Raw scores in row order, using iplot's default chart type.
df['polarity'].iplot()
df['subjectivity'].iplot()

# One histogram per score. Each chart is drawn once; the earlier copy-pasted
# repeats of these exact calls produced identical duplicate figures.
df['polarity'].iplot(kind= 'hist', color='red', bins = 50,
                    xTitle = 'polarity', yTitle = 'Count', title = 'Sentiment Polarity Distribution')

df['subjectivity'].iplot(kind= 'hist', color='blue', bins = 50,
                    xTitle = 'subjectivity', yTitle = 'Count', title = 'Subjectivity Distribution')

### Distribution of Tweets by Year and Month

# Tweets per year: histogram over the Year column.
df['Year'].iplot(kind='hist', xTitle = 'Year', yTitle = 'Count', title = 'Year Distribution')

# Tweets per month: count the rows for each month first, then draw one bar per
# month. Passing the raw column to a bar chart would draw one bar per row with
# the month label as its value, which is not the count the y-axis title promises.
month_counts = df['Month'].value_counts()
month_counts.iplot(kind='bar', xTitle = 'Month', yTitle = 'Count', title = 'Month Distribution')

### Distribution of Tweet Lengths and Word Lengths

# Histogram specs: (column, x-axis label, chart title).
length_charts = (
    ('review_len', 'Review Text Length', 'Review Lenght Distribution'),
    ('word_len', 'Number of Word in Tweets', 'Word Length Distribution'),
    ('avg_word_len', 'Average Word Length', 'Average Word Length Distribution'),
)

# Draw one histogram per engineered length feature.
for column_name, x_label, chart_title in length_charts:
    df[column_name].iplot(kind='hist', xTitle=x_label, yTitle='Count', title=chart_title)

### Categorical Distribution

# Per-column counts for each (Month, Year) combination.
df.groupby(['Month','Year']).count()

# Per-column counts for each month, across all years.
df.groupby(['Month']).count()

# Per-column counts for each year.
df.groupby(['Year']).count()

