In [1]:
import json
import csv
import ast
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
import re
from datetime import date, datetime
import time
import numpy as np
import nltk
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.sentiment import SentimentIntensityAnalyzer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize, word_tokenize
import statistics
sentiment = SentimentIntensityAnalyzer()

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline


# Import Files

In [2]:
# These are the data loaded for sentiment analysis
df_clean_cat_bcovid = pd.read_csv(r'df_clean_cat_bcovid.csv')
df_clean_cat_acovid = pd.read_csv(r'df_clean_cat_acovid.csv')

# This is the training data for LSTM model
df = pd.read_csv('Twitter_Data.csv')

In [54]:
df_clean_cat_all = pd.concat([df_clean_cat_bcovid,df_clean_cat_acovid], axis = 0)

# Train data to identify sentitments with Twitter data

## Clean the twitter data

In [4]:
# drop missing rows
df.dropna(axis=0, inplace=True)

# Map tweet categories
df['category'] = df['category'].map({-1.0:'Negative', 0.0:'Neutral', 1.0:'Positive'})
# Output first five rows
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,Negative
1,talk all the nonsense and continue all the dra...,Neutral
2,what did just say vote for modi welcome bjp t...,Positive
3,asking his supporters prefix chowkidar their n...,Positive
4,answer who among these the most powerful world...,Positive


In [5]:
import re    # RegEx for removing non-letter characters

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import *


def tweet_to_words(tweet):
    ''' Convert tweet text into a sequence of words '''
    
    # convert to lowercase
    text = tweet.lower()
    # remove non letters
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    # tokenize
    words = text.split()
    # remove stopwords
    words = [w for w in words if w not in stopwords.words("english")]
    # apply stemming
    words = [PorterStemmer().stem(w) for w in words]
    # return list
    return words

print("\nOriginal tweet ->", df['clean_text'][0])
print("\nProcessed tweet ->", tweet_to_words(df['clean_text'][0]))


Original tweet -> when modi promised “minimum government maximum governance” expected him begin the difficult job reforming the state why does take years get justice state should and not business and should exit psus and temples

Processed tweet -> ['modi', 'promis', 'minimum', 'govern', 'maximum', 'govern', 'expect', 'begin', 'difficult', 'job', 'reform', 'state', 'take', 'year', 'get', 'justic', 'state', 'busi', 'exit', 'psu', 'templ']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ivyha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Train data

### Create Function to Train Data

In [None]:
# Apply data processing to each tweet
X = list(map(tweet_to_words, df['clean_text']))

max_words = 5000
max_len=50

def tokenize_pad_sequences(text):
    '''
    This function tokenize the input text into sequnences of intergers and then
    pad each sequence to the same length
    '''
    # Text tokenization
    tokenizer = Tokenizer(num_words=max_words, lower=True, split=' ')
    tokenizer.fit_on_texts(text)
    # Transforms text to a sequence of integers
    X = tokenizer.texts_to_sequences(text)
    # Pad sequences to the same length
    X = pad_sequences(X, padding='post', maxlen=max_len)
    # return sequences
    return X, tokenizer

print('Before Tokenization & Padding \n', df['clean_text'][0])
X, tokenizer = tokenize_pad_sequences(df['clean_text'])
print('After Tokenization & Padding \n', X[0])

### Execute Data Training

In [7]:
# Convert categorical variable into dummy/indicator variables.
y = pd.get_dummies(df['category'])
# Train and Test split
X_train, X_test,y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
# Extracting validation set from the train set
valid_size=1000
X_valid, y_valid = X_train[-valid_size:], y_train[-valid_size:]
X_test, y_test = X_train[:-valid_size], y_train[:-valid_size]

print('Train Set ->', X_train.shape, y_train.shape)
print('Validation Set ->', X_valid.shape, y_valid.shape)
print('Test Set ->', X_test.shape, y_test.shape)

Train Set -> (114078, 50) (114078, 3)
Validation Set -> (1000, 50) (1000, 3)
Test Set -> (113078, 50) (113078, 3)


### Create Function to Calculate Accuracy

In [8]:
import keras.backend as K

def f1_score(precision, recall):
    ''' Function to calculate f1 score '''
    
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val*100

### Calculate Accuracy

#### Precision = True Positive/(True Positive + False Positive)

Ratio of correctly predicted positive observations to the total predicted positive observations.

#### Recall = True Positive/(True Positive + False Negative)

Ratio of correctly predicted positive observations to all the actual positive observations.

#### Accuracy = (True Positive + True Negative)/(True Positive + False Positive + False Negative + True Negative)

A ratio of correctly predicted observation to the total observations.

#### F1 Score = 2*((Precision*Recall)/(Precision + Recall))

Weighted average of Precision and Recall. Needed when you want to seek balance between Precision and Recall.  

In [9]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout
from keras.metrics import Precision, Recall

vocab_size = 5000
embedding_size = 32

# Build model
model3 = Sequential()
model3.add(Embedding(vocab_size, embedding_size, input_length=max_len))
model3.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model3.add(MaxPooling1D(pool_size=2))
model3.add(Bidirectional(LSTM(32)))
model3.add(Dropout(0.4))
model3.add(Dense(3, activation='softmax'))

print(model3.summary())

# Compile model
model3.compile(loss='categorical_crossentropy', optimizer='adam', 
               metrics=['accuracy', Precision(), Recall()])

# Train model
num_epochs = 1
batch_size = 32
history3 = model3.fit(X_train, y_train,
                      validation_data=(X_valid, y_valid),
                      batch_size=batch_size, epochs=num_epochs)

# Evaluate model on the test set
loss, accuracy, precision, recall = model3.evaluate(X_test, y_test, verbose=0)
# Print metrics
print('')
print('CNN + LSTM Accuracy  : {:.2f}'.format(100 * accuracy), '%')
print('CNN + LSTM Precision : {:.2f}'.format(100 * precision), '%')
print('CNN + LSTM Recall    : {:.2f}'.format(100 * recall), '%')
print('CNN + LSTM F1 Score  : {:.2f}'.format(f1_score(precision, recall)), '%')

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 32)            160000    
_________________________________________________________________
conv1d (Conv1D)              (None, 50, 32)            3104      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 25, 32)            0         
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                16640     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 3)                 195       
Total params: 179,939
Trainable params: 179,939
Non-trainable params: 0
__________________________________________________

# Create LSTM Function to Execute Trained Model for Sentiment Analysis

In [11]:
def predict_class(text):
    '''Function to predict sentiment class of the passed text'''
    
    sentiment_classes = ['Negative', 'Neutral', 'Positive']
    max_len=50
    
    # Transforms text to a sequence of integers using a tokenizer object
    xt = tokenizer.texts_to_sequences(text)
    # Pad sequences to the same length
    xt = pad_sequences(xt, padding='post', maxlen=max_len)
    # Do the prediction using the loaded model
    yt = model3.predict(xt).argmax(axis=1)
    # Print the predicted sentiment
    # Print('The predicted sentiment is', sentiment_classes[yt[0]])  
    return sentiment_classes[yt[0]]

In [12]:
def sentiment_text(df):
    sent = []

    for t in df['text']: 
        sent.append(predict_class([t]))

    df['sentiment'] = sent      

# Execute LSTM Sentiment Analysis

Sentiment analysis will be run on bcovid, acovid, and all dataframes. 
This also applies for all visualizations done later. 

In [55]:
sentiment_text(df_clean_cat_bcovid)
sentiment_text(df_clean_cat_acovid)
sentiment_text(df_clean_cat_all)

# Visualize Sentiment Frequncy

### Bus

In [None]:
# Before Covid

sns.set_theme(style = 'darkgrid')
ax = sns.countplot(x = 'sentiment', data = df_clean_cat_bcovid[df_clean_cat_bcovid['category'] == 'bus'],
                   order = ['Negative', 'Neutral', 'Positive'], palette = ['#ad5353', '#5985b5', '#87c993'])
ax.set_title('Sentiments Before Covid')
sns.color_palette('pastel')

for container in ax.containers:
    ax.bar_label(container)


In [None]:
# After Covid

sns.set_theme(style = 'darkgrid')
ax = sns.countplot(x = 'sentiment', data = df_clean_cat_acovid[df_clean_cat_acovid['category'] == 'bus'],
                   order = ['Negative', 'Neutral', 'Positive'], palette = ['#ad5353', '#5985b5', '#87c993'])
ax.set_title('Sentiments After Covid')

for container in ax.containers:
    ax.bar_label(container)

In [None]:
# All

sns.set_theme(style = 'darkgrid')
ax = sns.countplot(x = 'sentiment', data = df_clean_cat_all[df_clean_cat_all['category'] == 'bus'],
                   order = ['Negative', 'Neutral', 'Positive'], palette = ['#ad5353', '#5985b5', '#87c993'])
ax.set_title('Sentiments After Covid')

for container in ax.containers:
    ax.bar_label(container)

### MRT

In [None]:
# Before Covid

sns.set_theme(style = 'darkgrid')
ax = sns.countplot(x = 'sentiment', data = df_clean_cat_bcovid[df_clean_cat_bcovid['category'] == 'mrt'],
                   order = ['Negative', 'Neutral', 'Positive'], palette = ['#ad5353', '#5985b5', '#87c993'])
ax.set_title('Sentiments Before Covid')

for container in ax.containers:
    ax.bar_label(container)

In [None]:
# After Covid

sns.set_theme(style = 'darkgrid')
ax = sns.countplot(x = 'sentiment', data = df_clean_cat_acovid[df_clean_cat_acovid['category'] == 'mrt'],
                   order = ['Negative', 'Neutral', 'Positive'], palette = ['#ad5353', '#5985b5', '#87c993'])
ax.set_title('Sentiments After Covid')

for container in ax.containers:
    ax.bar_label(container)

In [None]:
# All

sns.set_theme(style = 'darkgrid')
ax = sns.countplot(x = 'sentiment', data = df_clean_cat_all[df_clean_cat_all['category'] == 'mrt'],
                   order = ['Negative', 'Neutral', 'Positive'], palette = ['#ad5353', '#5985b5', '#87c993'])
ax.set_title('Sentiments After Covid')

for container in ax.containers:
    ax.bar_label(container)

### Taxi

In [None]:
# Before Covid

sns.set_theme(style = 'darkgrid')
ax = sns.countplot(x = 'sentiment', data = df_clean_cat_bcovid[df_clean_cat_bcovid['category'] == 'taxi'],
                   order = ['Negative', 'Neutral', 'Positive'], palette = ['#ad5353', '#5985b5', '#87c993'])
ax.set_title('Sentiments Before Covid')

for container in ax.containers:
    ax.bar_label(container)

In [None]:
# After Covid

sns.set_theme(style = 'darkgrid')
ax = sns.countplot(x = 'sentiment', data = df_clean_cat_acovid[df_clean_cat_acovid['category'] == 'taxi'],
                   order = ['Negative', 'Neutral', 'Positive'], palette = ['#ad5353', '#5985b5', '#87c993'])
ax.set_title('Sentiments After Covid')

for container in ax.containers:
    ax.bar_label(container)

In [None]:
# All

sns.set_theme(style = 'darkgrid')
ax = sns.countplot(x = 'sentiment', data = df_clean_cat_all[df_clean_cat_all['category'] == 'taxi'],
                   order = ['Negative', 'Neutral', 'Positive'], palette = ['#ad5353', '#5985b5', '#87c993'])
ax.set_title('Sentiments After Covid')

for container in ax.containers:
    ax.bar_label(container)

### Private Hire

In [None]:
# Before Covid

sns.set_theme(style = 'darkgrid')
ax = sns.countplot(x = 'sentiment', data = df_clean_cat_bcovid[df_clean_cat_bcovid['category'] == 'private hire'],
                   order = ['Negative', 'Neutral', 'Positive'], palette = ['#ad5353', '#5985b5', '#87c993'])
ax.set_title('Sentiments Before Covid')

for container in ax.containers:
    ax.bar_label(container)

In [None]:
# After Covid

sns.set_theme(style = 'darkgrid')
ax = sns.countplot(x = 'sentiment', data = df_clean_cat_acovid[df_clean_cat_acovid['category'] == 'private hire'],
                   order = ['Negative', 'Neutral', 'Positive'], palette = ['#ad5353', '#5985b5', '#87c993'])
ax.set_title('Sentiments After Covid')

for container in ax.containers:
    ax.bar_label(container)

In [None]:
# All

sns.set_theme(style = 'darkgrid')
ax = sns.countplot(x = 'sentiment', data = df_clean_cat_all[df_clean_cat_all['category'] == 'private hire'],
                   order = ['Negative', 'Neutral', 'Positive'], palette = ['#ad5353', '#5985b5', '#87c993'])
ax.set_title('Sentiments After Covid')

for container in ax.containers:
    ax.bar_label(container)

### Car Rental

In [None]:
# Before Covid

sns.set_theme(style = 'darkgrid')
ax = sns.countplot(x = 'sentiment', data = df_clean_cat_bcovid[df_clean_cat_bcovid['category'] == 'car rental'],
                   order = ['Negative', 'Neutral', 'Positive'], palette = ['#ad5353', '#5985b5', '#87c993'])
ax.set_title('Sentiments Before Covid')

for container in ax.containers:
    ax.bar_label(container)

In [None]:
# After Covid

sns.set_theme(style = 'darkgrid')
ax = sns.countplot(x = 'sentiment', data = df_clean_cat_acovid[df_clean_cat_acovid['category'] == 'car rental'],
                   order = ['Negative', 'Neutral', 'Positive'], palette = ['#ad5353', '#5985b5', '#87c993'])
ax.set_title('Sentiments After Covid')

for container in ax.containers:
    ax.bar_label(container)

In [None]:
# All

sns.set_theme(style = 'darkgrid')
ax = sns.countplot(x = 'sentiment', data = df_clean_cat_all[df_clean_cat_all['category'] == 'car rental'],
                   order = ['Negative', 'Neutral', 'Positive'], palette = ['#ad5353', '#5985b5', '#87c993'])
ax.set_title('Sentiments After Covid')

for container in ax.containers:
    ax.bar_label(container)

# Preview Negative Discussions

Here, we will preview the negatives discussions by looking at the top few discussions and looking at the word cloud of negatives discussions.

## Create Functions

In [24]:
def wc_neg(data, stopword, cat):
    t = data[data['sentiment']=='Negative']
    p = t[t['category'] == cat]

    message = p['text'].tolist()
    
    text = " ".join(list(message))

    wc = WordCloud(stopwords = stopword, background_color='white').generate(text)

    plt.imshow(wc, interpolation = 'bilinear')
    plt.axis('off')
    plt.show()
    print('\n') 

In [26]:
stopwords = set(STOPWORDS)
stopwords.update(['mrt', 'station', 'see', 'bus', 'buses', 'line', 'account', 'one', 'full',
                  'get', 'min', 'class', 'year', 'pm', '', 'take', 'taxi', 'allow', 'last', 'almost',
                  'post', 'start', 'cross', 'car_rental', 'comfortdelgro', 'singapore', 'grab',
                  'night', 'find', 'need', 'build', 'photo', 'video', 'leave', 'car', 'train', 
                  'think', 'food', 'make', 'set', 'fresh', 'sbs', 'driver', 'delgro', 'comfort',  
                  'move',  'ever', 'blue', 'uber', 'go', 'look', 'use', 'also', 'give', 
                  'many', 'come', 'lot', 'seem', 'guess', 'definitely', 'sure', 'keep', 'much', 'already', 
                  'do', 'lol', 'people', 'well', 'back', 'week', 'u', 'want', 'day', 'will', 'know', 
                  'even', 'really', 'said', 'say', 'cab', 'public transport', 'taxis', 'public', 
                  'transport', 'thing', 'still', 'got', 'now', 's', 'stop', 'around', 'another', 'stations',
                  'smrt', 'next', 'us', 'may', 'person', 'years', 'going', 'trains', 'way', 'etc', 'makes',
                  'seat', 'guy', 'https', 'always', 'riders', 'cabbie', 'sg', 'drivers', 'auntie', 'man', 
                  'uncle', 'stops', 'someone', 'something', 'andy', 'cabs', 'cabbies', 'order', 'delivery', 
                  'gojek', 'quite','fucking', 'every', 'getting', 'trying', 'told', 'something', 'singaporean', 
                  'feel', 'lta', 'fuck', 'without', 'let', 'made', 'getting', 'passenger', 'restaurant', 'grabfood', 'door', 
                  'merchant', 'don', 't', 'customer', 'cars', 'actually', 'senior', 'entrepreneur', 'rental', 'cars', 'current'])


## Execute Functions to Preview Discussions

### Bus

In [None]:
# Before Covid

t = df_clean_cat_bcovid[df_clean_cat_bcovid['sentiment']=='Negative']
p = t[t['category'] == 'bus']

print('===========================Bus Before Covid Title===========================')
for index,row in p[:5].iterrows():
    print(row['text'])
wc_neg(df_clean_cat_bcovid, stopwords, 'bus')

In [None]:
# After Covid

t = df_clean_cat_acovid[df_clean_cat_acovid['sentiment']=='Negative']
p = t[t['category'] == 'bus']

print('===========================Bus After Covid Title===========================')
for index,row in p[:5].iterrows():
    print(row['text'])
wc_neg(df_clean_cat_acovid, stopwords, 'bus')

In [None]:
# All

t = df_clean_cat_all[df_clean_cat_all['sentiment']=='Negative']
p = t[t['category'] == 'bus']

print('===========================Bus Title===========================')
for index,row in p[:5].iterrows():
    print(row['text'])
wc_neg(df_clean_cat_all, stopwords, 'bus')

### MRT

In [None]:
# Before Covid

t = df_clean_cat_bcovid[df_clean_cat_bcovid['sentiment']=='Negative']
p = t[t['category'] == 'mrt']

print('===========================MRT Before Covid Title===========================')
for index,row in p[:5].iterrows():
    print(row['text'])
wc_neg(df_clean_cat_bcovid, stopwords, 'mrt')

In [None]:
# After Covid

t = df_clean_cat_acovid[df_clean_cat_acovid['sentiment']=='Negative']
p = t[t['category'] == 'mrt']

print('===========================MRT After Covid Title===========================')
for index,row in p[:5].iterrows():
    print(row['text'])
wc_neg(df_clean_cat_acovid, stopwords, 'mrt')

In [None]:
# All

t = df_clean_cat_all[df_clean_cat_all['sentiment']=='Negative']
p = t[t['category'] == 'mrt']

print('===========================MRT Title===========================')
for index,row in p[:5].iterrows():
    print(row['text'])
wc_neg(df_clean_cat_all, stopwords, 'mrt')

### Taxi

In [None]:
# Before Covid

t = df_clean_cat_bcovid[df_clean_cat_bcovid['sentiment']=='Negative']
p = t[t['category'] == 'taxi']

print('===========================Taxi Before Covid Title===========================')
for index,row in p[:5].iterrows():
    print(row['text'])
wc_neg(df_clean_cat_bcovid, stopwords, 'taxi')

In [None]:
# After Covid

t = df_clean_cat_acovid[df_clean_cat_acovid['sentiment']=='Negative']
p = t[t['category'] == 'taxi']

print('===========================Taxi After Covid Title===========================')
for index,row in p[:5].iterrows():
    print(row['text'])
wc_neg(df_clean_cat_acovid, stopwords, 'taxi')

In [None]:
# All

t = df_clean_cat_all[df_clean_cat_all['sentiment']=='Negative']
p = t[t['category'] == 'taxi']

print('===========================Taxi Title===========================')
for index,row in p[:5].iterrows():
    print(row['text'])
wc_neg(df_clean_cat_all, stopwords, 'taxi')

### Private Hire

In [None]:
# Before Covid

t = df_clean_cat_bcovid[df_clean_cat_bcovid['sentiment']=='Negative']
p = t[t['category'] == 'private hire']

print('===========================Private Hire Before Covid Title===========================')
for index,row in p[:5].iterrows():
    print(row['text'])
wc_neg(df_clean_cat_bcovid, stopwords, 'private hire')

In [None]:
# After Covid

t = df_clean_cat_acovid[df_clean_cat_acovid['sentiment']=='Negative']
p = t[t['category'] == 'private hire']

print('===========================Private Hire After Covid Title===========================')
for index,row in p[:5].iterrows():
    print(row['text'])
wc_neg(df_clean_cat_acovid, stopwords, 'private hire')

In [None]:
# All

t = df_clean_cat_all[df_clean_cat_all['sentiment']=='Negative']
p = t[t['category'] == 'private hire']

print('===========================Private Hire Title===========================')
for index,row in p[:5].iterrows():
    print(row['text'])
wc_neg(df_clean_cat_all, stopwords, 'private hire')

### Car Rental

In [None]:
# Before Covid

t = df_clean_cat_bcovid[df_clean_cat_bcovid['sentiment']=='Negative']
p = t[t['category'] == 'car rental']

print('===========================Private Hire Before Covid Title===========================')
for index,row in p[:5].iterrows():
    print(row['text'])
wc_neg(df_clean_cat_bcovid, stopwords, 'car rental')

In [None]:
# After Covid

t = df_clean_cat_acovid[df_clean_cat_acovid['sentiment']=='Negative']
p = t[t['category'] == 'car rental']

print('===========================Private Hire After Covid Title===========================')
for index,row in p[:5].iterrows():
    print(row['text'])
wc_neg(df_clean_cat_acovid, stopwords, 'car rental')

In [None]:
# All

t = df_clean_cat_all[df_clean_cat_all['sentiment']=='Negative']
p = t[t['category'] == 'car rental']

print('===========================Private Hire Title===========================')
for index,row in p[:5].iterrows():
    print(row['text'])
wc_neg(df_clean_cat_all, stopwords, 'car rental')

# Export sentiment data

In [37]:
# The code below will export the sentiment dataframes for topic modelling
# Do edit the code accordingly to state your preferred path file

df_clean_cat_bcovid.to_csv(r'df_clean_cat_bcovid_sent.csv', index = False, encoding = 'utf-8-sig' )
df_clean_cat_acovid.to_csv(r'df_clean_cat_acovid_sent.csv', index = False, encoding = 'utf-8-sig')
df_clean_cat_all.to_csv(r'df_clean_cat_all_sent.csv', index = False, encoding = 'utf-8-sig')

# Export data needed for Dashboard

Here, we will prepare the data for dashboard purposes.

We will remove the stopwords in df_clean_cat_all dataframe before it will be loaded into PowerBI.

In [53]:
def df_wc_no_stopwords(data, stopwords):
    output = []
    output_df = data
    #t = data[data['sentiment']=='Negative']
    #p = t[t['category'] == cat]
    
    for text in data['text']:
        s_text = text.split()
        result_text = [word for word in s_text if word.lower() not in stopwords]
        result = ' '.join(result_text)
        output.append(result)

    output_df['text'] = pd.Series(output)

    return output_df

In [56]:
df_clean_cat_all_export = df_wc_no_stopwords(df_clean_cat_all, stopwords)

In [60]:
# The following code will export the file for dashboard
# Do change the code accordingly to your preferred path file

df_clean_cat_all_export.to_csv(r'df_clean_cat_all_export.csv', index = False)