# Project: Bank Sentiment Analysis 

## Summary 
Identify customer sentiment of the top 5 Banks in South Africa via twitter sentiment analysis scoring
#### Operations:
1. Twint to scrape tweets of the top 5 banks in South Africa \ 
*  Standard Bank
* Nedbank 
* Absa 
* FNB 
* Capitec

2. Clean tweets with WordPunctTokenizer and Regex 
3. TextBlob to process sentiment of tweets 
4. Matplotlib / Seaborn to visualise and analyze

#### Project 2:
Create a custom model to identify Sentiment Analysis 

#### Project 3:
Compare results to the Customer Satisfaction Index (CSI) and determine if the CSI is a correct reflection of the consumer sentiment
-> CSI is obtained by ____

### Important Note 
This notebook is for analysis and confirmation of the process (the POC).\
The full custom-built sentiment model is in a further notebook found on my GitHub: https://github.com/Slyth3

### Twint  guide

<b>My reference guide: </b>
https://github.com/Slyth3/Twitter_NLP/blob/main/Quick%20Twint%20Code.txt

<b> Official Github: </b>
https://github.com/twintproject/twint

In [None]:
import twint
import pandas as pd
import nest_asyncio             
import matplotlib.pyplot as plt

#optional: for reading and concatenation of previous files
import glob                     
import os

import numpy as np
import datetime as dt
import seaborn as sns

#cleaning
import re
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords             

# Sentiment Analysis
from textblob import TextBlob

#word cloud
from wordcloud import WordCloud

### Configure and run Twint (Twitter scraper)

In [None]:
#for compatibility issues with twint
nest_asyncio.apply()  

In [None]:
# Twitter search query per bank: the key is the label later written to the
# "Bank" column, the value is the query string handed to twint.
# Twitter only treats an upper-case "OR" as the boolean operator; a lower-case
# "or" is searched as an ordinary word, so the Capitec query uses "OR" throughout.
bank_search = {
    "FNB": "FNBSA",
    "StandardBank": "StandardBankZA OR \"Standard Bank\" OR \"standard bank\"",
    "Nedbank": "Nedbank OR nedbank",
    "ABSA": "Absa OR ABSA OR absa OR AbsaSouthAfrica",
    "Capitec": "CapitecBankSA OR Capitec OR capitec",
}

In [None]:
def twintConfig(date_from,date_to, search_string):
    """Configure a twint search for one bank and run it.

    date_from / date_to are assigned to twint's Since / Until settings.
    search_string is indexed with [1] to obtain the query text; Run_Twint
    passes a (bank label, query) pair taken from bank_search.items().

    Nothing is returned: the Pandas flags are switched on so that the caller
    can read the result back from twint's pandas storage.
    """
    config = twint.Config()

    # pandas storage options
    config.Pandas = True
    config.Pandas_au = True
    config.Pandas_clean = True

    # search window and query text
    config.Since, config.Until = date_from, date_to
    config.Search = search_string[1]

    #config.Hide_output = True
    #config.Resume = "./ResumeID/resume_id_"+search_string[0]+".txt"
    twint.run.Search(config)

### Run twint

In [None]:
# Date window for the scrape; Run_Twint reads these two notebook-level names.
# The values are kept as the raw strings typed by the user (no validation here).
since = input("Input a start date eg 2021-09-17: ")
until = input("Input an end date eg 2021-09-18: ")

In [None]:
def Run_Twint(search_vals):
    """Scrape tweets for every bank in *search_vals* and stack them in one frame.

    search_vals maps a bank label to its Twitter query string.
    The date window comes from the notebook-level `since` / `until` values.

    Returns a DataFrame with all scraped tweets plus a "Bank" column holding
    the label; an empty DataFrame when search_vals is empty.
    """
    frames = []

    for bank in search_vals.items():
        bank_name, query = bank
        print ("running for search item: "+bank_name+"\n")
        print ("Search string: "+query+"\n")

        #run twint
        twintConfig(since,until, bank)

        # assign() returns a labelled copy, so twint's module-level result
        # frame is left untouched
        frames.append(twint.storage.panda.Tweets_df.assign(Bank=bank_name))

    # pd.concat rejects an empty list, so keep the "nothing to search" case explicit
    if not frames:
        return pd.DataFrame()

    # one concat at the end instead of re-copying the accumulated rows per bank
    return pd.concat(frames)

In [None]:
tweets_df = Run_Twint(bank_search)

# precleaning

In [None]:
#Import/export file
#base_tweets.to_csv("./Output/pre_cleaning____.csv")
# NOTE(review): this rebinds tweets_df to a previously saved pickle, so the frame
# returned by Run_Twint earlier in the notebook is discarded. The path is an
# absolute path on the author's machine and must be adapted before running elsewhere.
# Only load pickle files you created yourself: unpickling can execute arbitrary code.
tweets_df = pd.read_pickle(r"C:\Users\andre\OneDrive\Desktop\Personal\Data Science\Projects\Bank_NLP_Twitter_POC - Copy\Output\precleaning_2021.pickle")

In [None]:
tweets_df.shape

In [None]:
# Drop the scraper columns that are not used in the analysis.
# errors="ignore": a column that is absent from the loaded frame (e.g.
# "Unnamed: 0", which typically only appears after a CSV round trip) must not
# abort the cell with a KeyError.
tweets_df = tweets_df.drop(
    columns=[
        "Unnamed: 0", 'created_at', 'user_id_str', 'link', 'urls', 'photos', 'video',
        'thumbnail', 'retweet', 'nreplies', 'nretweets', 'quote_url', 'near', 'geo',
        'source', 'user_rt_id', 'user_rt', 'retweet_id', 'reply_to', 'retweet_date',
        'translate', 'trans_src', 'trans_dest',
    ],
    errors="ignore",
)

In [None]:
tweets_df.head(2)

#### Language analysis 

Although the language tag doesn't seem to get it right 100% of the time, we will drop the rows that aren't English but keep the undefined ones:
* und = undefined --- this will also include tweets with only hashtags so we will keep this
* en = english 

In [None]:
tweets_df["language"].unique()

In [None]:
# keep only the rows tagged English ('en') or undefined ('und')
keep_language = tweets_df["language"].isin(['und', 'en'])
tweets_df = tweets_df.loc[keep_language]

#### Remove unnecessary rows 
* Remove tweets from Bank owned accounts i.e. FNBSA
* Remove duplicates where tweet, bank and date are the same 
* Reindex dataframe

In [None]:
# remove rows where username is in bank_search
# NOTE(review): this is a case-insensitive substring match on the username, so
# any account whose handle merely contains one of these fragments (not only the
# banks' own accounts) is removed as well - confirm this is acceptable.
tweets_df = tweets_df[ ~tweets_df["username"].str.lower().str.contains('fnb|standardbank|nedbank|absa|capitec',regex = True)]

In [None]:
#Drop duplicated tweets 
# Rows count as duplicates only when date, tweet text and Bank all match;
# the first occurrence is kept.
tweets_df = tweets_df.drop_duplicates(subset=['date',"tweet","Bank"],keep="first")

In [None]:
# reset the index for visualisation later
# drop=True discards the old index directly instead of first materialising it
# as an "index" column and then deleting that column; it also keeps working
# when the old index is named or a column called "index" already exists.
tweets_df = tweets_df.reset_index(drop=True)

In [None]:
len(tweets_df)

### Cleaning tweet data 
* Remove punctuation, hashtags, symbols etc

***Note:*** 
Cleaning will take a very long time depending on the size of your data and your processing speed \
In order to do parallel processing download the file (https://github.com/Slyth3/Sentiment-analysis-on-South-African-Banks/blob/main/multi_clean.py) \
Then import this file and run using the below:
* import multi_clean
* import multiprocessing as mp
* from multiprocessing import Pool
* p = mp.Pool(mp.cpu_count())
* cleaned_list = p.map(multi_clean.clean_text,base_tweets["tweet"])
* p.close()

#### Tweet cleaning 

In [None]:
import html

# Everything that is stripped from a tweet, tried in this order at each position.
_TWEET_NOISE = re.compile(
    r"@\S+"              # @mentions (stop at any whitespace, not only at a space)
    r"|https?://\S+"     # links, including '-', '_', '?', '=' etc. in the URL
    r"|'s"               # floating s's
    r"|\#\w+"            # hashtags
    r"|[^A-Za-z\s]"      # any remaining non-alphabet character
)


def clean_text(text):
    """Return *text* lower-cased with mentions, links, hashtags and symbols removed.

    HTML entities such as ``&amp;`` are decoded first, so that they end up as
    plain symbols (which are stripped) rather than leaving the entity name
    ("amp") behind as a word. Removed pieces are deleted, not replaced by a
    space, and leading/trailing whitespace is trimmed.
    """
    decoded = html.unescape(text)
    return _TWEET_NOISE.sub("", decoded).lower().strip()

In [None]:
%%time
tweets_df["cleaned_tweet"] = tweets_df["tweet"].apply(clean_text)

In [None]:
#drop empty rows
# NOTE(review): the test is on the raw "tweet" column, not on "cleaned_tweet",
# so rows whose text became empty during cleaning are still kept - confirm
# that this is intended.
tweets_df = tweets_df [ ~(tweets_df["tweet"] =="")]

In [None]:
tweets_df["cleaned_tweet"].head()

## Sentiment analysis (TextBlob)

In [None]:
%%time
print("Running sentiment process")

# Score every cleaned tweet once with TextBlob; each result is a
# (polarity, subjectivity) pair.
sentiments = [TextBlob(text).sentiment for text in tweets_df["cleaned_tweet"]]

# Whole-column assignment replaces the per-row .at writes: it is much faster
# and does not depend on the index labels being unique.
tweets_df["polarity"] = [score[0] for score in sentiments]
tweets_df["subjectivity"] = [score[1] for score in sentiments]

#Create Positive / negative column depending on polarity (exactly 0 is Neutral)
tweets_df["Sentiment"] = np.select(
    [tweets_df["polarity"] > 0, tweets_df["polarity"] < 0],
    ["Positive", "Negative"],
    default="Neutral",
)

In [None]:
tweets_df[["cleaned_tweet","polarity","Sentiment"]].head(5)

In [None]:
# Import / Export
#tweets_df = pd.read_pickle(r"C:\Users\andre\OneDrive\Desktop\Personal\Data Science\Projects\Bank_NLP_Twitter_POC - Copy\Output\Final_2021.pickle")
#tweets_df.to_pickle("Final_2021.pickle")

## Create rolling Mean / Expanding 

In [None]:
# Parse the date column, order the rows chronologically and use the dates as
# index so as to create the rolling mean
tweets_df = (
    tweets_df
    .assign(date=pd.to_datetime(tweets_df["date"]))
    .sort_values("date")
    .set_index("date")
)

In [None]:
#Create bank Dataframes 
# One frame per bank label. .copy() makes each an independent frame, so the
# columns added to it later are plain assignments rather than writes to a
# slice of tweets_df.
Standard_df = tweets_df[tweets_df.Bank=="StandardBank"].copy()
FNB_df = tweets_df[tweets_df.Bank=="FNB"].copy()
Nedbank_df = tweets_df[tweets_df.Bank=="Nedbank"].copy()
ABSA_df = tweets_df[tweets_df.Bank=="ABSA"].copy()
# Capitec was missing here although the rolling-mean cell uses Cap_df
Cap_df = tweets_df[tweets_df.Bank=="Capitec"].copy()

#### Get rolling/ expanding mean 

In [None]:
# stop this warning as the chaining is fine
# (None switches pandas' chained-assignment check off from here on)
pd.options.mode.chained_assignment = None 

# 'mean'    = expanding mean: average polarity over all rows up to and including the current one
# 'rolling' = average polarity over a trailing "7d" window of the date index
Nedbank_df['mean'] = Nedbank_df['polarity'].expanding().mean()
Nedbank_df['rolling'] = Nedbank_df['polarity'].rolling("7d").mean()

In [None]:
# For every bank frame: 'mean' is the expanding (running) mean of polarity,
# 'rolling' is the mean of polarity over a trailing "7d" window of the index.

#Std Bank
Standard_df['mean'] = Standard_df['polarity'].expanding().mean()
Standard_df['rolling'] = Standard_df['polarity'].rolling("7d").mean()

#FNB
FNB_df['mean'] = FNB_df['polarity'].expanding().mean()
FNB_df['rolling'] = FNB_df['polarity'].rolling("7d").mean()

#Nedbank (recomputes the same two columns with the same expressions as the previous cell)
Nedbank_df['mean'] = Nedbank_df['polarity'].expanding().mean()
Nedbank_df['rolling'] = Nedbank_df['polarity'].rolling("7d").mean()

#ABSA
ABSA_df['mean'] = ABSA_df['polarity'].expanding().mean()
ABSA_df['rolling'] = ABSA_df['polarity'].rolling("7d").mean()

#Capitec
# NOTE(review): Cap_df has to be created together with the other per-bank
# frames before this cell runs.
Cap_df['mean'] = Cap_df['polarity'].expanding().mean()
Cap_df['rolling'] = Cap_df['polarity'].rolling("7d").mean()

## Analysis and visualisation

In [None]:
#install additional libraries for visualisation 
import ast #optional
from collections import Counter

import cufflinks as cf
from plotly.offline import init_notebook_mode #, plot, iplot, download_plotlyjs
init_notebook_mode(connected = True)
cf.go_offline()

import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Set Pallette 
sns.set_theme()
pal = {"FNB":'c', "StandardBank":"b","ABSA":"r","Nedbank":"g", "Capitec": "grey"}

In [None]:
# Side-by-side overview per bank: number of tweets (left), summed likes (right)
fig, ax = plt.subplots(1,2, figsize= (15,5))

sns.countplot(ax = ax[0], x= tweets_df["Bank"], palette= pal)
ax[0].set_title("Count of tweets")

# target the right-hand axes explicitly instead of relying on matplotlib's
# implicit "current axes"
sns.barplot(ax = ax[1], data =tweets_df, x = "Bank" ,y = "nlikes",estimator=np.sum,ci=None, palette=pal)
ax[1].set_title("Count of likes")

plt.tight_layout()
plt.show()

In [None]:
# number of tweets
tweets_df[["cleaned_tweet","Bank"]].groupby(["Bank"]).count().transpose()

In [None]:
# Stacked histogram: number of tweets per sentiment label, split by bank
plt.figure(figsize=(12,7))
stacked_hist_options = dict(x="Sentiment", hue="Bank", palette=pal, multiple="stack", alpha=1)
sns.histplot(tweets_df, **stacked_hist_options)
plt.title("Count of tweets by sentiment",fontsize =15)

plt.tight_layout()
plt.show()

In [None]:
# One panel per bank (two panels per row): number of tweets per sentiment label
sentiment_grid_options = dict(
    x="Sentiment",
    col="Bank",
    col_wrap=2,
    hue="Bank",
    legend=False,
    palette=pal,
)
fig1 = sns.displot(tweets_df, **sentiment_grid_options)
fig1.fig.suptitle("Count of tweets by Sentiment",fontsize =15)

plt.tight_layout()
plt.show()

In [None]:
# Rows with a polarity of exactly 0 are left out of the distribution plot
non_zero_polarity = tweets_df[tweets_df['polarity'] != 0]

fig1 = sns.displot(
    data=non_zero_polarity,
    x="polarity",
    col="Bank", col_wrap=2,
    hue="Bank", palette=pal, legend=False,
    kde=True, bins=30,
)
fig1.fig.suptitle("Distribution of Sentiment scores(polarity)",fontsize =15 )

plt.tight_layout()
plt.show()

# Hashtag analysis 

In [None]:
#get all hashtags as list
def hashlist(df):
    """Return every hashtag found in ``df['hashtags']`` as one flat list.

    Each entry may be either a real list of hashtags or the string
    representation of such a list (which is what a CSV import yields);
    strings are parsed with ast.literal_eval, lists are used as they are.
    """
    tags = []
    for entry in df['hashtags']:
        if isinstance(entry, str):
            entry = ast.literal_eval(entry)
        tags.extend(entry)
    return tags

In [None]:
#Create hashtag dataframes
# Column 0 = hashtag, column 1 = number of occurrences, most frequent first.
# Counter.most_common() already returns the pairs in that order, and naming the
# columns explicitly keeps a bank without any hashtag from raising a KeyError.
hash_Absa = pd.DataFrame(Counter(hashlist(ABSA_df)).most_common(), columns=[0, 1])
hash_NedBank = pd.DataFrame(Counter(hashlist(Nedbank_df)).most_common(), columns=[0, 1])
hash_StdBank = pd.DataFrame(Counter(hashlist(Standard_df)).most_common(), columns=[0, 1])
hash_FNB = pd.DataFrame(Counter(hashlist(FNB_df)).most_common(), columns=[0, 1])

In [None]:
fig, ax = plt.subplots(2, 2,figsize=(15, 10))

plt.suptitle("Top 5 hashtags per bank")

# (hashtag counts, panel title, bar colour), listed in row-major panel order
hashtag_panels = [
    (hash_Absa, "ABSA", "r"),
    (hash_NedBank, "Nedbank", "g"),
    (hash_StdBank, "Standard Bank", "b"),
    (hash_FNB, "FNB", "c"),
]

for panel, (counts, panel_title, bar_colour) in zip(ax.flat, hashtag_panels):
    top_rows = counts.head()          # first five rows of the sorted counts
    panel.bar(top_rows[0], top_rows[1], color=bar_colour)
    panel.set_title(panel_title)
    panel.xaxis.set_tick_params(rotation=45, size=15)

plt.tight_layout()
plt.show()

# Word Cloud

In [None]:
# One long lower-cased string per bank: all cleaned tweets joined by single spaces
tweetString_a = " ".join(ABSA_df["cleaned_tweet"].tolist()).lower()
tweetString_n = " ".join(Nedbank_df["cleaned_tweet"].tolist()).lower()
tweetString_s = " ".join(Standard_df["cleaned_tweet"].tolist()).lower()
tweetString_f = " ".join(FNB_df["cleaned_tweet"].tolist()).lower()

In [None]:
#remove Bank name and set wordcloud

def _strip_and_cloud(text, pattern):
    """Delete every match of *pattern* from *text* and build the word cloud.

    Returns the stripped text together with the generated WordCloud.
    """
    stripped = re.sub(pattern, "", text)
    cloud = WordCloud(
                background_color ='white',
                min_font_size = 5).generate(stripped)
    return stripped, cloud


# "amp" is anchored with \b so that only the stand-alone token is deleted;
# as a bare substring it also cut pieces out of words such as "campaign" or
# "example". The bank-name alternatives are kept exactly as before.
tweetString_a, wordcloud_a = _strip_and_cloud(tweetString_a, r"absa|bank|\bamp\b")

tweetString_n, wordcloud_n = _strip_and_cloud(tweetString_n, r"NedBankSA|Nedbank|nedbank|bank|\bamp\b")

tweetString_s, wordcloud_s = _strip_and_cloud(tweetString_s, r"standardbankza|standard bank|bank|\bamp\b")

tweetString_f, wordcloud_f = _strip_and_cloud(tweetString_f, r"FNB|fnb|bank|\bamp\b")

In [None]:
fig, ax = plt.subplots(2,2,figsize=(14, 8),sharey=True)

# (word cloud, panel title) in row-major panel order
cloud_panels = [
    (wordcloud_s, "StandardBank"),
    (wordcloud_f, "FNB"),
    (wordcloud_n, "Nedbank"),
    (wordcloud_a, "ABSA"),
]

for panel, (cloud, panel_title) in zip(ax.flat, cloud_panels):
    panel.imshow(cloud)
    panel.axis("off")        # hide ticks and frame, the image is not a chart
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Overall mean sentiment by bank (bar height = seaborn's default estimator over polarity)
plt.figure(figsize=(10,5))
mean_bar_options = dict(x="Bank", y="polarity", palette=pal, ci=False)
sns.barplot(data=tweets_df, **mean_bar_options)
plt.title("Overall mean Sentiment by Bank")
plt.show()

# Rolling plots

#### Create an interactive plot

In [None]:
# functions to create our graphs
def trace_rolling_creation(df,gname, glinecolor):
    """Add the "rolling" column of *df* to the notebook-level ``fig`` as a line.

    The trace is named *gname*, drawn in *glinecolor* against the index of
    *df* and attached to the primary y-axis. Returns the result of
    ``fig.add_trace``.
    """
    rolling_line = go.Scatter(
        mode='lines',
        name=gname,
        line_color=glinecolor,
        x=df.index,
        y=df["rolling"],
    )
    return fig.add_trace(rolling_line, secondary_y=False)

def trace_count_creation(df,gname, glinecolor):
    """Add a filled trace with the rolling '7d' count of polarity values of *df*.

    The trace is named *gname*, drawn in *glinecolor*, filled down to zero and
    attached to the secondary y-axis of the notebook-level ``fig``. Returns
    the result of ``fig.add_trace``.
    """
    window_count = df["polarity"].rolling('7d').count()
    count_area = go.Scatter(
        name=gname,
        line_color=glinecolor,
        fill='tozeroy',
        x=df.index,
        y=window_count,
    )
    return fig.add_trace(count_area, secondary_y=True)

In [None]:
fig = go.Figure()

# one line per bank: (frame, legend name, line colour)
for bank_df, legend_name, line_colour in (
    (FNB_df, "FNB", "#19D3F3"),
    (Standard_df, "Standard Bank", "blue"),
    (ABSA_df, "ABSA", "red"),
    (Nedbank_df, "Nedbank", "green"),
):
    fig.add_scatter(
        x=bank_df.index,
        y=bank_df["rolling"],
        name=legend_name,
        mode='lines',
        line_color=line_colour,
    )

rolling_layout = dict(
    template="seaborn",
    title="Rolling 7 day Sentiment (polarity)",
    xaxis_title="Date",
    yaxis_title="7 day rolling polarity",
    yaxis_range=[-0.1,0.4],
    legend_title="Banks",
    font=dict(size=12),
    autosize=False,
    width=1000,
    height=600,
    margin=dict(l=10,r=10, b=50,t=50, pad=4),
)
# kept as the last expression of the cell so the notebook renders the figure
fig.update_layout(**rolling_layout)

In [None]:
# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces: rolling-sentiment lines first (primary axis) ...
for bank_df, legend_name, line_colour in (
    (ABSA_df, "ABSA", '#DC0E1A'),
    (Nedbank_df, "Nedbank", '#078a4d'),
    (Standard_df, "StdBank", '#054db3'),
    (FNB_df, "FNB", '#19D3F3'),
):
    trace_rolling_creation(bank_df, legend_name, line_colour)

# ... then the filled tweet-count areas (secondary axis)
for bank_df, legend_name, area_colour in (
    (ABSA_df, "ABSA", 'rgb(220, 14, 26)'),
    (Nedbank_df, "NedBank", 'rgb(7, 138, 77)'),
    (Standard_df, "Std Bank", 'rgb(5, 77, 179)'),
    (FNB_df, "FNB", 'rgb(25, 211, 243)'),
):
    trace_count_creation(bank_df, legend_name, area_colour)

# set figure layout
combined_layout = dict(
    template="seaborn",
    title_text="Rolling 7d Sentiment vs Count of tweets",
    legend_title="Banks",
    font=dict(size=12),
    autosize=False,
    width=1000,
    height=600,
    margin=dict(l=10,r=10,b=50,t=50, pad=2),
)
fig.update_layout(**combined_layout)

# Set x-axis title
fig.update_xaxes(title_text="Date")

# Set y-axes titles and fixed ranges (primary = rolling polarity, secondary = count)
fig.update_yaxes(secondary_y=False, title_text="Rolling", range=[-0.1,0.4])
fig.update_yaxes(secondary_y=True, title_text="Count", range=[0,40000])

fig.show()

### Day / Month sentiment comparison 

In [None]:
#Create day and month (plus hour) columns from the datetime index of each bank frame
for bank_df in (ABSA_df, Nedbank_df, Standard_df, FNB_df):
    timestamps = bank_df.index
    bank_df["Day"] = timestamps.day_name()
    bank_df["Month"] = timestamps.month_name()
    bank_df["Hour"] = timestamps.hour

In [None]:
#### Sort Day and month columns
# Ordered categoricals make the columns sort in calendar order instead of alphabetically.
from pandas.api.types import CategoricalDtype

days = [ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
months = [ 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

day_type = CategoricalDtype(categories=days, ordered=True)
month_type = CategoricalDtype(categories=months, ordered=True)

for bank_df in (ABSA_df, Nedbank_df, Standard_df, FNB_df):
    bank_df['Day'] = bank_df['Day'].astype(day_type)
    bank_df['Month'] = bank_df['Month'].astype(month_type)

In [None]:
plt.figure(figsize = (15,5))

# mean polarity per month, one line per bank: (frame, line colour, legend label)
for bank_df, line_colour, legend_label in (
    (FNB_df, "c", "FNB"),
    (Nedbank_df, "g", "Nedbank"),
    (ABSA_df, "r", "ABSA"),
    (Standard_df, "b", "StdBank"),
):
    monthly_mean = bank_df.groupby("Month")["polarity"].mean()
    sns.lineplot(data=monthly_mean, color=line_colour, label=legend_label)

plt.title("Sentiment by month")
plt.show()

In [None]:
plt.figure(figsize = (20,5))

# (frame, line colour, legend label) for the four banks drawn in both panels
bank_lines = (
    (FNB_df, "c", "FNB"),
    (Nedbank_df, "g", "Nedbank"),
    (ABSA_df, "r", "ABSA"),
    (Standard_df, "b", "StdBank"),
)

# left panel: mean polarity per weekday
plt.subplot(1,2,1)
plt.title("Sentiment by day")
for bank_df, line_colour, legend_label in bank_lines:
    daily_mean = bank_df.groupby("Day")["polarity"].mean()
    sns.lineplot(data=daily_mean, color=line_colour, label=legend_label, sort=False)

# right panel: mean polarity per hour of the day
plt.subplot(1,2,2)
plt.title("Sentiment by hour")
for bank_df, line_colour, legend_label in bank_lines:
    hourly_mean = bank_df.groupby("Hour")["polarity"].mean()
    sns.lineplot(data=hourly_mean, color=line_colour, label=legend_label)

plt.show()