## Project: Bank Analysis - Tweets 
Using Twint to scrape tweets 

Use beautiful soup to clean tweets with WordPunctTokenizer


# Quick Twint code
Github - https://github.com/twintproject/twint

All functions - https://github.com/twintproject/twint/wiki/Configuration
### setup 
c = twint.Config()

#### set username
c.Username = "realDonaldTrump"

#### set phrase search
c.Search = "great"

#### customise output
c.Custom["tweet"] = ["id"]           --- assign column names

c.Custom["user"] = ["bio"]

c.Limit = 1                          ---- limit to batches (unknown size)

c.Since = "2019-04-29"

c.Until = "2020-04-29"

#### Pandas 
c.Pandas = True

    --------once run save to dataframe
    
    ---- Tweets 
    df = twint.storage.panda.Tweets_df
    
    ---- followers 
    Followers_df = twint.storage.panda.Follow_df
    
    df = Followers_df['followers'][username]

#### Write output 
c.Store_csv = True

c.Output = "test.csv"

### Execute - the run function you call determines how the setup above is used
twint.run.Search(c)    --- will run search for all tweets with the above

twint.run.Profile(c)   ---- will run against profile - return only this profiles tweets

twint.run.Followers(c) ---- get follower info 


config.Since = "2019-04-29"\
config.Until = "2020-04-29"\
config.Store_json = True


In [None]:

import twint
import pandas as pd
import nest_asyncio
nest_asyncio.apply()            #for compatibility issues 
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import datetime as dt
import seaborn as sns

#cleaning
from bs4 import BeautifulSoup
import re
from nltk.tokenize import WordPunctTokenizer

# NLP
from textblob import TextBlob
#from IPython.display import Markdown, display

#word cloud and stopwords
from wordcloud import WordCloud, STOPWORDS

### Configure and run Twint (Twitter scraper)

In [None]:
# Search query: the bank's handle, or its full name as an exact phrase.
bankString = 'StandardBankZA OR "Standard Bank"'
# Start date handed to the scraper (YYYY-MM-DD).
since = "2021-05-27"

def twintRun(date_from,search_string):
    """Run a Twint search and keep the results in Twint's pandas storage.

    date_from is assigned to the config's ``Since`` field and search_string
    to its ``Search`` field. Nothing is returned; progress is printed before
    and after the search runs.
    """
    config = twint.Config()
    config.Pandas = True
    config.Since = date_from
    config.Search = search_string
    #c.Until = "2021-07-30"
    print("running \n")
    twint.run.Search(config)
    print("completed")

In [None]:
%%time
# Run the scrape with the start date and search query defined above.
twintRun(since,bankString)

## Pandas - analyse the data

In [None]:
# Pick up the scraped tweets from Twint's module-level pandas storage.
tweets_df = twint.storage.panda.Tweets_df

In [None]:
# List the columns available in the scraped data.
tweets_df.columns

In [None]:
# Number of rows in the scraped data.
len(tweets_df)

In [None]:
# Export the raw, uncleaned scrape to CSV.
tweets_df.to_csv("pre_cleaning.csv")

In [None]:
# Narrow the scrape down to a handful of columns.
analysis_columns = ["date", "username", "tweet", "hashtags", "nlikes", "search"]
sub_tweets = tweets_df[analysis_columns]

In [None]:
# Preview the first five rows of the subset.
sub_tweets.head(5)

### Cleaning tweet data 


In [None]:
# Compiled once: everything sub_patterns strips from a raw tweet.
_TWEET_NOISE_RE = re.compile(
    r"@[A-Za-z0-9_]+"      # @mentions (handles may contain underscores)
    r"|https?://\S+"       # URLs, up to the next whitespace
    r"|\#\w+"              # hashtags - already captured by Twint
    r"|\'s"                # floating 's
)


def sub_patterns(text):
    """Remove mentions, URLs, hashtags and possessive 's from a tweet.

    Args:
        text: Raw tweet text.

    Returns:
        The text with every match deleted. Surrounding whitespace is left
        untouched, so removed items can leave double spaces behind.
    """
    return _TWEET_NOISE_RE.sub("", text)

def clean_text(text):
    """Normalise one tweet into a space-separated string of lowercase words.

    Steps: strip tweet-specific patterns with sub_patterns, lowercase, drop
    every character that is not an ASCII letter or whitespace, tokenize, and
    discard tokens found in STOPWORDS.

    Args:
        text: Raw tweet text.

    Returns:
        The remaining tokens joined by single spaces (may be an empty string).
    """
    # The pattern stripper defined in this file is sub_patterns; the previous
    # call to remove_content referenced a name that does not exist.
    text = sub_patterns(text)

    # Remove non-alphabetic characters but keep whitespace as word separators.
    text = re.sub(r'[^A-Za-z\s]', '', text.lower())
    tokens = WordPunctTokenizer().tokenize(text)
    return ' '.join(word for word in tokens if word not in STOPWORDS)

#### Run tweet cleaner

In [None]:
# Clean every raw tweet, keeping the results in row order.
Data_to_clean = sub_tweets["tweet"]
cleaned_list = [clean_text(raw_tweet) for raw_tweet in Data_to_clean]

In [None]:
# Wrap the cleaned text in a one-column frame for sentiment analysis.
clean_df = pd.DataFrame({'tweet': cleaned_list})
clean_df.head()

## Sentiment analysis 

In [None]:
# Score each cleaned tweet once with TextBlob; element 0 of the result is
# used as polarity and element 1 as subjectivity.
sentiments = [TextBlob(tweet).sentiment for tweet in clean_df['tweet']]

# Assign whole columns at once rather than writing cell-by-cell into the
# frame while iterating over it. This also creates the columns when the
# frame is empty.
clean_df['polarity'] = pd.Series(
    [scores[0] for scores in sentiments], index=clean_df.index, dtype=float
)
clean_df['subjectivity'] = pd.Series(
    [scores[1] for scores in sentiments], index=clean_df.index, dtype=float
)

# Positive / Negative label from the sign of the polarity; exactly zero is Neutral.
clean_df['Sentiment'] = [
    "Positive" if scores[0] > 0 else "Negative" if scores[0] < 0 else "Neutral"
    for scores in sentiments
]

In [None]:
# Save the cleaned tweets with their sentiment columns, then preview them.
clean_df.to_csv("cleaned.csv")
clean_df.head()

## merge the dataframes - to get date 

In [None]:
# Pair each cleaned/scored row with its original tweet row via the shared index.
Final_df = pd.merge(clean_df, sub_tweets, left_index=True, right_index=True)

In [None]:
# Both merged frames carried a 'tweet' column; give the suffixed copies
# descriptive names, then export and preview the result.
renamed_columns = {"tweet_x": "cleaned_tweet", "tweet_y": "Base_tweet"}
Final_df = Final_df.rename(columns=renamed_columns)
Final_df.to_csv("check.csv")
Final_df.head()

In [None]:
# Ensure date is seen as datetime rather than text.
Final_df["date"] = pd.to_datetime(Final_df["date"])

# Put the tweets in chronological order before indexing by date. The
# expanding and time-window rolling means computed next depend on row order,
# and the scrape order is not guaranteed to be oldest-first.
# mergesort is stable, so tweets sharing a timestamp keep their relative order.
Final_df = Final_df.sort_values("date", kind="mergesort")

# Set index = date so as to create rolling and expanding mean
# (the column is already datetime, so no second conversion is needed).
Final_df.index = Final_df["date"]

In [None]:
# Running average of polarity over all rows up to and including each row.
expanding_polarity = Final_df['polarity'].expanding()
Final_df['mean'] = expanding_polarity.mean()

In [None]:
# Average polarity over a one-day, time-based window on the datetime index.
daily_window = Final_df['polarity'].rolling("1d")
Final_df['rolling'] = daily_window.mean()

In [None]:
# Compare raw polarity with its expanding and rolling means for the first ten rows.
Final_df[["polarity","mean","rolling"]].head(10)

### Check distribution

In [None]:
# Histogram of tweet polarity scores on a wide figure.
fig = plt.figure(figsize=(15, 7))
polarity_values = Final_df['polarity']
sns.histplot(polarity_values)
plt.show()

In [None]:
##### remove neutral values 
#Final_df = Final_df[Final_df.polarity != 0]

# Word Cloud

In [None]:
# One lowercase corpus built from every cleaned tweet.
tweetString = " ".join(Final_df["cleaned_tweet"]).lower()

# Remove the bank name so it does not dominate the cloud.
bank_name_pattern = r"standardbankza|standard bank|bank"
tweetString1 = re.sub(bank_name_pattern, "", tweetString)

cloud_builder = WordCloud(
    width=800,
    height=500,
    background_color='white',
    min_font_size=5,
)
wordcloud = cloud_builder.generate(tweetString1)

# Render the cloud without axes or padding.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Word cloud restricted to tweets labelled Negative.
Negative_df = Final_df[(Final_df.Sentiment=="Negative")]
NegativeString = " ".join(list(Negative_df["cleaned_tweet"])).lower()

# Remove the bank name from the negative corpus. This cell previously
# re-used the all-tweets string, so it redrew the overall cloud instead.
NegativeString1 = re.sub(r"standardbankza|standard bank|bank","",NegativeString)

if NegativeString1.strip():
    wordcloud = WordCloud(width = 800, height = 500,
                    background_color ='white',
                    min_font_size = 5).generate(NegativeString1)
    plt.figure(figsize = (8, 8), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.show()
else:
    # WordCloud.generate cannot build a cloud from text with no words.
    print("No negative tweet text available for a word cloud")

In [None]:
# Show the joined text of the negative tweets.
NegativeString

In [None]:
# Rolling and expanding mean polarity plotted against tweet date.
fig, ax = plt.subplots(figsize=(20, 10))
tweet_dates = Final_df['date']
ax.plot(tweet_dates, Final_df['rolling'], color='r', label='Rolling Mean')
ax.plot(tweet_dates, Final_df['mean'], color='y', label='Expanding Mean')
#z= plt.plot(Final_df['date'],Final_df["polarity"])

plt.show()

In [None]:
# Per-tweet polarity with the rolling and expanding means overlaid.
fig, ax = plt.subplots(figsize=(20, 10))
tweet_dates = Final_df['date']

# Plot data
ax.scatter(tweet_dates, Final_df['polarity'], label='Sentiment')
ax.plot(tweet_dates, Final_df['rolling'], color='r', label='Rolling Mean')
ax.plot(tweet_dates, Final_df['mean'], color='y', label='Expanding Mean')

# Title and axis labels
ax.set_title('STD Bank Tweet Sentiment')
ax.set_xlabel('Date')
ax.set_ylabel('Sentiment')

# Set font and rotation for date tick labels
fig.autofmt_xdate()

ax.legend(loc='best')
fig.tight_layout()

plt.show()

In [None]:
# Save the merged frame, including the sentiment and mean columns, to CSV.
Final_df.to_csv("NLP_Standardbank_July.csv")

In [None]:
# Keep only the rows labelled Negative.
is_negative = Final_df["Sentiment"] == "Negative"
Negative_df = Final_df[is_negative]

In [None]:
# Preview the negative tweets.
Negative_df.head()