### Black Panther Tweets Sentiment Analysis

This analysis uses the twint library to mine tweets containing the keywords BlackPanther, WakandaForever, BlackPanther2, and Black Panther.
The tweets are analysed and preprocessed to clean them (removing emojis, punctuation, etc.). The VADER Sentiment library is then used for the sentiment analysis, and the results are visualized and communicated in Power BI.

In [None]:
#importing necessary libraries
import twint
import nest_asyncio
import pandas as pd
import regex as re
import preprocessor as p
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from collections import Counter

In [None]:
# NOTE(review): presumably needed so twint's asyncio loop can run inside the
# notebook's already-running event loop — confirm against nest_asyncio docs.
nest_asyncio.apply()
# Build the twint search configuration.
c = twint.Config()
c.Search = "BlackPanther OR WakandaForever OR BlackPanther2 OR Black Panther" # search query: any of the listed keywords
c.Limit = 2000000 # number of Tweets to scrape
c.Lang= "en" # language setting for the search
c.Store_csv = True # store tweets in a csv file
c.Output = "tweets.csv" # path to csv file
# Run the search with the configuration above.
twint.run.Search(c)

In [None]:
# Load the scraped tweets from tweets.csv into a DataFrame.
df=pd.read_csv('tweets.csv')

In [None]:
# Print a summary of the DataFrame to inspect its columns.
df.info()

In [None]:
# Remove the scraped columns that are not used in this analysis.
df = df.drop(
    columns=[
        'trans_dest',
        'trans_src',
        'translate',
        "retweet_date",
        'retweet_id',
        'user_rt',
        'user_rt_id',
        'source',
        'geo',
        'near',
        'quote_url',
        'hashtags',
    ],
)

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Drop every column that contains at least one missing value.
df = df.dropna(axis=1)

In [None]:
# Keep only the columns needed for the analysis, in this order.
df=df[['id','date','time','username','tweet','retweets_count','likes_count','retweet']]

In [None]:
# Reset the row index; the previous index is kept as an "index" column,
# which is dropped in the following step.
df=df.reset_index()

In [None]:
# Discard the "index" column left behind by reset_index().
df = df.drop(columns=["index"])

In [None]:
    """ 
    This function removes all hashtags found in tweets
    tweet: string
    a tweet that consists of hashtags to be cleaned
    
    
    returns 
    -------
    tweet: string
    a tweet without hashtags
    
    """
def hashtag_removal(tweet):
    """Lower-case a tweet and strip every hashtag from it.

    Parameters
    ----------
    tweet : str
        Raw tweet text, possibly containing ``#hashtags``.

    Returns
    -------
    str
        The lower-cased tweet with each ``#`` and the word characters
        that directly follow it removed.
    """
    # Remove every hashtag in a single regex pass, exactly where it occurs.
    # Replacing the matches one at a time with str.replace would let a short
    # tag such as "#black" also strip the start of "#blackpanther", leaving
    # "panther" behind in the text.
    return re.sub(r"#\w*", "", tweet.lower())
    

In [None]:
# Apply hashtag_removal to the tweet column and store the result in clean_tweet.
df['clean_tweet']=df['tweet'].apply(hashtag_removal)

In [None]:
# Run the tweet preprocessor library's clean() on every tweet to get rid of emojis.
# NOTE(review): clean() presumably strips other tweet artifacts too, depending
# on the library's configured options — confirm against its documentation.
df['clean_tweet']=df['clean_tweet'].apply(p.clean)

In [None]:
    """ 
    This function removes all punctuations specified in the function from the tweets
    r: string
    a tweet that consists of punctuations to be cleaned
    
    
    returns 
    -------
    r: string
    a tweet without punctuations
    
    """
# Deletion table for str.translate covering every punctuation character that
# punctuation_removal strips. A raw string keeps the backslash literal.
_PUNCTUATION_TABLE = str.maketrans("", "", r'''!()-[]{};:'"\,<>./?@#$%^&*_~''')


def punctuation_removal(r):
    """Remove HTML entities and punctuation characters from a tweet.

    Parameters
    ----------
    r : str
        Tweet text that may contain HTML entities (e.g. ``&amp;``) and
        punctuation.

    Returns
    -------
    str
        The text with every ``&name;`` entity removed as a whole and every
        character of ``!()-[]{};:'"\\,<>./?@#$%^&*_~`` deleted.
    """
    # Entities must go first, as whole units: once "&" and ";" are stripped
    # individually, "&amp;" would leave the stray word "amp" in the tweet.
    r = re.sub(r"&\w+;", "", r)
    # One translate pass deletes all punctuation characters at once.
    return r.translate(_PUNCTUATION_TABLE)
# Apply punctuation_removal to every cleaned tweet, overwriting clean_tweet.
df['clean_tweet']=df['clean_tweet'].apply(punctuation_removal)

In [None]:
# Tokenize each cleaned tweet and lemmatize every resulting token.
lemmatizer = WordNetLemmatizer()
w_tokenizer = TweetTokenizer()


def lemmatize_text(text):
    """Split ``text`` into tokens and return the list of their lemmas.

    Tokenization uses ``w_tokenizer``; each token is then passed to
    ``lemmatizer.lemmatize``.
    """
    tokens = w_tokenizer.tokenize(text)
    return list(map(lemmatizer.lemmatize, tokens))


df['tokenized_tweet'] = df['clean_tweet'].apply(lemmatize_text)

In [None]:
# English stop words from NLTK, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))


def _without_stop_words(tokens):
    """Return the tokens of one tweet that are not in ``stop_words``."""
    return [token for token in tokens if token not in stop_words]


df['tokenized_tweet'] = df['tokenized_tweet'].apply(_without_stop_words)

In [None]:
    """ 
    This function creates a sentiment analysis for each tweet
    tweet: string
    a tweet that sentiment analysis needs to be performed on
    
    
    returns 
    -------
    positive: string
    if the negative score is less than positive
    
    negative: string
    if the negative score is greater than positive
    
    neutral: string
    if neither of the above conditions is met.
    
    """
# Shared analyzer instance, created on the first call to sentiment_analyzer.
_VADER_ANALYZER = None


def sentiment_analyzer(tweet):
    """Label a tweet as positive, negative or neutral from its VADER scores.

    Parameters
    ----------
    tweet : str
        The tweet text to score.

    Returns
    -------
    str
        ``"positive"`` if the negative score is lower than the positive
        score, ``"negative"`` if it is higher, and ``"neutral"`` when the
        two scores are equal.
    """
    global _VADER_ANALYZER
    # Build the analyzer once and reuse it: this function is applied to every
    # row, and constructing a new analyzer per tweet repeats its setup work.
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    score = _VADER_ANALYZER.polarity_scores(tweet)
    if score['neg'] < score['pos']:
        return "positive"
    if score['neg'] > score['pos']:
        return "negative"
    return "neutral"

In [None]:
# Apply sentiment_analyzer to each clean tweet and store the label in Sentiments.
df['Sentiments']=df['clean_tweet'].apply(sentiment_analyzer)

In [None]:
# Cast and character names to search for in the cleaned tweets.
# Every name is lower-case with no punctuation because it is matched against
# clean_tweet, which has been lower-cased and stripped of punctuation
# (so "M'Baku" must be spelled "mbaku" to ever match).
casts = [
    "tchalla",
    "shuri",
    "nakia",
    "okoye",
    "mbaku",
    "riri",
    "aneka",
    "namor",
    "chadwick",
    "tenoch",
    "letitia",
    "ramonda",
    "angela",
    "mabel",
    "michaela",
    "danai",
    "lupita",
    "dominique",
    "winston",
]

In [None]:
    """ 
    This function gets the casts names that are both in cast list and the tweet 
    tweet: string
    a tweet
    
    
    returns 
    -------
    str

    a space-separated string of the cast names present in the tweet
    
    """

def getcast(tweet):
    """Return the names from ``casts`` that occur in ``tweet``.

    Each name is matched as a substring of the tweet. The matches are
    returned as one string, separated by single spaces and in the order
    they appear in ``casts``; the string is empty when nothing matches.
    """
    mentioned = []
    for name in casts:
        if name in tweet:
            mentioned.append(name)
    return " ".join(mentioned)

In [None]:
# Record, for every clean tweet, the cast names it mentions (space-separated).
df['cast']=df["clean_tweet"].apply(getcast)

In [None]:
cast_list = df['cast'].tolist()

# Flatten the space-separated names so that every mention is its own entry.
cast = [name for names in cast_list for name in names.split()]

# Count the mentions of each name and tabulate them, most frequent first.
# Building the frame from most_common() with explicit column names also works
# when no name was found at all; the previous from_dict/reset_index approach
# raised a ValueError on an empty counter when the columns were renamed.
counts = Counter(cast)
cast_df = pd.DataFrame(counts.most_common(), columns=['cast', 'Count'])
cast_df.head(10)

In [None]:
# Save the processed DataFrame to clean_tweets.csv without writing the row index.
df.to_csv('clean_tweets.csv', index=False)