# Project to predict the popularity of Political leaders and detect hate speech and tribalism from Facebook and Twitter comments

## Imports

In [51]:
from webdriver_manager.chrome import ChromeDriverManager

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from bs4 import BeautifulSoup

import pandas as pd

import time

# for tweets collection
import tweepy

# text cleaning
import preprocessor as p

# text preprocessing
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet

#lemmatization
from nltk.stem import WordNetLemmatizer

#sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.tag import pos_tag

## visualization
import matplotlib.pyplot as plt
%matplotlib inline

# get the language of the text
import langdetect
#for ner recognition
import spacy

from langdetect.lang_detect_exception import LangDetectException

# GET FACEBOOK COMMENTS

In [27]:
# scroll down
def scroll():
    pop_out_btn = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "popup_xout")))
    pop_out_btn.click()

In [35]:
# funtion to click the view more button
def view_more_click(driver):
    try: 
        view_more_comments = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "View more comments…")))
        view_more_comments.click()
    
    except TimeoutException:
        view_more_comments = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "View previous comments…")))
        view_more_comments.click()
        
    except ElementClickInterceptedException:
        scroll()
    
    except (NoSuchElementException,ElementClickInterceptedException, StaleElementReferenceException) as es :
        print(es)
    html = driver.page_source
    return html

# function to click the button 50 times
def view_more_comm(driver):
    i = 0
    while i < 50:
        html = view_more_click(driver)
        time.sleep(3)
        i += 1
    return html

In [4]:
# function to get a list of all the comment elements
def get_comments(html):
    soup = BeautifulSoup(html, 'html.parser')
    comment_section = soup.find("div", {"class":"_59e9 _1-ut _2a_g _34oh" })
    comment_els = comment_section.find_all("div", {"class":"_2a_i"})
    
    master_list = []
    for c in comment_els:
        data_dict = {}
        data_dict["User"] = c.find("div", {"class":"_2b05"}).text
        data_dict["Comment"] = c.find("div", {"data-sigil":"comment-body"}).text
        master_list.append(data_dict)
    return master_list

    
    

# GETTING TWITTER DATA

## Twitter Authentication

In [5]:
# read the keys from the file 
def authorize_twitter():
    keys = []
    with open('../Keys.txt') as f:
        for line in f:
            keys.append(line.strip())

    API_KEY = keys[1]
    API_KEY_SECRET = keys [4]
    ACCESS_TOKEN = keys[10]
    ACCESS_TOKEN_SECRET = keys[13]
    
    # initialize the api
    auth = tweepy.OAuthHandler(API_KEY, API_KEY_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    api = tweepy.API(auth)
    
    return (api)

## Get the replies from the posts

In [6]:
def get_replies(url):
    name = url.split("/")[-3]
    tweet_id = url.split("/")[-1]

    #empty list for the replies
    replies = []
    
    api = authorize_twitter()
    # get replies from the tweet
    for tweet in tweepy.Cursor(api.search_tweets,q='to:'+name, result_type='recent').items(1000):
        if hasattr(tweet, 'in_reply_to_status_id_str'):
            if (tweet.in_reply_to_status_id_str==tweet_id):
                replies.append(tweet)
    
    #master list to hold all the data needed
    master_list = []
    for reply in replies:
        data_dict = {}
        data_dict["User"] = reply.author.screen_name
        data_dict["Comment"] = reply.text

        master_list.append(data_dict)
        
    return (master_list)

# TEXT PREPROCESSING

## Tokenization and POS tagging + removing stop words

In [7]:
#perform tokenization and pos tagging and remove stop words
pos_dict = {'J': wordnet.ADJ, 'v': wordnet.VERB, 'N': wordnet.NOUN, 'R': wordnet.ADV}
def token_stop_pos(text):
    tags = pos_tag(word_tokenize(text))
    newlist = []
    
    for word, tag in tags:
        if word.lower() not in set(stopwords.words('english')):
            newlist.append(tuple([word, pos_dict.get(tag[0])]))
            
    return newlist

## Lemmatization

In [8]:
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize(pos_data):
    lemma_rew = " "
    for word, pos in pos_data:
        if not pos:
            lemma = word
            lemma_rew = lemma_rew + " " + lemma
        else:
            lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
            lemma_rew = lemma_rew + " " + lemma
    return lemma_rew


## Language detection

In [49]:
def get_language(text):
    try:
        lang = langdetect.detect(text)
        
        return lang
    except LangDetectException:
        print(text, "is invalid")

## SENTIMENT ANALYSIS USING VADER

In [9]:
def vaderSentimentAnalysis(text):
    analyzer = SentimentIntensityAnalyzer()
    vs= analyzer.polarity_scores(text)
    return vs['compound']

# function to analyse 
def vader_analysis(compound):
    if compound > 0:
        return 'Positive'
    elif compound < 0 :
        return 'Negative'
    else:
        return 'Neutral'

## Get names and parties from the text

In [10]:
def get_parties(text):
    for i in range(len(text)):
        parties=[]
        tags = text[i][1]
        if tags == "POLITICAL_PARTY":
            party = text[i][0]
            if party not in parties:
                parties.append(party)
        else:
            continue
        return parties

In [11]:
def get_names(text):
    names=[]
    for i in range(len(text)):
        tags = text[i][1]
        if tags == "POLITICIAN":
            name = text[i][0]
            if name not in names:
                names.append(name)
        else:
            continue
    return names

# MAIN FILE

In [36]:

choice = input("Please select the choice of your data. \n 1. for facebook. \n 2. For twitter")
if choice == "1":
    # install webdriver
    driver = webdriver.Chrome(ChromeDriverManager().install())
    url = input("Paste the Facebook mobile url: ")
    driver.get(url)

    print("Getting comments please wait...\n")
    html = view_more_comm(driver)

    master_list = get_comments(html)

    print(str(len(master_list)) + " Comments found")
        
else:
    url = input("Paste the tweet url: ")
    print("----Getting replies please wait------")
    master_list = get_replies(url)
    print(str(len(master_list)) + " Comments found") 


Please select the choice of your data. 
 1. for facebook. 
 2. For twitter1




Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome
Driver [C:\Users\NYAM\.wdm\drivers\chromedriver\win32\99.0.4844.51\chromedriver.exe] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install())


Paste the Facebook mobile url: https://mobile.facebook.com/story.php?story_fbid=10160054567584430&id=88201339429&m_entstream_source=timeline
Getting comments please wait...

1274 Comments found


In [37]:
    #create a dataframe and clean the text
df = pd.DataFrame(master_list)
df["clean_tweets"] = df["Comment"].apply(lambda x: p.clean(x))
df

Unnamed: 0,User,Comment,clean_tweets
0,Jibril Iron Shaffi,It must be a special helicopter just like the ...,It must be a special helicopter just like the ...
1,Fieldmarshal Newton Mwaura,"Tell that to the birds, very ignorant post and...","Tell that to the birds, very ignorant post and..."
2,Khalif Kairo,Media loves overblowing stuff😅 Blades pekee fo...,Media loves overblowing stuff Blades pekee for...
3,Grace Waithira,"Insurance does not pay for Campaign incidents,...","Insurance does not pay for Campaign incidents,..."
4,Mutaih Vinneous,Si waliiba condoms pale KEMSA. Wauze na watumi...,Si waliiba condoms pale KEMSA. Wauze na watumi...
...,...,...,...
1269,Alex Yang Muembu,Stephen Mukangai can do it at 20k..🏃✊🇰🇪,Stephen Mukangai can do it at k..
1270,Sambula De King,Kaka Wilfred 😁😁😁😁 ...,Kaka Wilfred ...
1271,Okemwa Anderson,Dorothy Makabe I will surprise you with the co...,Dorothy Makabe I will surprise you with the co...
1272,Kago Mburu,Mbuguss Mwalimu kapisaa,Mbuguss Mwalimu kapisaa


## Text Preprocessing

In [38]:
df['POS tagged'] = df['clean_tweets'].apply(lambda x: token_stop_pos(x))
df

Unnamed: 0,User,Comment,clean_tweets,POS tagged
0,Jibril Iron Shaffi,It must be a special helicopter just like the ...,It must be a special helicopter just like the ...,"[(must, None), (special, a), (helicopter, n), ..."
1,Fieldmarshal Newton Mwaura,"Tell that to the birds, very ignorant post and...","Tell that to the birds, very ignorant post and...","[(Tell, None), (birds, n), (,, None), (ignoran..."
2,Khalif Kairo,Media loves overblowing stuff😅 Blades pekee fo...,Media loves overblowing stuff Blades pekee for...,"[(Media, n), (loves, None), (overblowing, None..."
3,Grace Waithira,"Insurance does not pay for Campaign incidents,...","Insurance does not pay for Campaign incidents,...","[(Insurance, n), (pay, None), (Campaign, n), (..."
4,Mutaih Vinneous,Si waliiba condoms pale KEMSA. Wauze na watumi...,Si waliiba condoms pale KEMSA. Wauze na watumi...,"[(Si, n), (waliiba, n), (condoms, n), (pale, a..."
...,...,...,...,...
1269,Alex Yang Muembu,Stephen Mukangai can do it at 20k..🏃✊🇰🇪,Stephen Mukangai can do it at k..,"[(Stephen, n), (Mukangai, n), (k, n), (.., n)]"
1270,Sambula De King,Kaka Wilfred 😁😁😁😁 ...,Kaka Wilfred ...,"[(Kaka, n), (Wilfred, None), (..., None)]"
1271,Okemwa Anderson,Dorothy Makabe I will surprise you with the co...,Dorothy Makabe I will surprise you with the co...,"[(Dorothy, n), (Makabe, n), (surprise, None), ..."
1272,Kago Mburu,Mbuguss Mwalimu kapisaa,Mbuguss Mwalimu kapisaa,"[(Mbuguss, n), (Mwalimu, n), (kapisaa, n)]"


In [39]:
# Lemmatization
df['Lemma'] = df['POS tagged'].apply(lambda x: lemmatize(x))
df.head()

Unnamed: 0,User,Comment,clean_tweets,POS tagged,Lemma
0,Jibril Iron Shaffi,It must be a special helicopter just like the ...,It must be a special helicopter just like the ...,"[(must, None), (special, a), (helicopter, n), ...",must special helicopter like standard gauge ...
1,Fieldmarshal Newton Mwaura,"Tell that to the birds, very ignorant post and...","Tell that to the birds, very ignorant post and...","[(Tell, None), (birds, n), (,, None), (ignoran...","Tell bird , ignorant post costing narrative"
2,Khalif Kairo,Media loves overblowing stuff😅 Blades pekee fo...,Media loves overblowing stuff Blades pekee for...,"[(Media, n), (loves, None), (overblowing, None...",Media loves overblowing stuff Blades pekee c...
3,Grace Waithira,"Insurance does not pay for Campaign incidents,...","Insurance does not pay for Campaign incidents,...","[(Insurance, n), (pay, None), (Campaign, n), (...","Insurance pay Campaign incident , , Stone th..."
4,Mutaih Vinneous,Si waliiba condoms pale KEMSA. Wauze na watumi...,Si waliiba condoms pale KEMSA. Wauze na watumi...,"[(Si, n), (waliiba, n), (condoms, n), (pale, a...",Si waliiba condom pale KEMSA . Wauze na watu...


## Get only english text

In [53]:
df['language'] = df['clean_tweets'].apply(lambda x: get_language(x) if x.strip()!="" else "")
df = df[df['language']=='en'].reset_index(drop=True)
df.drop('language', axis=1)
df

Unnamed: 0,User,Comment,clean_tweets,POS tagged,Lemma,language
0,Jibril Iron Shaffi,It must be a special helicopter just like the ...,It must be a special helicopter just like the ...,"[(must, None), (special, a), (helicopter, n), ...",must special helicopter like standard gauge ...,en
1,Fieldmarshal Newton Mwaura,"Tell that to the birds, very ignorant post and...","Tell that to the birds, very ignorant post and...","[(Tell, None), (birds, n), (,, None), (ignoran...","Tell bird , ignorant post costing narrative",en
2,Khalif Kairo,Media loves overblowing stuff😅 Blades pekee fo...,Media loves overblowing stuff Blades pekee for...,"[(Media, n), (loves, None), (overblowing, None...",Media loves overblowing stuff Blades pekee c...,en
3,Grace Waithira,"Insurance does not pay for Campaign incidents,...","Insurance does not pay for Campaign incidents,...","[(Insurance, n), (pay, None), (Campaign, n), (...","Insurance pay Campaign incident , , Stone th...",en
4,Richie Nzeki,"Arrest those who threw stones, present them in...","Arrest those who threw stones, present them in...","[(Arrest, n), (threw, None), (stones, n), (,, ...","Arrest threw stone , present court , pay cos...",en
...,...,...,...,...,...,...
614,Ôsmân KE,I can do the repairs at 5 million. Inbox assap...,I can do the repairs at million. Inbox assap ..,"[(repairs, n), (million, None), (., None), (In...",repair million . Inbox assap ..,en
615,Kiprono Ruthson,"BBI stipends and allowance irudi kwa economy, ...","BBI stipends and allowance irudi kwa economy, ...","[(BBI, n), (stipends, n), (allowance, n), (iru...","BBI stipend allowance irudi kwa economy , wi...",en
616,Nyongo Mkalia Ini,I repair broken helicopter windscreens at nego...,I repair broken helicopter windscreens at nego...,"[(repair, None), (broken, a), (helicopter, n),...",repair broken helicopter windscreen negotiab...,en
617,Richard Muthoni,Let them Consort me I will fix it at a very co...,Let them Consort me I will fix it at a very co...,"[(Let, None), (Consort, None), (fix, None), (c...",Let Consort fix considerable friendly price ...,en


## Geta clean dataframe for text analysis

In [54]:
fin_data= pd.DataFrame(df[['Comment', 'Lemma']])

## Sentiment Analysis

In [55]:
fin_data['Vader_Sentiment'] = fin_data['Comment'].apply(lambda x: vaderSentimentAnalysis(x))
fin_data['Vader_Analysis'] = fin_data['Vader_Sentiment'].apply(lambda x: vader_analysis(x))
fin_data[1:50]

Unnamed: 0,Comment,Lemma,Vader_Sentiment,Vader_Analysis
1,"Tell that to the birds, very ignorant post and...","Tell bird , ignorant post costing narrative",-0.3384,Negative
2,Media loves overblowing stuff😅 Blades pekee fo...,Media loves overblowing stuff Blades pekee c...,0.7351,Positive
3,"Insurance does not pay for Campaign incidents,...","Insurance pay Campaign incident , , Stone th...",-0.6254,Negative
4,"Arrest those who threw stones, present them in...","Arrest threw stone , present court , pay cos...",-0.8271,Negative
5,"How now, and the windscreen wasn't even armoured?",", windscreen n't even armoured ?",0.0,Neutral
6,"If the insurance company is refusing to pay, t...","insurance company refusing pay , deep state .",-0.4767,Negative
7,Most of these insurance companies are fake. T...,insurance company fake . even belong fellows...,-0.4767,Negative
8,The INSURANCE company were refusing to PAY for...,INSURANCE company refusing PAY replacement s...,-0.656,Negative
9,Actually it's more costly to repair V8 bullet ...,Actually 's costly repair V8 bullet proof wi...,0.204,Positive
10,They should withdraw a bit of the KEMSA money ...,withdraw bit KEMSA money stole handle minor ...,-0.7096,Negative


In [56]:
vd_counts = fin_data["Vader_Analysis"].value_counts()
vd_counts

Neutral     285
Negative    185
Positive    149
Name: Vader_Analysis, dtype: int64

## Named Entity Recognition

In [57]:
nlp = spacy.load("political_ner_model")
fin_data["tags"] = df["clean_tweets"].apply(lambda x: [(tag.text, tag.label_) for tag in nlp(x).ents])
fin_data[1:50]

Unnamed: 0,Comment,Lemma,Vader_Sentiment,Vader_Analysis,tags
1,"Tell that to the birds, very ignorant post and...","Tell bird , ignorant post costing narrative",-0.3384,Negative,[]
2,Media loves overblowing stuff😅 Blades pekee fo...,Media loves overblowing stuff Blades pekee c...,0.7351,Positive,"[(Blades, POLITICIAN)]"
3,"Insurance does not pay for Campaign incidents,...","Insurance pay Campaign incident , , Stone th...",-0.6254,Negative,"[(UDA, POLITICAL_PARTY), (Maintain, POLITICIAN)]"
4,"Arrest those who threw stones, present them in...","Arrest threw stone , present court , pay cos...",-0.8271,Negative,"[(Arrest, POLITICAL_PARTY)]"
5,"How now, and the windscreen wasn't even armoured?",", windscreen n't even armoured ?",0.0,Neutral,[]
6,"If the insurance company is refusing to pay, t...","insurance company refusing pay , deep state .",-0.4767,Negative,[]
7,Most of these insurance companies are fake. T...,insurance company fake . even belong fellows...,-0.4767,Negative,[]
8,The INSURANCE company were refusing to PAY for...,INSURANCE company refusing PAY replacement s...,-0.656,Negative,[]
9,Actually it's more costly to repair V8 bullet ...,Actually 's costly repair V8 bullet proof wi...,0.204,Positive,"[(WSR, POLITICIAN)]"
10,They should withdraw a bit of the KEMSA money ...,withdraw bit KEMSA money stole handle minor ...,-0.7096,Negative,[]


## Get names and political parties

In [58]:
fin_data["Parties"] = fin_data["tags"].apply(lambda x: get_parties(x))
fin_data["Politicians"] = fin_data["tags"].apply(lambda x: get_names(x))
fin_data[50:100]

Unnamed: 0,Comment,Lemma,Vader_Sentiment,Vader_Analysis,tags,Parties,Politicians
50,"If all chopper is 300M or even below that, how...","chopper even , screen , tell bird",0.0,Neutral,[],,[]
51,Endeleeni kuongea mambo ya helicopter badala y...,Endeleeni kuongea mambo ya helicopter badala...,0.2023,Positive,"[(Ruto, POLITICIAN), (Raila, POLITICIAN)]",,"[Ruto, Raila]"
52,The helicopter can be sold to scrape dealers t...,helicopter sold scrape dealer beyond repair .,0.0,Neutral,[],,[]
53,That's a lie and anyway who cares . Where is t...,'s lie anyway cares . fuel wacheni ujinga ....,0.4588,Positive,[],,[]
54,"they had it in mind when they were doing it, l...","mind , let replace air shield , hyo syo guru...",0.0,Neutral,[],,[]
55,Non of our business use kemsa stolen billions ...,Non business use kemsa stolen billion purcha...,-0.4939,Negative,[],,[]
56,A very good excuse to use your long forgotten ...,good excuse use long forgotten paybill number,0.489,Positive,[],,[]
57,If BBI each MCA was given 2m car grant to pass...,BBI MCA given car grant pass . drop ocean,0.1027,Positive,[],,[]
58,"You are talking of helicopter, while some tax ...","talking helicopter , tax payer going hungry",0.0,Neutral,[],,[]
59,And how much costs Kenyans whose businesses ge...,much cost Kenyans whose business get destroy...,-0.1531,Negative,"[(Raila, POLITICIAN)]",,[Raila]


In [60]:
poll_counts = fin_data["Politicians"][0].value_counts()
poll_counts

AttributeError: 'list' object has no attribute 'value_counts'