# Assignment I: Scraping, Twitter API & Pandas
## Name: Mouselinos Spyridon 
## Date: October 2019

## Part 1: Scraping Twitter Accounts


### 1.1 Connect to Twitter
***

In [None]:
### For the rest of the exercise we have successfully created a twitter account and stored our credentials/key-pairs in a file named twitter_config.py
### For the sake of this exercise the file is located in the same folder as this notebook.

In [None]:
import tweepy
import time
from twitter_config import config

In [None]:
### Function to Establish Connection to Twitter Api

In [None]:
def establish_connection(config):
    """Build an authenticated tweepy API client from a credentials mapping.

    Parameters
    ----------
    config : mapping
        Must provide the keys 'consumer_key', 'consumer_secret',
        'access_token' and 'access_token_secret'.

    Returns
    -------
    The object returned by ``tweepy.API(auth)``.

    Raises
    ------
    KeyError
        If one of the required keys is missing from ``config``.
    ConnectionError
        If the API client cannot be constructed; the original error is
        chained as ``__cause__``.
    """
    auth = tweepy.OAuthHandler(config['consumer_key'], config['consumer_secret'])
    auth.set_access_token(config['access_token'], config['access_token_secret'])
    try:
        api = tweepy.API(auth)
    except Exception as exc:
        # `raise("...")` is itself a TypeError (a str is not an exception) and
        # masked the real failure; raise a proper exception and keep the cause.
        raise ConnectionError("Connection Not Established...") from exc
    return api

In [None]:
api = establish_connection(config=config)

### Fetch the most trendy topics for Athens and print the first 10 (HINT: woeid 946738).

In [None]:
### Function to return N first Trends of a WOEID Location

In [None]:
def fetch_n_trends_for_location(api, woeid, first_n_results):
    """Print the names of the first ``first_n_results`` trends of a location.

    Parameters
    ----------
    api : object exposing ``trends_place(id=...)``
    woeid : location identifier forwarded to ``trends_place``
    first_n_results : int
        Upper bound on how many trend names are printed.

    Returns
    -------
    None. The names are written to stdout, one per line.
    """
    response = api.trends_place(id=woeid)
    # The trends live under the 'trends' key of the first response element.
    leading_trends = response[0]['trends'][:first_n_results]
    for entry in leading_trends:
        print(entry['name'])

In [None]:
fetch_n_trends_for_location(api=api, woeid=946738,first_n_results=10)

### 1.2 Username Scraping
***

### Scrape 10 usernames from [the 49 best Twitter accounts to follow in UK politics](https://www.businessinsider.com/uk-politics-twitter-accounts-2016-8?r=US&IR=T#48-matt-singh-2) and put them in a list.

In [None]:
import re
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

In [None]:
### We need a BeautifulSoup (BS4) scraper that is used both in Question 1.2 and in all the following scraping questions.
### The scraper takes as input the link to scrape, plus the optional boolean argument `selenium`.
### For the sake of completeness I decided to support both non-JS and JS-rendered web pages.
### Note to user: if the `selenium` flag is set to True, a path to the geckodriver executable is needed.

In [None]:
def retrieve_soup(link, selenium=False, path_to_driver=r'C:\\Program Files\\GeckoDriver\\geckodriver.exe', timeout=30):
    """Fetch ``link`` and return its HTML parsed with BeautifulSoup.

    Parameters
    ----------
    link : str
        URL of the page to fetch.
    selenium : bool, default False
        When True the page is loaded in a headless Firefox (for pages that
        need JavaScript); otherwise a plain ``requests.get`` is used.
    path_to_driver : str
        Location of the geckodriver executable; only used when ``selenium``
        is True.
    timeout : float, default 30
        Seconds to wait for the HTTP response in the non-selenium path.

    Returns
    -------
    BeautifulSoup or None
        The parsed page, or None when the plain HTTP request failed (the
        error is printed). Callers should check for None before using it.
    """
    if selenium:
        options = Options()
        # We don't need an actual Firefox window to be opened.
        options.headless = True
        driver = webdriver.Firefox(options=options, executable_path=path_to_driver)
        try:
            # NOTE(review): per the Selenium docs an implicit wait only applies to
            # element lookups; it does not delay `page_source`. Kept as in the
            # original -- confirm whether an explicit wait is needed.
            driver.implicitly_wait(30)
            driver.get(link)
            return BeautifulSoup(driver.page_source, 'html.parser')
        finally:
            # Always shut the browser down, otherwise every call leaks a
            # headless Firefox process.
            driver.quit()

    try:
        r = requests.get(link, timeout=timeout)
        # Turn 4xx/5xx answers into HTTPError so the handler below is reachable
        # and an error page is not silently parsed as if it were the content.
        r.raise_for_status()
    except requests.exceptions.MissingSchema as ms:
        # Missing URL schema
        print(ms)
    except requests.exceptions.ConnectionError as ce:
        # Connection error
        print(ce)
    except requests.exceptions.HTTPError as herror:
        # Invalid HTTP response
        print(herror)
    except requests.exceptions.Timeout as toerr:
        # Timeout error
        print(toerr)
    else:
        print("Page retrieval OK")
        return BeautifulSoup(r.content, 'html.parser')
    # A handled request error: previously this path hit `return soup` with
    # `soup` unbound and crashed with UnboundLocalError.
    return None

In [None]:
### Now let's solve the exercise

In [None]:
soup = retrieve_soup("https://www.businessinsider.com/uk-politics-twitter-accounts-2016-8?r=US&IR=T")

In [None]:
### Real Names are pretty easy as they are hidden in the h2 Tag ###

In [None]:
name_list = [f.text.split('. ')[-1] for f in soup.find_all("h2")]

In [None]:
### The respective Twitter Usernames are in the class "slide-layout clearfix" as the first <a> tag ###

In [None]:
divs = soup.find_all("div", {"class":{"slide-layout clearfix"}})

### Note: Some of these strings contain the non-breaking space character `\xa0`,
### which needs to be removed explicitly as mentioned [here](https://stackoverflow.com/questions/10993612/python-removing-xa0-from-string)

In [None]:
twitter_username_list = [('@' + f.find('p').get_text().split('@')[-1].replace(u'\xa0', u'')) for f in divs]

In [None]:
### Now that we have the Twitter username list, let's perform a sanity check
assert len(twitter_username_list) == len(name_list)

In [None]:
### Now lets get the top 10 of them in order to put them in a List
top_10_list = twitter_username_list[-10:]

In [None]:
top_10_list

### 1.3 Fetch Tweets
***

In [None]:
import tweepy, datetime, time

* Fetch the tweets posted in the last 24 hours for each of the usernames.

* Use `try`/`except` to bypass any account that is not present (e.g., deleted ones) and print "Could not fetch @username" (where `username` is the name of the respective user).

* Use `datetime`, `timedelta` from `datetime` to define the last day.

* Use `tweepy.Cursor`, fetching no more than 100 tweets at a time.

* Use `api.user_timeline` to fetch the tweets, and check the Twitter API documentation on how [timelines](https://developer.twitter.com/en/docs/tweets/timelines/guides/working-with-timelines) work.

* Print the number of tweets you fetched for each username.

In [None]:
## The limit Handler you provided in lectures notes ##

def limit_handler(cursor):
    """Yield items from a tweepy cursor iterator while backing off on rate limits.

    Parameters
    ----------
    cursor : object exposing ``next()``
        Each ``next()`` call returns the next item and raises StopIteration
        when there is nothing left.

    Yields
    ------
    Whatever ``cursor.next()`` returns, in order.

    Notes
    -----
    * After every yielded item the generator sleeps 60 seconds before asking
      for the next one, to stay clear of the rate limit.
    * On ``tweepy.RateLimitError`` (or a ``TweepError`` whose API code is 419)
      it sleeps 15 minutes and retries; any other ``TweepError`` propagates.
    """
    while True:
        try:
            item = cursor.next()
        except StopIteration:
            # PEP 479: a StopIteration escaping a generator body is turned into
            # a RuntimeError, so finish the generator explicitly instead.
            return
        except tweepy.RateLimitError:
            # But if we do run into a rate limit, then
            # go to sleep for 15 minutes
            print('Rate limit reached')
            time.sleep(15 * 60)
            continue
        except tweepy.TweepError as te:
            # `te[0]['code']` is not valid on Python 3 (exceptions are not
            # subscriptable); read the API error code attribute instead.
            # NOTE(review): the value 419 is kept from the original source --
            # confirm it is the intended code (Twitter documents 88 / HTTP 429
            # for rate limiting).
            if str(getattr(te, 'api_code', None)) == '419':
                print('Requests limit reached')
                time.sleep(15 * 60)
                continue
            raise
        yield item
        # This is to avoid running into a rate limit
        time.sleep(60)

In [None]:
## Helper Functions ##
def check_tweet_time(tweet):
    """Return True when ``tweet`` was created less than 24 hours ago.

    Parameters
    ----------
    tweet : object with a ``created_at`` datetime attribute

    Returns
    -------
    bool

    Notes
    -----
    The comparison is done in UTC. A naive ``created_at`` is assumed to be
    expressed in UTC (NOTE(review): this matches how the Twitter API reports
    creation times -- confirm for the tweepy version in use). A
    timezone-aware ``created_at`` is compared against an aware "now".
    Using the local wall clock here would shift the 24h window by the
    machine's UTC offset.
    """
    created_at = tweet.created_at
    now = datetime.datetime.now(datetime.timezone.utc)
    if created_at.tzinfo is None:
        # Naive and aware datetimes cannot be subtracted; drop the tzinfo so
        # both sides are naive UTC.
        now = now.replace(tzinfo=None)
    return (now - created_at) < datetime.timedelta(days=1)
    
def check_if_retweet_or_reply(tweet):
    """Return True when ``tweet`` is an original post, i.e. neither a retweet nor a reply.

    Despite the name, True means the tweet is NOT a retweet or reply, so it
    is the "should this tweet be counted" answer.

    Parameters
    ----------
    tweet : object with ``retweeted`` and ``in_reply_to_status_id`` attributes,
        and optionally ``retweeted_status``.

    Returns
    -------
    bool
    """
    # `retweeted` alone is not enough: in the Twitter API it reports whether the
    # authenticating user retweeted the status. A status that is itself a
    # retweet carries a `retweeted_status` attribute, so check for that as well.
    is_retweet = bool(tweet.retweeted) or hasattr(tweet, 'retweeted_status')
    is_reply = tweet.in_reply_to_status_id is not None
    return not (is_retweet or is_reply)

In [None]:
def get_tweets(api, screen_name):
    """Count a user's original tweets from the last 24 hours.

    Pages through ``api.user_timeline`` (100 tweets per page) and stops at
    the first tweet that is older than 24 hours.

    Parameters
    ----------
    api : tweepy API client
    screen_name : str
        Account whose timeline is read.

    Returns
    -------
    tuple ``(since_id, max_id, n_tweets)``
        since_id : id of the first tweet returned by the timeline, or None
            if the timeline was empty. NOTE(review): this is the most recent
            tweet only if the timeline is delivered newest-first -- confirm.
        max_id : id of the first tweet found to be older than 24h, minus 1,
            or None if no such tweet was met before the pages ran out.
        n_tweets : number of tweets within the last 24h that are neither
            retweets nor replies.
    """
    # Upper bound (inclusive) for a later query; set when the 24h boundary is hit.
    max_id = None

    # Lower bound for a later query: only tweets with a greater id are newer
    # than what was processed here.
    since_id = None

    ## The total number of collected tweets ##
    n_tweets = 0

    cursor = tweepy.Cursor(api.user_timeline, count=100, screen_name=screen_name).pages()

    for page in limit_handler(cursor):
        for tweet in page:
            # Remember only the first tweet seen; overwriting this on every
            # tweet of the first page left it pointing at the last one instead.
            if since_id is None:
                since_id = tweet.id
            if not check_tweet_time(tweet):
                ## Update the max_id to the last seen tweet id ##
                max_id = tweet.id - 1
                print("User {} had {} tweets in the past 24h".format(screen_name, n_tweets))
                return since_id, max_id, n_tweets
            ## If the tweet is not a retweet or reply, count it ##
            if check_if_retweet_or_reply(tweet):
                n_tweets += 1

    # Reached when the pages run out before any tweet older than 24h is met
    # (e.g. a short or empty timeline). Falling off the end used to return
    # None, which made the caller's tuple unpacking fail.
    print("User {} had {} tweets in the past 24h".format(screen_name, n_tweets))
    return since_id, max_id, n_tweets

In [None]:
def get_latest_tweets(api, screen_name, since_id, max_id, n_tweets):
    """Report tweets a user posted since an earlier pass over their timeline.

    Reads a single page (up to 100 tweets) of ``api.user_timeline`` limited
    by ``since_id`` / ``max_id`` and prints how many additional original
    tweets from the last 24 hours were found, together with the grand total.

    Parameters
    ----------
    api : tweepy API client
    screen_name : str
    since_id : int or None
        Only tweets with an id greater than this are requested; omitted from
        the request when None.
    max_id : int or None
        Only tweets with an id up to this are requested; omitted from the
        request when None.
    n_tweets : int
        Count from the earlier pass, used for the grand total.

    Returns
    -------
    None. The result is printed.
    """
    # Forward only the bounds that are set: str(None) would send the literal
    # text 'None' to the API.
    # NOTE(review): if max_id is lower than since_id the requested window is
    # empty and no new tweet can ever be found -- verify what the caller passes.
    bounds = {}
    if since_id is not None:
        bounds['since_id'] = str(since_id)
    if max_id is not None:
        bounds['max_id'] = str(max_id)

    ## We will look only in the 1st page because come on how many tweets could one do in mere seconds? ##
    cursor = tweepy.Cursor(api.user_timeline, count=100, screen_name=screen_name, **bounds).pages(1)

    latest_tweets = 0

    def report():
        print("User {} had just made {} more tweets at a grand total of {}".format(screen_name,latest_tweets,latest_tweets+n_tweets))

    for page in limit_handler(cursor):
        for tweet in page:
            if not check_tweet_time(tweet):
                # Older than 24h: nothing further on the page is of interest.
                report()
                return
            ## If the tweet is not a retweet or reply, count it ##
            if check_if_retweet_or_reply(tweet):
                latest_tweets += 1

    # The summary used to be printed only when a tweet older than 24h was met,
    # so a page made up solely of fresh tweets (or an empty one) produced no
    # output at all.
    report()

In [None]:
### We will keep a dictionary with the first and last processed tweet ids of each eligible user ###
### Then perform a pass over them in order to find new tweets that were posted while we processed the other users ##

In [None]:
# For every scraped username: verify the account exists, count its tweets and
# remember the id bounds so a second pass can look for newer tweets.
retry_dict = {}
for tun in twitter_username_list:
    try:
        ## If user exists ##
        api.get_user(tun)
        ## Get his/her tweets ##
        since_id, max_id, n_tweets = get_tweets(api=api, screen_name=tun)
    except Exception as err:
        # `except Exception` rather than a bare `except`, so that interrupting
        # the kernel (KeyboardInterrupt) still stops this long-running loop.
        print("Could not fetch tweets for user: {}".format(tun))
        # Surface the cause; otherwise programming errors look like missing users.
        print("  reason: {!r}".format(err))
    else:
        retry_dict[tun] = [since_id, max_id, n_tweets]

In [None]:
# Second pass: for each user fetched above, look for tweets posted in the meantime.
for tun, (since_id, max_id, n_tweets) in retry_dict.items():
    try:
        get_latest_tweets(api=api, screen_name=tun, since_id=since_id, max_id=max_id, n_tweets=n_tweets)
    except Exception as err:
        # `except Exception` rather than a bare `except`, so that interrupting
        # the kernel (KeyboardInterrupt) is not swallowed.
        print("No new tweets for user: {}".format(tun))
        # This branch only runs on an error, so show what actually went wrong.
        print("  reason: {!r}".format(err))

## Part 2: Processing Twitter accounts

### 2.1 Scrape and Put in a `DataFrame`
***

In [None]:
import pandas as pd

#### Scrape the Twitter accounts of all the UK parliament members and put them in a `DataFrame`; you can use the following resource: <https://www.mpsontwitter.co.uk/list>.

In [None]:
### We will use our function retrieve_soup that was created above

In [None]:
soup = retrieve_soup("https://www.mpsontwitter.co.uk/list", selenium=False)

In [None]:
# Now we will fetch the table body from the table object that holds our data

In [None]:
table = soup.find('tbody',{'id':'mp_wrapper'})

In [None]:
## By inspecting the table we notice that the total number of accounts is 587 at the moment, however in order to be safe we keep a margin of 600 and then drop the N/A rows
## so we take advantage of that by inserting it as an index during the Frame Creation

#### Name the DataFrame columns as `name`, `username`, `constituency`, `party`, `num_followers`.

In [None]:
mp_dataframe = pd.DataFrame(columns=['name', 'username', 'constituency', 'party', 'num_followers'], index=range(0,600))

In [None]:
### Collect one record per table row and build the frame in a single step.
### Writing cell-by-cell into a pre-sized frame raised IndexError as soon as the
### table had more rows than the frame, and could not be re-run after dropna().
frame_columns = list(mp_dataframe.columns)
records = []
### For every row in the Table Body:
for row in table.find_all('tr'):
    ### Find all Columns
    cells = row.find_all('td')
    ### Keep the cells at positions 2..6 (the 3rd to the 7th cell); they are
    ### assigned, in order, to the frame's columns.
    values = [cell.get_text() for cell in cells[2:7]]
    ### Rows with fewer cells are padded with None so they stay incomplete
    ### and can be removed by the dropna() step that follows.
    values += [None] * (len(frame_columns) - len(values))
    records.append(values)

mp_dataframe = pd.DataFrame(records, columns=frame_columns)

In [None]:
mp_dataframe.dropna(inplace=True)

#### Make sure that the followers_num is shown as a number, not a string.

In [None]:
### Curate num_followers into a number ###

In [None]:
mp_dataframe['num_followers'] = mp_dataframe['num_followers'].apply(lambda x: int(x.replace(",", "")))

### 2.2 Create a Hierarchical Index
***

#### The `party` will be the top level and `followers_num` will be the next.

#### Show only the `username` column (apart from the index).

In [None]:
### We can do this in 1 line:

In [None]:
mp_dataframe.set_index(['party','num_followers'])['username']

### 2.3 Assess the Party Tweeter Power
***

In [None]:
### We group the frame by the party column and aggregate the num_followers column by the sum function

In [None]:
plot_df = mp_dataframe.groupby('party').num_followers.sum()

In [None]:
### Then we plot it by BarPlot

In [None]:
plot_df.plot(kind='bar')

In [None]:
### In order to plot it in seaborn we have to do a trick first
### A good way i thought of was the pivot table method following the reset index so we drop the extra column created by the pivoting
### Then the data are in the format needed by seaborn to be plotted

In [None]:
# Sum only the follower counts per party. Without `values=` every remaining
# column is handed to the aggregation, including the text columns.
# reset_index() turns the 'party' index back into a regular column for seaborn.
pivot_df = mp_dataframe.pivot_table(index='party', values='num_followers', aggfunc='sum').reset_index()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Bar plot of the summed follower counts per party, drawn on a 12x12 inch figure.
plt.figure(figsize=(12,12))
plt.title("Bar plot of the tweeter power of each party")
chart = sns.barplot(x='party', y='num_followers',data=pivot_df)
# Rotate the party names so that long labels do not overlap.
chart.set_xticklabels(chart.get_xticklabels(), rotation=45)
# Override the axis labels seaborn derives from the column names
plt.xlabel("Party")
plt.ylabel("Number of Followers")
plt.show()
plt.close()

## Part 3: Processing Text Content

### 3.1 Prepare a Profanity Set
***

In [None]:
import os
import requests

In [None]:
def download(url):
    """Download ``url`` into the current directory and return the local file name.

    The file is streamed in 1024-byte chunks and named after the part of the
    URL that follows the last '/'.

    Parameters
    ----------
    url : str

    Returns
    -------
    str
        Name of the file that was written.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    file_name = url.split("/")[-1]
    # With stream=True the connection stays open until the body is consumed or
    # the response is closed; the context manager guarantees the latter.
    with requests.get(url, stream=True) as get_response:
        # Fail loudly instead of saving an error page under the target name.
        get_response.raise_for_status()
        with open(file_name, 'wb') as f:
            for chunk in get_response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
    return file_name

In [None]:
file_name = download('http://staffwww.dcs.shef.ac.uk/people/G.Gorrell/publications-materials/abuse-terms.txt')

### Now we have the file abuse-terms.txt in our folder

In [None]:
### Lets filter out the wordlist

In [None]:
import re

In [None]:
# First we open the file
with open(file_name, 'r') as f:
   # Read the file contents and generate a list with each line
   lines = f.readlines()

In [None]:
# Skip the first 24 lines of the file, which are junk (lines[24:] keeps everything from line 25 onwards)

In [None]:
lines = lines[24:]

In [None]:
def bad_word_finder(lines):
    """Extract the set of lower-cased terms found in ``lines``.

    A line contributes a term when it contains a run that starts with a
    letter, continues with word characters, whitespace or '-', and is
    followed by at least one newline or tab.

    Parameters
    ----------
    lines : iterable of str
        Lines as produced by ``readlines()`` (terminators included).

    Returns
    -------
    set of str
        The matched terms, lower-cased, without surrounding whitespace.
    """
    bad_words = set()
    pattern = re.compile(r'[a-zA-Z]+[\s\w-]+[\n\t]+')
    # Iterate each line
    for line in lines:
        # Regex applied to each line
        match = pattern.search(line)
        if match:
            # The match can end in several whitespace characters (e.g. "\t\n").
            # Dropping only the last one left terms such as "word\t" in the set,
            # which can never equal a token from a tweet; strip them all.
            bad_words.add(match.group().strip().lower())
    return bad_words

In [None]:
bad_words = bad_word_finder(lines)

### 3.2 Parse Tweets
***

In [None]:
import pandas as pd

#### Read tweets from <https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv> and rename the `tweet` column to `text`.

In [None]:
url = 'https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv'

In [None]:
data = pd.read_csv(url, usecols=['count','hate_speech','offensive_language','neither','class','tweet'])

In [None]:
data.rename(columns={"tweet": "text"}, inplace=True)

In [None]:
### Convert all of them to lower for avoiding mismatching errors

In [None]:
data['text'] = data.text.str.lower()

#### Create a new column with the list of words of the each text, placed in a Python list.

In [None]:
data['word_list'] = data.text.str.findall(r'[a-zA-Z]+')

In [None]:
### We dont need text column anymore
del data['text']

### 3.3 Count Abuse
***

In [None]:
### Lets find out the number of bad words per entry

In [None]:
data['n_bad_words'] = data['word_list'].apply(set).apply(lambda x: len(x & bad_words))

In [None]:
### And then the bad words themselves in each entry

In [None]:
data['bad_words'] = data['word_list'].apply(set).apply(lambda x: list(x & bad_words))

In [None]:
## We dont need the word lists any more
del data['word_list']

In [None]:
#### We create a statistics dictionary that holds information about each required statistic to be returned
#### This serves as a placeholder to be filled in afterwards

In [None]:
# Placeholder: one entry per class label (0, 1, 2), each holding the summary
# statistics of the bad-word counts; every value starts out as None.
statistics = {
    class_label: dict.fromkeys(('minimum', 'maximum', 'mean', 'median', 'sum'))
    for class_label in range(3)
}

In [None]:
# Fill the placeholder: summarise the bad-word counts of every class.
for class_id, frame in data.groupby(by='class'):
    counts = frame['n_bad_words']
    class_stats = statistics[class_id]
    class_stats['minimum'] = counts.min()
    class_stats['maximum'] = counts.max()
    class_stats['mean'] = counts.mean()
    class_stats['median'] = counts.median()
    class_stats['sum'] = counts.sum()

#### Find the mean, median, minimum, maximum, and sum of bad words in each class.

In [None]:
pd.DataFrame.from_dict(statistics)

### 3.4 Visualize Profanity 
***

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

### Method 1 DistPlot

#### Dist Plot does not support data separation by hue so we have to do it manually

##### Hint for this solution was found here: https://stackoverflow.com/questions/46045750/python-distplot-with-multiple-distributions

In [None]:
# Slice the DataFrame into 3 class based Dataframes
# One sub-frame per class label, because distplot cannot split by hue itself.
class_0_df, class_1_df, class_2_df = (
    data.loc[data['class'] == label] for label in (0, 1, 2)
)

plt.figure(figsize=(12,12))
plt.xlabel("Number of Bad Words")
plt.ylabel("Density")
plt.title("Distibution Plot of Bad Words in Each Class")
# Overlay one density curve per class on the same axes.
for curve_label, class_df in (
    ('Class 0', class_0_df),
    ('Class 1', class_1_df),
    ('Class 2', class_2_df),
):
    sns.distplot(class_df[['n_bad_words']], hist=False, label=curve_label)
plt.legend(ncol=3, fontsize=15)
plt.show()
plt.close()

### Method 2 ViolinPlot

In [None]:
# Violin plot: distribution of the per-tweet bad-word count for every class.
plt.figure(figsize=(12,12))
plt.title("Violin Plot of Bad Words in Each Class")
sns.violinplot(x="class", y="n_bad_words", data=data, hue="class")
# Override the axis labels seaborn derives from the column names.
# The y axis carries the n_bad_words values, not a density.
plt.xlabel("Class")
plt.ylabel("Number of Bad Words")
# Override the default legend size and shape
plt.legend(ncol=3, fontsize=15)
plt.show()
plt.close()

### Method 3 BoxPlot

In [None]:
# Box plot: quartiles and outliers of the per-tweet bad-word count for every class.
plt.figure(figsize=(12,12))
plt.title("Box Plot of Bad Words in Each Class")
sns.boxplot(x="class", y="n_bad_words", data=data, hue="class")
# Override the axis labels seaborn derives from the column names.
# The y axis carries the n_bad_words values, not a density.
plt.xlabel("Class")
plt.ylabel("Number of Bad Words")
# Override the default legend size and shape
plt.legend(ncol=3, fontsize=15)
plt.show()
plt.close()

### Method 4 StripPlot

In [None]:
# Strip plot: every tweet drawn as one point at its bad-word count, per class.
plt.figure(figsize=(12,12))
# The title was copy-pasted from the box plot cell; this figure is a strip plot.
plt.title("Strip Plot of Bad Words in Each Class")
sns.stripplot(x="class", y="n_bad_words", data=data, hue="class")
# Override the axis labels seaborn derives from the column names.
# The y axis carries the n_bad_words values, not a density.
plt.xlabel("Class")
plt.ylabel("Number of Bad Words")
# Override the default legend size and shape
plt.legend(ncol=3, fontsize=15)
plt.show()
plt.close()

### 3.5 Explore Profanity per Class
***

### First we need to reduce the total bad words to 3 lists one for each class

In [None]:
from collections import Counter
import functools
import operator

In [None]:
## Remove rows without bad words
data = data.loc[data['n_bad_words'] > 0]

In [None]:
# Per-class placeholder (labels 0, 1, 2) for the flat list of bad words; None until filled.
total_bad_words = dict.fromkeys(range(3))

In [None]:
# Per-class placeholder (labels 0, 1, 2) for the most common bad words; None until filled.
top_30 = dict.fromkeys((0, 1, 2))

#### Note: Fastest way to convert a list of lists into a single list
#### See: https://stackoverflow.com/a/45323085

In [None]:
def collect_bad_words_for_class(series):
    """Flatten a series of word lists into one list.

    Parameters
    ----------
    series : object with a ``values`` attribute
        ``values`` is an iterable whose elements are themselves iterables of
        words (e.g. a pandas Series of lists).

    Returns
    -------
    list
        All words, in order of appearance, duplicates kept.
    """
    flattened = []
    for word_group in list(series.values):
        # In-place extension, one group at a time.
        flattened += word_group
    return flattened

In [None]:
def find_top_n_words(words,n=30):
    """Return the ``n`` most frequent items of ``words`` with their counts.

    Parameters
    ----------
    words : iterable of hashable items
    n : int, default 30

    Returns
    -------
    list of ``(item, count)`` tuples, most frequent first.
    """
    frequencies = Counter(words)
    return frequencies.most_common(n)

In [None]:
# For every class: gather all of its bad words, then keep the most common ones.
for class_id, frame in data.groupby(by='class'):
    class_words = collect_bad_words_for_class(frame['bad_words'])
    total_bad_words[class_id] = class_words
    top_30[class_id] = find_top_n_words(class_words)

In [None]:
for i in range(0,3):
    print("Top 30 for class: {}".format(i))
    print(top_30[i])
    print("*---------------------*")

### We can see that a lot of these words stem from the same word; maybe we could create a mapping to merge the similar ones and recalculate the top 30

In [None]:
rectify_dict = {
    'nigga'   : 'nigger',
    'niggers' : 'nigger',
    'negro'   : 'nigger',
    'niglet'  : 'nigger',
    'fag'     : 'faggot',
    'bitches' : 'bitch',
    'pussies' : 'pussy',
}

### Apply it to "all words collected" dictionaries and recalculate results

In [None]:
# Replace every variant by its canonical form (words without a mapping are
# kept unchanged), then recompute the most common words of each class.
for i in range(0,3):
    merged_words = [rectify_dict.get(word, word) for word in total_bad_words[i]]
    total_bad_words[i] = merged_words
    top_30[i] = find_top_n_words(merged_words)

In [None]:
for i in range(0,3):
    print("Top 30 for class: {}".format(i))
    print(top_30[i])
    print("*---------------------*")

In [None]:
## Why not create a nice word count blob for visualization ##
## Warning: this part of the exercise needs the library wordcloud to be installed ##
## The following cell installs it from within the notebook if needed ##

In [None]:
try:
    from wordcloud import WordCloud
except:
    import sys
    !{sys.executable} -m pip install wordcloud
    print("Might need to reload Jupyter Kernel to Continue...")

#### Code inspired from https://www.pythoncircle.com/post/689/python-script-16-generating-word-cloud-image-of-a-text-using-python/

In [None]:
from wordcloud import WordCloud

# image configurations
background_color = "#101010"
height = 720
width = 1080

# One word cloud per class, sized by how often each word occurs.
for class_n in range(0,3):
    print("Class: {}".format(class_n))
    # top_30 holds (word, count) pairs; the generator wants a word -> count mapping.
    frequencies = dict(top_30[class_n])
    word_cloud = WordCloud(background_color=background_color, width=width, height=height)
    word_cloud.generate_from_frequencies(frequencies)
    # Display the generated image:
    plt.imshow(word_cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()