In [1]:
import os
import pandas as pd
import copy
import numpy as np
import holoviews.plotting.mpl
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import re
import hvplot.pandas
from holoviews import opts
import holoviews as hv
import warnings

warnings.filterwarnings('ignore')
hv.extension('bokeh')

# Possible Users Measurement
\begin{equation}
\frac{n+1}{n}\cdot\sum_{i=1}^{n}\frac{1}{i\cdot(i+1)}\cdot\log_{10}^{2}\left(\frac{y_{i}}{a}\cdot i^{\alpha}\right)
\end{equation}


# read file methods

In [2]:
# Take a dataframe with the data and transform it to numeric:
# (object (hashtag/user/node), counter (must be an integer))
def only_numeric(df):
    """Keep the rows whose ``amount`` is a whole number and cast it to int.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``amount`` column holding counts, possibly as text.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) restricted to the rows
        whose stripped ``amount`` consists of decimal digits only, with
        ``amount`` stored as an integer column.
    """
    # Work on the stripped text form so values such as ' 7 ' are accepted.
    amount_text = df.amount.astype(str).str.strip()
    # isdecimal() rejects characters such as '½' or '²', which isnumeric()
    # accepts but int() cannot parse.
    is_count = amount_text.str.isdecimal()
    # Copy before assigning: writing into a filtered view is what left the
    # column un-cast (and raised SettingWithCopyWarning).
    result = df[is_count].copy()
    result['amount'] = amount_text[is_count].astype(int).to_numpy()
    return result

# transformation to dataframe
def normal_open(text):
    """Read a ``<amount> <text>`` file, one record per line, into a DataFrame.

    Parameters
    ----------
    text : str
        Path of a UTF-8 text file. Each line is split once on the first
        space into an ``amount`` and a ``text`` field.

    Returns
    -------
    pandas.DataFrame
        Columns ``amount`` and ``text``, filtered by ``only_numeric``.
    """
    # The context manager closes the handle even if reading fails.
    with open(text, 'r', encoding='utf-8') as handle:
        lines = handle.read().split('\n')
    # Pad single-token lines with None so every row has exactly two fields;
    # otherwise a file without any two-token line breaks the constructor.
    rows = [(line.strip().split(' ', 1) + [None])[:2] for line in lines]
    df = pd.DataFrame(rows, columns=['amount', 'text'])
    return only_numeric(df)

# transforms to dataframe ----- main function ---
def txt2df(text):
    """Load a count file into a DataFrame, trying several file layouts.

    The file is first read as tab-separated. If that yields fewer than two
    columns it is handed to ``normal_open``; when that fails the file is read
    as space-separated and the occurrences of its first column are counted.
    Two specific files (matched by their full path) get dedicated handling.
    Any other failure falls back to ``normal_open``.

    Parameters
    ----------
    text : str
        Path of the file to load.

    Returns
    -------
    pandas.DataFrame
        A frame with an ``amount`` and a ``text`` column.
    """
    try:
        tbl = pd.read_csv(text, sep='\t')
        if len(tbl.columns) < 2:
            try:
                return normal_open(text) # normal open
            except Exception:
                if text == 'D:\\graphs\\project\\disasters_vs_politics\\disasters\\dorian\\dorian-media.txt':
                    tbl = pd.read_csv(text, header=None, sep=' ', names=['amount', 'text'])
                    # Name the columns positionally, as the sibling branches
                    # do: rename() without `columns=` targets index labels,
                    # and the names produced by value_counts().reset_index()
                    # differ between pandas versions.
                    tbl = tbl.amount.value_counts().reset_index()
                    tbl.columns = ['text', 'amount']
                    return tbl
                else:
                    tbl = pd.read_csv(text, sep=' ', header=None, names=['id', 'text']).id.value_counts().reset_index()
                    tbl.columns = ['text', 'amount']
                    return tbl
        else:
            if text == 'D:\\graphs\\project\\disasters_vs_politics\\disasters\\florence\\HurricaneFlorence-media.txt':
                tbl = pd.read_csv(text, sep='\t', names=['id', 'text']).id.value_counts().reset_index()
                tbl.columns = ['text', 'amount']
                return tbl
            else:
                return pd.read_csv(text, sep='\t', names=['amount', 'text'])
    except Exception:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of triggering another read.
        return normal_open(text)

# Hashtag Distributions Disasters

# disasters twitter data from 
https://digital.library.unt.edu/ark:/67531/metadc1706014/m1/ - dorian

https://digital.library.unt.edu/ark:/67531/metadc1259406/m1/?q=twitter - HurricaneFlorence

https://digital.library.unt.edu/ark:/67531/metadc993940/m1/?q=twitter - HurricaneHarvey

https://digital.library.unt.edu/ark:/67531/metadc1706012/m1/?q=twitter - imelda

https://digital.library.unt.edu/ark:/67531/metadc1477117/m1/?q=tweets - notre-dame-fire

https://digital.library.unt.edu/ark:/67531/metadc991469/m1/?q=tweets - dallas_police_shooting

In [5]:
# disasters txt data files.
# Hashtag-count file of each disaster event, keyed by event name.
# Paths are built with os.path.join so they resolve on any operating system,
# not only where '\\' is the path separator.
paths_disasters_hashtags = {
    event: os.path.join(os.getcwd(), 'disasters', filename)
    for event, filename in (
        ('dallas_police_shooting', 'dallas_police_shooting_hashtags.txt'),
        ('dorian', 'dorian-hashtags.txt'),
        ('florence', 'HurricaneFlorence-hashtags.txt'),
        ('harvey', 'HurricaneHarvey_hashtags.txt'),
        ('imelda', 'imelda-hashtags.txt'),
        ('notre_dam_fire', 'notre-dame-fire-hashtags.txt'),
    )
}

In [6]:
def pre_processing_simple(filename):
    """Load a count file and expose its index, shifted by one, as a column.

    The file is parsed with ``txt2df``; the resulting index is moved into a
    leading ``hashtag_index`` column whose values are the old index plus one.
    """
    indexed = txt2df(filename).reset_index().rename(columns={'index': 'hashtag_index'})
    return indexed.assign(hashtag_index=indexed.hashtag_index + 1)


def preprocessing_complex(filename):
    """Load a count file and rank its rows by descending ``amount``.

    The file is parsed with ``txt2df``, rows before index label 1 are
    dropped, ``amount`` is cast to int, and the rows are sorted from the
    largest to the smallest amount. The ``text`` column is then overwritten
    with the ranks 1..n and renamed to ``hashtag_index``.
    """
    counts = txt2df(filename).loc[1:]
    counts = counts.assign(amount=counts.amount.astype(int))
    ranked = counts.sort_values(by=['amount'], ascending=False)
    # Positional assignment: the list is laid over the rows in sorted order.
    ranked['text'] = list(range(1, ranked.shape[0] + 1))
    return ranked.rename(columns={'text': 'hashtag_index'})


def _fit_and_plot_ranks(name, df, bins, axis_limits):
    """Score and plot one rank/amount table (shared by the disaster plots).

    Parameters
    ----------
    name : str
        Event name; used as plot title and in the printed score line.
    df : pandas.DataFrame
        Needs ``hashtag_index`` and ``amount`` columns. The columns
        ``weights`` and ``linear_fit`` are added to it.
    bins : int
        Number of equal-width bins over log10(hashtag_index) used for the
        fit that feeds the score.
    axis_limits : tuple
        ``(low, high)`` applied to both axes of the scatter plot.

    Returns
    -------
    The scatter plot overlaid with the fitted line.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    n = df.shape[0]
    if n == 0:
        raise ValueError(f'no hashtag counts available for {name!r}')

    # Float copies keep the arithmetic below working even when a column is
    # stored with object dtype.
    rank = df.hashtag_index.astype(float)
    amount = df.amount.astype(float)
    logx = np.log10(rank)
    logy = np.log10(amount)

    # Average the points inside each log-spaced bin. observed=True leaves out
    # empty bins, whose NaN means would otherwise be fed to polyfit.
    binned = pd.DataFrame({'bin': pd.cut(logx, bins=bins), 'logx': logx, 'logy': logy})
    bin_means = binned.groupby('bin', observed=True)[['logx', 'logy']].mean()
    slope_binned, intercept_binned = np.polyfit(
        bin_means.logx.to_numpy(), bin_means.logy.to_numpy(), deg=1)

    df['weights'] = ((n + 1) / n) / ((df.hashtag_index) * (df.hashtag_index + 1))

    # NOTE(review): for data lying exactly on the fitted line this expression
    # equals 2*slope*log10(rank), not 0; a residual would use
    # rank ** (-slope_binned). Left as is because the recorded scores were
    # produced with this form - confirm the intended sign.
    log_ratio = np.log10((rank ** slope_binned) * amount / (10 ** intercept_binned))
    pola_score = (df.weights * log_ratio ** 2).sum(skipna=False)
    print('PLMSE Score: ', round(pola_score, 3), 'disaster: ', name)

    # NOTE(review): the plotted line and legend slope come from a fit on all
    # points, while the score above uses the binned fit - confirm intended.
    slope, intercept = np.polyfit(logx, logy, deg=1)
    df['linear_fit'] = 10 ** (slope * logx + intercept)

    scattered_data_plot = df.hvplot.scatter(x='hashtag_index',
                                            y=['amount'],
                                            loglog=True,
                                            title=name,
                                            label='slope: ' + str(round(slope, 3)),
                                            xlim=axis_limits,
                                            ylim=axis_limits,
                                            legend='top_right')
    line_fit = df.hvplot.line(x='hashtag_index', y=['linear_fit'], loglog=True)

    return scattered_data_plot * line_fit


def plot_regular(name, bins=25):
    """Plot a disaster's hashtag distribution, using the file's row order as rank.

    Parameters
    ----------
    name : str
        Key of ``paths_disasters_hashtags``.
    bins : int, default 25
        Number of log-spaced bins for the fit that feeds the score.

    Returns
    -------
    Log-log scatter of amount against hashtag index with the fitted line;
    the PLMSE score is printed as a side effect.
    """
    df = pre_processing_simple(paths_disasters_hashtags.get(name))
    return _fit_and_plot_ranks(name, df, bins, (1, 10**7))


def plot_complex(name, bins = 20):
    """Plot a disaster's hashtag distribution after ranking rows by amount.

    Parameters
    ----------
    name : str
        Key of ``paths_disasters_hashtags``.
    bins : int, default 20
        Number of log-spaced bins for the fit that feeds the score.

    Returns
    -------
    Log-log scatter of amount against hashtag index with the fitted line;
    the PLMSE score is printed as a side effect.
    """
    df = preprocessing_complex(paths_disasters_hashtags.get(name))
    return _fit_and_plot_ranks(name, df, bins, (0.1, 10**7))

In [7]:
# Plot the hashtag distribution of one disaster event (fig. 2 for disasters).
p = plot_complex('florence') 
# Another event can be plotted instead, e.g.: p = plot_regular('dorian')
p # if the image seems a bit off, just drag the image to the right.

PLMSE Score:  2.062 disaster:  florence


In [10]:
# Build the figure of every disaster event and save it with fmt='png' under
# the event's name. 'dorian' is plotted with plot_regular, all other events
# with plot_complex.
for plot_fn, event in [
    (plot_regular, 'dorian'),
    (plot_complex, 'dallas_police_shooting'),
    (plot_complex, 'florence'),
    (plot_complex, 'harvey'),
    (plot_complex, 'imelda'),
    (plot_complex, 'notre_dam_fire'),
]:
    p = plot_fn(event)
    # The toolbar is removed before the figure is written out.
    hv.renderer('bokeh').save(p.options(toolbar=None), event, fmt='png')

2.237 dorian disaster
2.291 dallas_police_shooting disaster
2.062 florence disaster
2.291 harvey disaster
2.056 imelda disaster
1.689 notre_dam_fire disaster


#  Politics

In [42]:
# Tweet file of each political event, keyed by event name.
# Paths are built with os.path.join so they resolve on any operating system,
# not only where '\\' is the path separator.
paths_politics = {
    event: os.path.join(os.getcwd(), 'politics', filename)
    for event, filename in (
        ('usa_2016', '2016_US_election_tweets_100k.csv'),  # Data is found on the git repo
        ('bangladesh_2019', 'bangladesh_linked_tweets_csv_2019.csv'),  # Data is found on the git repo
        # NOTE(review): the key says 2006 but the file name contains 201906 - confirm.
        ('catalonia_2006', 'catalonia_201906_1_tweets_csv_hashed.csv'),  # Data is found on the git repo
        ('china_2019_1', 'china_082019_1_tweets_csv_election.csv'),  # Data is not on the git due to file size.
        ('china_2019_2', 'china_082019_2_tweets_csv_election.csv'),  # Data is not on the git due to file size.
        ('ecuador_2019', 'ecuador_082019_tweets_csv_hashed.csv'),  # Data is not on the git due to file size.
        ('egypt_2019', 'egypt_uae_082019_tweets_csv_hashed.csv'),  # Data is not on the git due to file size.
        # NOTE(review): the usa_2011* keys point at elections_202011 files - confirm the year.
        ('usa_2011_1', 'elections_202011_p01.csv'),  # Data is not on the git due to file size.
        ('usa_2011_2', 'elections_202011_p02.csv'),  # Data is not on the git due to file size.
        ('iran_2018', 'iranian_tweets_csv_election2018.csv'),  # Data is not on the git due to file size.
        ('russia_2019', 'russian_linked_tweets_csv_2019.csv'),  # Data is not on the git due to file size.
        ('usa_2011', 'elections_202011_prt1prt2.csv'),  # Data is not on the git due to file size.
    )
}

# political twitter data from 
https://www.kaggle.com/paultimothymooney/twitter-election-data-archives

# Hashtag distributions politics

In [44]:
def get_hashtags_count(df):
    """Count, for every hashtag, the distinct texts it appears in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column holds the tweet text. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``hashtag`` and ``amount``, ordered by descending amount.
        A hashtag repeated inside one text, or texts that are exact
        duplicates of each other, are counted once. The frame is empty when
        no text contains a hashtag.
    """
    # str() the values so missing or non-text cells simply yield no hashtags.
    texts = df.iloc[:, 0].astype(str).reset_index(drop=True)
    pairs = (
        pd.DataFrame({'text': texts, 'hashtag': texts.str.findall(r"#(\w+)")})
        .explode('hashtag')            # one row per (text, hashtag)
        .dropna(subset=['hashtag'])    # texts without any hashtag
        .drop_duplicates()
    )
    counts = pairs['hashtag'].value_counts()
    # Build the result explicitly: the column names produced by
    # value_counts().reset_index() differ between pandas versions.
    return pd.DataFrame({'hashtag': counts.index, 'amount': counts.to_numpy()})

def get_politic_df(file):
    """Read the tweet-text column of a CSV file and count its hashtags.

    The text column is looked up under a list of known header names; the
    first one that can be read is used.

    Parameters
    ----------
    file : str
        Path of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The result of ``get_hashtags_count`` for the text column.

    Raises
    ------
    ValueError
        If none of the known column names could be read from ``file``.
    FileNotFoundError
        If ``file`` does not exist.
    """
    col_names_lst = ['Text', 'text', 'tweet', 'full_text', 'tweet_text', 'Tweet Text', 'Tweet_Text', 'Orig_Tweet', 'Tweet']
    last_error = None
    for column in col_names_lst:
        try:
            df = pd.read_csv(file, usecols=[column])
        except ValueError as error:
            # read_csv raises ValueError when `column` is not in the header;
            # other failures (e.g. a missing file) are no longer hidden.
            last_error = error
            continue
        # Counting runs outside the try so its own errors surface.
        return get_hashtags_count(df)
    raise ValueError(
        f'could not read a tweet-text column from {file!r}; tried: {col_names_lst}'
    ) from last_error
            
def plot_politics(name, bins=20):
    """Plot a political event's hashtag distribution and print its PLMSE score.

    Parameters
    ----------
    name : str
        Key of ``paths_politics``.
    bins : int, default 20
        Number of equal-width bins over log10(hashtag_index) used for the fit.

    Returns
    -------
    Log-log scatter of amount against hashtag index overlaid with the
    fitted line.

    Raises
    ------
    ValueError
        If no hashtag counts could be obtained for ``name``.
    """
    df = get_politic_df(paths_politics.get(name))
    if df is None or df.shape[0] == 0:
        raise ValueError(f'no hashtag counts available for {name!r}')
    n = df.shape[0]
    df['hashtag_index'] = list(range(1, n + 1))

    # Float copies keep the arithmetic below working even when a column is
    # stored with object dtype.
    rank = df.hashtag_index.astype(float)
    amount = df.amount.astype(float)
    logx = np.log10(rank)
    logy = np.log10(amount)

    # Average the points inside each log-spaced bin. observed=True leaves out
    # empty bins, whose NaN means would otherwise be fed to polyfit.
    binned = pd.DataFrame({'bin': pd.cut(logx, bins=bins), 'logx': logx, 'logy': logy})
    bin_means = binned.groupby('bin', observed=True)[['logx', 'logy']].mean()
    # One fit serves the score, the plotted line and the legend.
    slope, intercept = np.polyfit(
        bin_means.logx.to_numpy(), bin_means.logy.to_numpy(), deg=1)
    df['linear_fit'] = 10 ** (slope * logx + intercept)

    df['weights'] = ((n + 1) / n) / ((df.hashtag_index) * (df.hashtag_index + 1))

    # NOTE(review): for data lying exactly on the fitted line this expression
    # equals 2*slope*log10(rank), not 0; a residual would use rank ** (-slope).
    # Left as is because the recorded scores were produced with this form -
    # confirm the intended sign.
    log_ratio = np.log10((rank ** slope) * amount / (10 ** intercept))
    pola_score = (df.weights * log_ratio ** 2).sum(skipna=False)
    print('PLMSE Score: ', round(pola_score, 3), 'politics: ', name)

    scattered_data = df.hvplot.scatter(x='hashtag_index',
                                       y=['amount'],
                                       loglog=True,
                                       title=name,
                                       label='slope: ' + str(round(slope, 3)),
                                       xlim=(0.1, 10**7),
                                       ylim=(0.1, 10**7),
                                       legend='top_right')
    line_fit = df.hvplot.line(x='hashtag_index', y=['linear_fit'], loglog=True)
    return scattered_data * line_fit


In [55]:
# Plot the hashtag distribution of one political event; the bare `p` on the
# last line makes the notebook display the figure.
p = plot_politics('usa_2016', bins=20) 
p

PLMSE Score:  1.012 politics:  usa_2016


In [None]:
# Build the figure of every political event and save it with fmt='png' under
# the event's name. Events listed with an empty dict use plot_politics'
# default arguments; the others override the number of bins.
for event, plot_kwargs in [
    ('usa_2016', {}),
    ('bangladesh_2019', {'bins': 15}),
    ('catalonia_2006', {'bins': 15}),
    ('china_2019_1', {}),
    ('china_2019_2', {}),
    ('ecuador_2019', {}),
    ('egypt_2019', {}),
    ('usa_2011_1', {}),
    ('usa_2011_2', {}),
    ('iran_2018', {}),
    ('russia_2019', {}),
    ('usa_2011', {}),
]:
    p = plot_politics(event, **plot_kwargs)
    # The toolbar is removed before the figure is written out.
    hv.renderer('bokeh').save(p.options(toolbar=None), event, fmt='png')

1.012 usa_2016 disaster
1.378 bangladesh_2019 disaster
0.728 catalonia_2006 disaster
1.883 china_2019_1 disaster
1.214 china_2019_2 disaster
1.071 ecuador_2019 disaster
1.529 egypt_2019 disaster
1.92 usa_2011_1 disaster
2.041 usa_2011_2 disaster
1.118 iran_2018 disaster


In [21]:
# Scores copied from the printed output of the plotting cells, stored as
# (pola_score, event, type) rows: political events first, then disasters.
pola_lst = [
    (score, event, 'politics')
    for event, score in (
        ('usa_2016', 1.012),
        ('bangladesh_2019', 1.378),
        ('catalonia_2006', 0.728),
        ('china_2019_1', 1.883),
        ('china_2019_2', 1.214),
        ('ecuador_2019', 1.071),
        ('egypt_2019', 1.529),
        ('usa_2011_1', 1.92),
        ('usa_2011_2', 2.041),
        ('iran_2018', 1.118),
    )
] + [
    (score, event, 'disaster')
    for event, score in (
        ('dorian', 2.237),
        ('dallas_police_shooting', 2.291),
        ('florence', 2.062),
        ('harvey', 2.291),
        ('imelda', 2.056),
        ('notre_dam_fire', 1.689),
    )
]

# One row per event, ready for plotting.
dd = pd.DataFrame(data=pola_lst, columns=['pola_score', 'event', 'type'])

In [22]:
# Scatter of the score of every event, grouped by event type.
dd.hvplot.scatter(
    x='event',
    y='pola_score',
    by='type',
    title='POLA-WSE-Hashtags',
    legend='left',
    height=400,
    width=400,
    rot=90,
)