In [None]:
#%%

#!/usr/bin/env python
# coding: utf-8
import datetime
import pandas as pd
import numpy as np
import networkx as nx
pd.options.display.max_columns = None
pd.options.display.max_rows = None
from tqdm import tqdm
tqdm.pandas()
from scipy import sparse
import csv
import pickle
from collections import Counter, OrderedDict
import os
import matplotlib.pyplot as plt
import seaborn as sns
import regex as re
import statistics
import math
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy import stats


In [None]:

def _mean_or_nan(values):
    """Return the arithmetic mean of *values*, or NaN when *values* is empty."""
    return statistics.mean(values) if values else math.nan


def _stdev_or_nan(values):
    """Return the sample standard deviation, or NaN with fewer than two values."""
    return statistics.stdev(values) if len(values) > 1 else math.nan


def calculate_stats(new_df, categoryname,textname,split_char=' '):
    """Compute token-count statistics of a text column, overall and per category.

    Each text has every run of whitespace collapsed to a single space and is
    then split on ``split_char``; the number of resulting pieces is its
    length.  Leading or trailing whitespace therefore contributes an empty
    piece to the count (``' a b '`` has length 4 with the default separator).

    Args:
        new_df: DataFrame holding the category and text columns.
        categoryname: Name of the column holding the category of each row.
        textname: Name of the column holding the text of each row; values
            must be strings.
        split_char: Separator passed to ``str.split``.

    Returns:
        A dict with two entries:

        * ``'global'``: dict with ``'mean'``, ``'stdev'`` and ``'lengths'``
          (the list of all lengths, in row order).
        * ``'per_category'``: DataFrame indexed by category with columns
          ``'lengths'`` (list of lengths), ``'mean'`` and ``'stdev'``.

        A mean with no samples and a standard deviation with fewer than two
        samples are reported as NaN instead of raising
        ``statistics.StatisticsError``.
    """
    categories = new_df[categoryname].unique()
    lengths_by_category = {c: [] for c in categories}
    all_lengths = []

    # Compile once instead of re-resolving the pattern on every row.
    whitespace_run = re.compile(r"\s+")

    # Only two columns are needed, so walk them directly rather than paying
    # for a full row Series per record with iterrows().
    for category, text in zip(new_df[categoryname], new_df[textname]):
        normalized = whitespace_run.sub(' ', text)
        n_tokens = len(normalized.split(split_char))

        all_lengths.append(n_tokens)
        lengths_by_category[category].append(n_tokens)

    per_category = {
        'lengths': lengths_by_category,
        'mean': {c: _mean_or_nan(v) for c, v in lengths_by_category.items()},
        'stdev': {c: _stdev_or_nan(v) for c, v in lengths_by_category.items()},
    }

    global_stats = {
        'mean': _mean_or_nan(all_lengths),
        'stdev': _stdev_or_nan(all_lengths),
        'lengths': all_lengths,
    }

    return {
        'global': global_stats,
        'per_category': pd.DataFrame(per_category),
    }


def display_lengths_histograms(df_stats, categoryname,n_cols=4):
    """Plot a histogram of lengths for all rows and one per category.

    The first panel (red) shows ``df_stats['global']['lengths']``; each
    following panel (blue) shows ``df_stats['per_category']['lengths'][c]``
    for one category ``c``.

    Args:
        df_stats: Mapping with a ``'global'`` entry exposing ``'lengths'`` and
            a ``'per_category'`` entry whose ``'lengths'`` member maps each
            category to its list of lengths (the structure returned by
            ``calculate_stats``).
        categoryname: Unused; kept so existing callers keep working.  The
            categories are read from ``df_stats`` itself rather than from a
            module-level DataFrame.
        n_cols: Number of panels per row in the subplot grid.
    """
    lengths_by_category = df_stats['per_category']['lengths']
    # .keys() yields the category labels for both a pandas Series and a dict.
    categories = list(lengths_by_category.keys())

    # One extra panel is needed for the "All categories" histogram; without
    # it the grid is too small whenever len(categories) is a multiple of
    # n_cols and plt.subplot rejects the last index.
    n_rows = math.ceil((len(categories) + 1) / n_cols)

    plt.figure(figsize=(15, 8))
    plt.suptitle('Distribution of lengths')

    # Panel 1: every length regardless of category.
    plt.subplot(n_rows, n_cols, 1)
    plt.title('All categories')
    plt.hist(df_stats['global']['lengths'], color='r')

    # Panels 2..N: one histogram per category.
    for index_subplot, c in enumerate(categories, start=2):
        plt.subplot(n_rows, n_cols, index_subplot)
        plt.title('Category: %s' % c)
        plt.hist(lengths_by_category[c], color='b')

    plt.show()
    
def plot_categories(df, columnname):
    """Draw a bar chart of value frequencies in one column and return the counts.

    Values of ``df[[columnname]]`` are counted and ordered from most to least
    frequent.  The chart is drawn on figure 1, shown, and the ordered counts
    are printed.

    Args:
        df: DataFrame containing the column to summarize.
        columnname: Name of the column whose values are counted.

    Returns:
        An ``OrderedDict`` mapping each distinct value to its count, in
        descending order of count.
    """
    flat_values = df[[columnname]].values.reshape(-1)
    ranked = OrderedDict(Counter(flat_values).most_common())

    labels = ranked.keys()
    heights = ranked.values()
    positions = np.arange(len(labels))

    plt.figure(1, figsize=(10, 5))
    plt.bar(positions, heights, align='center', alpha=0.5)
    plt.xticks(positions, labels)
    plt.title('Distribution of tweets per language')
    plt.gca().yaxis.grid(True)
    plt.show()

    print(ranked)
    return ranked


def print_info_df(df_final_in):
    """
    Print summary counts for a retweet DataFrame.

    Prints, in order, the number of rows left after de-duplicating on
    ``retweet_user``, on ``org_user`` and on ``org_id``, followed by the
    total number of rows.
    """
    # (printed label, column to de-duplicate on), in output order.
    summary_lines = (
        ('number of retweet users', 'retweet_user'),
        ('number of orignal users', 'org_user'),
        ('number of orignal tweets', 'org_id'),
    )
    for label, column in summary_lines:
        deduplicated = df_final_in.drop_duplicates(subset=[column], keep='last')
        print(label, len(deduplicated))

    print('number of tweets', len(df_final_in))
    
def swb_calucation(df):
    """Compute an SWB score per user from sentiment label counts.

    For each user, with ``P``, ``N`` and ``Ne`` the number of rows labelled
    ``'Positive'``, ``'Negative'`` and ``'Neutral'`` in ``new_senti_label``::

        SWB = ((P - N) / (P + N)) * sqrt((P + N) / (P + N + Ne))

    Args:
        df: DataFrame with a ``user_id`` column and a ``new_senti_label``
            column.

    Returns:
        A dict mapping ``str(user_id)`` to the user's score (a float), in
        order of first appearance of each user.  A user with no Positive and
        no Negative rows gets ``0.0``: the ratio is undefined there, and the
        formula would otherwise divide by zero.
    """
    swb_dict = {}
    # Group once instead of re-filtering the whole frame for every row;
    # sort=False keeps users in order of first appearance.
    for user_id, labels in df.groupby('user_id', sort=False)['new_senti_label']:
        positive = int((labels == 'Positive').sum())
        negative = int((labels == 'Negative').sum())
        neutral = int((labels == 'Neutral').sum())

        polar = positive + negative
        if polar == 0:
            score = 0.0
        else:
            # The radicand is a non-negative ratio, so the real-valued
            # math.sqrt is sufficient.
            score = ((positive - negative) / polar) * math.sqrt(polar / (polar + neutral))

        swb_dict[str(user_id)] = score
    return swb_dict

def read_file(path,file_name):
    """Load a pickled tweet DataFrame and keep one row per original tweet.

    The file read is ``path + 'cleaned-' + file_name`` (plain string
    concatenation, so ``path`` must already end with a separator if one is
    needed).  ``text_id``, ``retweet_id``, ``retweet_user_id`` and
    ``retweet_text`` are converted to ``str``.  Two columns are then added:

    * ``final_text``: ``text`` when ``retweet_status == '0'``,
      ``retweet_text`` when ``retweet_status == '1'``.
    * ``org_id``: ``text_id`` when ``retweet_status == '0'``,
      ``retweet_id`` when ``retweet_status == '1'``.

    Rows are de-duplicated on ``org_id`` (keeping the last occurrence), the
    index is reset, and the resulting row count is printed.

    Args:
        path: Prefix of the file location.
        file_name: File name without the ``'cleaned-'`` prefix.

    Returns:
        The de-duplicated DataFrame.

    Raises:
        ValueError: If ``retweet_status`` contains anything other than the
            strings ``'0'`` and ``'1'``.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # from a trusted source.
    df = pd.read_pickle(path + 'cleaned-'+file_name)

    for column in ('text_id', 'retweet_id', 'retweet_user_id', 'retweet_text'):
        df[column] = df[column].map(str)

    status = df['retweet_status']
    is_retweet = (status == '1').to_numpy()
    is_original = (status == '0').to_numpy()

    # Every row must be classified, otherwise the derived columns would be
    # silently wrong; fail with the offending values instead.
    unexpected = ~(is_retweet | is_original)
    if unexpected.any():
        raise ValueError(
            "retweet_status must be '0' or '1'; found %r"
            % (status[unexpected].unique().tolist(),)
        )

    # Positional, vectorized selection instead of two per-row iloc loops.
    df['final_text'] = np.where(
        is_retweet, df['retweet_text'].to_numpy(), df['text'].to_numpy()
    )
    df['org_id'] = np.where(
        is_retweet, df['retweet_id'].to_numpy(), df['text_id'].to_numpy()
    )

    df = df.drop_duplicates(subset = ['org_id'], keep='last').reset_index(drop=True)
    print(len(df))

    return df
