In [None]:
import sys 
sys.path.append('..')
from utils.paths import make_dirs, make_dir, POTATOES_LIST, POTATOES_CLEANED_DIR, JSON_DIR_ASSETS, HASHTAGS_PER_USER_DIR, HASHTAG_FREQUENCY_DIR_ALL, CLEANED_DIR, HASHTAG_FREQUENCY_DIR, GREEDY_MODULARITY_SELECTED_DIR, HASHTAG_TOP_DIR, POSTS_DIR_JSON, CLUSTERS_DIR_JSON, HASHTAG_FREQUENCY_DIR_JSON
from utils.utils import load_dataframes, load_dataframe, write_dict_to_csv, write_list_to_csv, write_df_to_csv, read_result_csv
import pandas as pd
import os
from glob import glob
import json
import numpy as np
import math

# NOTE(review): presumably creates the output directories the cells below write
# into (helper imported from utils.paths) — confirm there before relying on it.
make_dirs()

## CSV to JSON
- Convert the processed data to the JSON format required by the interactive web application.

In [None]:
def csv_to_json(fromDir, toDir, quarter):
    """Convert ``<fromDir>/<quarter>.csv`` into ``<toDir>/<quarter>.json``.

    The first CSV column is used as the index, and the frame is serialised
    with ``DataFrame.to_json()``'s default layout.

    Args:
        fromDir: directory containing the source CSV.
        toDir: directory the JSON file is written to.
        quarter: file stem shared by the CSV and the JSON file.
    """
    source_path = os.path.join(fromDir, quarter + ".csv")
    target_path = os.path.join(toDir, quarter + '.json')

    payload = pd.read_csv(source_path, index_col=0).to_json()

    with open(target_path, 'w') as out_file:
        out_file.write(payload)

### Hashtag frequency per district

In [None]:
def frequencies_to_json():
    """Convert the hashtag-frequency CSV of every name in POTATOES_LIST() to JSON.

    Each file is read from HASHTAG_FREQUENCY_DIR and written to
    HASHTAG_FREQUENCY_DIR_JSON under the same stem via ``csv_to_json``.
    """
    for quarter_name in POTATOES_LIST():
        csv_to_json(
            fromDir=HASHTAG_FREQUENCY_DIR,
            toDir=HASHTAG_FREQUENCY_DIR_JSON,
            quarter=quarter_name,
        )

frequencies_to_json()

### Clusters and uniqueness per district

In [None]:
# Parameters that identify the clustering run to export: the selected cluster
# files are read from
# GREEDY_MODULARITY_SELECTED_DIR/min_count_<min_count>_res_<res>/<quarter>/.
min_count = 10
res = '1.8'
# Only the first N hashtags listed in a cluster file get labelled with that cluster.
max_n_hashtags_per_cluster = 15

def checkRow(x, hashtags):
    """Return True if `x` is contained in `hashtags`, else False.

    No longer used by the loop below (replaced by the vectorised `Series.isin`);
    the definition is kept so the name stays available to the rest of the notebook.
    """
    return x in hashtags

path = os.path.join(GREEDY_MODULARITY_SELECTED_DIR, 'min_count_' + str(min_count) + '_res_' + res)

for quarter in POTATOES_LIST():
    path_quarter_clusters = os.path.join(path, quarter)
    files_quarter = sorted(glob(path_quarter_clusters + '/*_.csv'))
    df = load_dataframe(HASHTAG_TOP_DIR, quarter)
    # Remove the row with the highest count_quarter before exporting.
    df = df.drop(df['count_quarter'].idxmax())

    df['hashtag'] = df['hashtag'].astype(str)
    # Hashtags that end up in no exported cluster keep the empty label.
    df['cluster'] = ""

    for file in files_quarter:
        df_cluster = pd.read_csv(file)
        hashtags = list(df_cluster['hashtag'])
        # Cluster files listing one or two hashtags are ignored.
        if len(hashtags) > 2:
            hashtags = hashtags[:max_n_hashtags_per_cluster]
            # A cluster is named after the first hashtag in its file.
            name = hashtags[0]
            mask = df['hashtag'].isin(hashtags)
            # Single .loc assignment instead of the chained df['cluster'][mask] = ...,
            # which raises SettingWithCopyWarning and does not modify `df` at all
            # when pandas Copy-on-Write is active.
            df.loc[mask, 'cluster'] = name

    # Not used below; kept in case other cells read this notebook-level name.
    top_count = np.max(df['count_quarter'])
    df['count'] = df['count_quarter']
    df['uniqueness'] = np.round(df['uniqueness'], decimals=2)
    df['rank'] = np.round(df['rank'], decimals=2)
    # Radius of a circle whose area equals the count, scaled so the largest is 1.
    radius = np.sqrt(df['count'] / math.pi)
    df['radius'] = radius / radius.max()
    df = df[['hashtag', 'count', 'radius', 'uniqueness', 'cluster', 'rank']]

    out_path = os.path.join(CLUSTERS_DIR_JSON, quarter + '.json')
    df.to_json(out_path, orient='records')

### Posts per district

In [None]:
def posts_to_json():
    """Export the non-video posts of every name in POTATOES_LIST() as JSON.

    For each quarter, POTATOES_CLEANED_DIR/<quarter>.csv is read, rows flagged
    as video are removed, and the remaining posts are sorted by likes
    (descending). Their post_url, hashtags and likes columns are written to
    POSTS_DIR_JSON/<quarter>.json.
    """
    for quarter in POTATOES_LIST():
        csv_path = os.path.join(POTATOES_CLEANED_DIR, quarter + ".csv")
        posts = pd.read_csv(csv_path, index_col=0).reset_index(drop=True)

        # A non-boolean column holds the flags as 'True'/'False' strings;
        # map them before casting.
        if posts['is_video'].dtype != bool:
            posts = posts.replace({'is_video': {'True': True, 'False': False}})
        posts['is_video'] = posts['is_video'].astype(bool)

        photos = posts[~posts['is_video']]
        photos = photos.sort_values(by='likes', ascending=False).reset_index(drop=True)
        payload = photos[['post_url', 'hashtags', 'likes']].to_json()

        json_path = os.path.join(POSTS_DIR_JSON, quarter + '.json')
        with open(json_path, 'w') as out_file:
            out_file.write(payload)

posts_to_json()

### Posts whole city

In [None]:
# Posts with fewer likes than this are left out of the city-wide export.
min_likes = 100

def posts_to_json_city():
    """Export the most-liked non-video posts across all quarters to city.json.

    Reads POTATOES_CLEANED_DIR/<quarter>.csv for every name in POTATOES_LIST()
    and removes rows flagged as video. The combined posts are de-duplicated by
    post_url and sorted by likes (descending). Posts whose hashtags field is
    the literal '[]' or that have fewer than `min_likes` likes are dropped.
    The post_url, hashtags and likes columns are written to
    POSTS_DIR_JSON/city.json.
    """
    frames = []
    for quarter in POTATOES_LIST():
        path = os.path.join(POTATOES_CLEANED_DIR, quarter + ".csv")
        df = pd.read_csv(path, index_col=0)
        # A non-boolean column holds the flags as 'True'/'False' strings;
        # map them before casting.
        if df['is_video'].dtype != bool:
            df = df.replace({'is_video': {'True': True, 'False': False}})
        df['is_video'] = df['is_video'].astype(bool)
        frames.append(df[~df['is_video']])

    # One concat instead of DataFrame.append in the loop: append was removed in
    # pandas 2.0 and copied the accumulated frame on every iteration.
    if frames:
        df_all = pd.concat(frames, ignore_index=True)
    else:
        df_all = pd.DataFrame(columns=['post_url', 'hashtags', 'likes', 'is_video'])

    df_all = (
        df_all
        # De-duplicate the combined frame. The original de-duplicated only the
        # last quarter's `df` and discarded the result. keep='first' retains
        # the copy from the earliest quarter in POTATOES_LIST() order.
        .drop_duplicates(subset='post_url', keep="first")
        .astype({'likes': int})
        .sort_values(by='likes', ascending=False)
        .reset_index(drop=True)
    )

    # Filtering after reset_index keeps the post-sort positions as JSON keys.
    has_hashtags = df_all['hashtags'] != '[]'
    enough_likes = ~(df_all['likes'] < min_likes)
    df_all = df_all.loc[has_hashtags & enough_likes, ['post_url', 'hashtags', 'likes']]
    result = df_all.to_json()

    with open(os.path.join(POSTS_DIR_JSON, 'city.json'), 'w') as f:
        f.write(result)

posts_to_json_city()

### Stats per district

In [None]:
def potato_list_count_users_stories(min_count=10, res='1.8'):
    """Write per-quarter post, user and story counts to posts_users_stories.json.

    Starts from the 'post_counts' table in CLEANED_DIR and fills in two counts
    for every name in POTATOES_LIST():

    * stories: the number of '*_.csv' cluster files found in
      GREEDY_MODULARITY_SELECTED_DIR/min_count_<min_count>_res_<res>/<quarter>/
    * users: the number of rows in that quarter's HASHTAGS_PER_USER_DIR table

    The quarter, post_count (copied from post_count_cleaned), users and stories
    columns are written as a list of records to JSON_DIR_ASSETS.

    Args:
        min_count: `min_count` component of the cluster directory name.
        res: `res` component of the cluster directory name.
    """
    df_post_count = load_dataframe(CLEANED_DIR, 'post_counts')

    df_post_count['stories'] = 0
    df_post_count['users'] = 0

    path_to_clusters = os.path.join(
        GREEDY_MODULARITY_SELECTED_DIR,
        'min_count_' + str(min_count) + '_res_' + str(res),
    )

    for quarter in POTATOES_LIST():
        path_quarter_clusters = os.path.join(path_to_clusters, quarter)
        files_quarter = sorted(glob(path_quarter_clusters + '/*_.csv'))

        stories_count = len(files_quarter)
        df_users = load_dataframe(HASHTAGS_PER_USER_DIR, quarter)

        users_count = len(df_users.hashtags)
        print(users_count)

        # Single .loc assignment instead of the chained df['col'][index] = ...,
        # which raises SettingWithCopyWarning and does not modify the frame at
        # all when pandas Copy-on-Write is active.
        is_quarter = df_post_count['quarter'] == quarter
        df_post_count.loc[is_quarter, 'stories'] = stories_count
        df_post_count.loc[is_quarter, 'users'] = users_count

    df_post_count['post_count'] = df_post_count['post_count_cleaned']

    df_post_count = df_post_count[['quarter', 'post_count', 'users', 'stories']]
    df_post_count.to_json(os.path.join(JSON_DIR_ASSETS, 'posts_users_stories.json'), orient='records')

potato_list_count_users_stories()

## Top hashtags city

In [None]:
# Hashtags with a city-wide count below this are left out of the export.
# NOTE: this rebinds the notebook-level `min_count` that the clusters cell sets
# to 10; re-running that cell after this one without redefining it would pick up 100.
min_count = 100

def checkRow(x, hashtags):
    """Return True if `x` is contained in `hashtags`, else False.

    No longer used by `count_all_quarters_to_json` (replaced by `Series.isin`);
    the definition is kept so the name stays available to the rest of the notebook.
    """
    return x in hashtags

def count_all_quarters_to_json():
    """Write the city-wide hashtag list to JSON_DIR_ASSETS/top_hashtags_city.json.

    Loads 'count_all_quarters' from HASHTAG_FREQUENCY_DIR_ALL and drops
    hashtags whose count is below `min_count`. Hashtags equal to a name in
    POTATOES_LIST() are moved to the end of the list. Only the hashtag column
    is exported, as a list of records.
    """
    df = load_dataframe(HASHTAG_FREQUENCY_DIR_ALL, 'count_all_quarters')
    # Written as a negated '<' so rows that do not compare as below the
    # threshold are kept, exactly as the original drop did.
    df = df[~(df['count'] < min_count)]

    is_potato = df['hashtag'].isin(list(POTATOES_LIST()))
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat([df[~is_potato], df[is_potato]])

    df = df[['hashtag']]
    df.to_json(os.path.join(JSON_DIR_ASSETS, 'top_hashtags_city.json'), orient='records')

count_all_quarters_to_json()