In [5]:
import sys 
sys.path.append('..')
from utils.paths import make_dirs, QUARTERS_DICT, HASHTAGS_PER_USER_DIR, HASHTAG_FREQUENCY_DIR, HASHTAG_FREQUENCY_DIR_ALL, HASHTAG_TOP_DIR
from utils.utils import load_dataframes, load_dataframe, write_dict_to_csv, write_list_to_csv, write_df_to_csv
import pandas as pd
import os
from nltk.probability import FreqDist
import numpy as np
import math

# Set up the project's output directories before anything is written.
# NOTE(review): presumably creates the *_DIR folders if they are missing — confirm in utils/paths.py.
make_dirs()

# Threshold applied to the hashtag frequency tables below: hashtags whose count
# (one vote per user) is below this value are dropped.
min_count = 10

In [2]:
def group_by_user(df):
    """Collapse a post-level frame into one row per user.

    Parameters
    ----------
    df : pd.DataFrame
        One row per post. Must contain the columns ``owner_id``, ``hashtags``
        (an iterable of hashtag strings for each post) and ``shortcode``.

    Returns
    -------
    pd.DataFrame
        Indexed by ``owner_id`` and sorted by ``post_count`` in descending
        order (ties keep their group order), with the columns

        - ``hashtags``: the distinct hashtags of all posts of the user, in
          order of first appearance (compared case-sensitively);
        - ``post_count``: the number of non-null ``shortcode`` values.
    """
    def _distinct_hashtags(posts):
        # Single pass over all posts of one user. This replaces summing the
        # per-post lists (which re-copies the growing list for every post) and
        # ``list(set(...))`` (whose order changes between interpreter runs).
        seen = set()
        distinct = []
        for tags in posts:
            for tag in tags:
                if tag not in seen:
                    seen.add(tag)
                    distinct.append(tag)
        return distinct

    per_user = df.groupby('owner_id').agg(
        hashtags=pd.NamedAgg(column='hashtags', aggfunc=_distinct_hashtags),
        post_count=pd.NamedAgg(column='shortcode', aggfunc='count'),
    )
    # mergesort is stable, so users with equal post counts are written in a
    # reproducible order.
    return per_user.sort_values(by=['post_count'], ascending=False, kind='mergesort')

In [3]:
def count_hashtag_frequency(df):
    """Count in how many rows each hashtag occurs, ignoring case.

    Every row of ``df`` (one user, when the frame comes from
    ``group_by_user``) contributes at most one vote per lower-cased hashtag,
    so spelling variants such as ``Munich`` / ``munich`` used by the same user
    are not counted twice.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``hashtags`` column holding an iterable of strings per row.

    Returns
    -------
    list of (str, int)
        ``(hashtag, count)`` pairs sorted by count in descending order;
        hashtags with equal counts keep the order of first appearance.
    """
    # Function-scope import keeps this cell runnable on its own. Counter offers
    # the same ``most_common()`` that the previous FreqDist-based version used.
    from collections import Counter

    votes = Counter()
    for tags in df['hashtags']:
        counted = set()  # lower-cased hashtags already counted for this row
        for tag in tags:
            key = tag.lower()
            if key not in counted:
                counted.add(key)
                votes[key] += 1
    return votes.most_common()

## Count hashtags in every district
- Group hashtags by user so that every user has exactly one "vote" per hashtag
- Count the absolute and the relative amount of every hashtag (relative to the most frequent hashtag of the district)
- Drop hashtags that are mentioned by fewer than `min_count` users

In [4]:
# Named `quarters` rather than `dict` so the builtin is not shadowed for the
# rest of the kernel session.
quarters = QUARTERS_DICT()
for quarter in quarters:
    print('>> now processing', quarter)

    # One row per user, so that every user votes once per hashtag.
    users = group_by_user(quarters[quarter])
    write_df_to_csv(users, quarter, HASHTAGS_PER_USER_DIR)

    hashtag_counts = pd.DataFrame(count_hashtag_frequency(users), columns=['hashtag', 'count'])
    # Keep only hashtags that reach the minimum count (original row labels are preserved).
    hashtag_counts = hashtag_counts[hashtag_counts['count'] >= min_count]

    # count_hashtag_frequency returns the pairs sorted by count (descending),
    # so the first remaining row is the district's most frequent hashtag.
    # Guard the lookup: a small district may have no hashtag above min_count,
    # in which case an empty table is written instead of raising IndexError.
    # NOTE(review): the city-wide cell normalises by the number of users
    # instead of by the top count — confirm the two are meant to differ.
    top_count = hashtag_counts['count'].iloc[0] if len(hashtag_counts) else np.nan
    hashtag_counts = hashtag_counts.assign(relative_amount=hashtag_counts['count'] / top_count)
    write_df_to_csv(hashtag_counts, quarter, HASHTAG_FREQUENCY_DIR)

>> now loading: allach
>> now loading: altperlach
>> now loading: altstadt
>> now loading: amhart
>> now loading: arabellapark
>> now loading: au
>> now loading: aubing
>> now loading: bergamlaim
>> now loading: bogenhausen
>> now loading: borstei
>> now loading: daglfing
>> now loading: denning
>> now loading: fasanerie
>> now loading: fasangarten
>> now loading: feldmoching
>> now loading: forstenried
>> now loading: freiham
>> now loading: freimann
>> now loading: fröttmaning
>> now loading: fürstenried
>> now loading: giesing
>> now loading: hadern
>> now loading: haidhausen
>> now loading: harlaching
>> now loading: harras
>> now loading: harthof
>> now loading: hasenbergl
>> now loading: hellabrunn
>> now loading: herzogpark
>> now loading: isarvorstadt
>> now loading: johanneskirchen
>> now loading: kieferngarten
>> now loading: kirchtrudering
>> now loading: laim
>> now loading: lehel
>> now loading: lerchenau
>> now loading: lochhausen
>> now loading: ludwigsvorstadt
>> now lo

## Count hashtags in the whole city
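- Same as above, but for the posts downloaded from all districts combined (duplicate posts are dropped, and the relative amount is the count divided by the number of users)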

In [6]:
# Named `quarters` rather than `dict` so the builtin is not shadowed for the
# rest of the kernel session.
quarters = QUARTERS_DICT()

# Collect the per-district frames and concatenate once: DataFrame.append in a
# loop copies the accumulated frame on every iteration and no longer exists in
# pandas >= 2.0.
frames = []
for quarter in quarters:
    frames.append(quarters[quarter])
    print('>> now processing', quarter)

# The same post can be downloaded for several districts; keep its first occurrence.
all_posts = pd.concat(frames).drop_duplicates(subset='post_url', keep="first")
users = group_by_user(all_posts)
user_count = len(users)

hashtag_counts = pd.DataFrame(count_hashtag_frequency(users), columns=['hashtag', 'count'])
# Keep only hashtags that reach the minimum count.
hashtag_counts = hashtag_counts[hashtag_counts['count'] >= min_count]
# Relative amount = share of all users that mention the hashtag.
hashtag_counts = hashtag_counts.assign(relative_amount=hashtag_counts['count'] / user_count)

write_df_to_csv(hashtag_counts, 'count_all_quarters', HASHTAG_FREQUENCY_DIR_ALL)

>> now loading: allach
>> now loading: altperlach
>> now loading: altstadt
>> now loading: amhart
>> now loading: arabellapark
>> now loading: au
>> now loading: aubing


KeyboardInterrupt: 

## Calculate uniqueness and rank of hashtags for every district
- Uniqueness: ratio between the relative amount of a hashtag in a district and its relative amount in the whole city
- Rank: amount * uniqueness^2, because ranking by uniqueness alone favours obscure hashtags

In [9]:
def get_uniqueness(dfs, df_all, out_dir):
    """Rank the hashtags of every district by how characteristic they are.

    For each district table the city-wide table is joined on ``hashtag`` and
    two columns are added:

    - ``uniqueness`` = ``relative_amount_quarter / relative_amount_city``
    - ``rank`` = ``count_quarter * uniqueness * uniqueness``

    The result is sorted by ``rank`` (descending) and written with
    ``write_df_to_csv(result, quarter, out_dir)``. Hashtags missing from
    ``df_all`` get NaN for both new columns.

    Parameters
    ----------
    dfs : mapping of str -> pd.DataFrame
        District name -> frequency table with the columns ``hashtag``,
        ``count`` and ``relative_amount``.
    df_all : pd.DataFrame
        City-wide frequency table with the same columns.
    out_dir
        Target directory, passed through to ``write_df_to_csv``.

    Returns
    -------
    None
    """
    # Loop-invariant: index the city table by hashtag once, not per district.
    city = df_all.set_index('hashtag')

    for file_no, quarter in enumerate(dfs, start=1):
        print ('file no:', file_no)
        print ('>> now processing:', quarter)

        # 'hashtag' stays a regular column on the district side; join(on=...)
        # matches it against the index of the city table. The earlier
        # ``df.set_index('hashtag')`` call discarded its result and had no
        # effect, so it has been dropped. Columns present in both tables
        # receive the _quarter / _city suffixes.
        ranked = dfs[quarter].join(city, on='hashtag', lsuffix='_quarter', rsuffix='_city')
        ranked['uniqueness'] = ranked['relative_amount_quarter'] / ranked['relative_amount_city']
        # Squaring uniqueness while weighting by the absolute count keeps rare,
        # obscure hashtags from dominating the ranking.
        ranked['rank'] = ranked['count_quarter'] * ranked['uniqueness'] * ranked['uniqueness']
        ranked = ranked.sort_values(by=['rank'], ascending=False)
        write_df_to_csv(ranked, quarter, out_dir)
    return

In [10]:
# Read back the frequency tables that the cells above wrote to disk:
# the per-district tables from HASHTAG_FREQUENCY_DIR and the city-wide table
# 'count_all_quarters' from HASHTAG_FREQUENCY_DIR_ALL.
# NOTE(review): presumably load_dataframes returns a mapping district -> DataFrame
# (get_uniqueness iterates it and indexes it by key) — confirm in utils/utils.py.
dfs = load_dataframes(HASHTAG_FREQUENCY_DIR)
df_all = load_dataframe(HASHTAG_FREQUENCY_DIR_ALL, 'count_all_quarters')

# Compute uniqueness and rank per district and write the results to HASHTAG_TOP_DIR.
get_uniqueness(dfs, df_all, HASHTAG_TOP_DIR)

>> now loading: allach
>> now loading: altperlach
>> now loading: altstadt
>> now loading: amhart
>> now loading: arabellapark
>> now loading: au
>> now loading: aubing
>> now loading: bergamlaim
>> now loading: bogenhausen
>> now loading: borstei
>> now loading: daglfing
>> now loading: denning
>> now loading: fasanerie
>> now loading: fasangarten
>> now loading: feldmoching
>> now loading: forstenried
>> now loading: freiham
>> now loading: freimann
>> now loading: fröttmaning
>> now loading: fürstenried
>> now loading: giesing
>> now loading: hadern
>> now loading: haidhausen
>> now loading: harlaching
>> now loading: harras
>> now loading: harthof
>> now loading: hasenbergl
>> now loading: hellabrunn
>> now loading: herzogpark
>> now loading: isarvorstadt
>> now loading: johanneskirchen
>> now loading: kieferngarten
>> now loading: kirchtrudering
>> now loading: laim
>> now loading: lehel
>> now loading: lerchenau
>> now loading: lochhausen
>> now loading: ludwigsvorstadt
>> now lo