# Notebook to engineer Dictionary features

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import liwc

from scripts import init_dict_dataframe, get_dict

import sys
sys.path.insert(0, '../../')
from utils import get_corr_pval

### Importing IMDs and dictionary words

In [None]:
# Dictionary vocabulary: the 'tokens' column of the London dictionary CSV,
# materialised as a plain Python list.
tokens = (
    pd.read_csv("../../data/dictionary/london.csv")
    .loc[:, 'tokens']
    .tolist()
)

In [None]:
# Ward-level deprivation scores. Only the WD17CD key column (matched against
# `ward` when merging) and four score columns are kept, and the scores are
# renamed to the short names IMD, IMD_Edu, IMD_Emp and IMD_Inc.
imd_per_ward = (
    pd.read_csv("../../data/imd_per_ward.csv")
    .loc[:, [
        'WD17CD',
        'Index of Multiple Deprivation (IMD) Score',
        'Education, Skills and Training Score',
        'Employment Score (rate)',
        'Income Score (rate)',
    ]]
    .rename(columns={
        "Index of Multiple Deprivation (IMD) Score": "IMD",
        "Education, Skills and Training Score": "IMD_Edu",
        'Employment Score (rate)': 'IMD_Emp',
        'Income Score (rate)': 'IMD_Inc',
    })
)

### London

In [None]:
# Listing descriptions with the ward each listing was assigned to.
london_descriptions = (
    pd.read_csv("../../data/airbnb_listings_description/london_listings_description_ward.csv")
    .loc[:, ['full_description', 'ward']]
)

# One row per ward; 'full_description' becomes the list of all descriptions
# found for that ward.
london_descriptions_per_ward = (
    london_descriptions
    .groupby('ward', as_index=False)
    .aggregate(lambda descriptions: list(descriptions))
)

In [None]:
# Discard wards that have fewer than 5 listing descriptions.
# The per-ward list lengths are computed in one vectorised pass instead of a
# positional Python loop; `rows` still holds the index labels being dropped.
description_counts = london_descriptions_per_ward['full_description'].map(len)
rows = description_counts.index[description_counts < 5].tolist()

# reset_index(drop=True) renumbers the remaining rows from 0 without first
# materialising the old index as an 'index' column.
london_descriptions_per_ward = (
    london_descriptions_per_ward
    .drop(rows)
    .reset_index(drop=True)
)

In [None]:
# Work on an independent (deep) copy so the grouped-descriptions frame is not
# modified by the feature-building steps.
london_dict_per_ward = london_descriptions_per_ward.copy(deep=True)

In [None]:
# Build the dictionary features with the project helpers from `scripts`:
# init_dict_dataframe runs first and its result is handed to get_dict, both
# receiving the token list.
# NOTE(review): presumably this adds one column per token and then fills it
# from the ward's descriptions -- confirm against scripts.
london_dict_per_ward = get_dict(
    init_dict_dataframe(london_dict_per_ward, tokens),
    tokens,
)

### Merging the cities

In [None]:
# Stack the per-city frames row-wise, keeping their original index labels.
# Only the London frame is present in the list here.
dict_per_ward = pd.concat(
    objs=[london_dict_per_ward],
    axis=0,
    ignore_index=False,
)

In [None]:
# Attach the IMD scores to each ward. This is an inner join, so wards without
# a matching WD17CD row are dropped. The duplicate key column is then removed.
dict_imds_per_ward = pd.merge(
    dict_per_ward,
    imd_per_ward,
    how="inner",
    left_on="ward",
    right_on="WD17CD",
).drop(columns=['WD17CD'])

In [None]:
# Feature columns: every column of the merged frame except the ward id, the
# raw description text and the four IMD targets.
to_remove = ['ward', 'full_description', 'IMD', 'IMD_Edu', 'IMD_Emp', 'IMD_Inc']
cols = [*dict_imds_per_ward.columns]
for excluded in to_remove:
    # list.remove raises ValueError if an expected column is missing.
    cols.remove(excluded)

In [None]:
# Relate every feature column to the four IMD targets via the project helper.
# NOTE(review): presumably df_corr holds correlation coefficients and df_pval
# the matching p-values, both indexed by feature with one column per target --
# confirm against utils.get_corr_pval.
df_corr, df_pval = get_corr_pval(
    dict_imds_per_ward,
    cols,
    ["IMD", "IMD_Edu", "IMD_Emp", "IMD_Inc"],
)

In [None]:
# Draw both tables on the same tall axes so each cell carries two annotations:
# first df_pval (small white text, vertical alignment 'top', no colour bar),
# then df_corr on top of it (vertical alignment 'bottom', with colour bar).
fig, ax = plt.subplots(figsize=(15, 100))
sns.heatmap(
    df_pval,
    annot=True,
    annot_kws={'va': 'top', 'fontsize': 'small', 'c': 'white'},
    cbar=False,
    ax=ax,
)
sns.heatmap(
    df_corr,
    annot=True,
    annot_kws={'va': 'bottom'},
    ax=ax,
)

### Filtering the features (words)

In [None]:
# Tokens to discard: those whose df_pval entry is above 0.05 for every one of
# the four IMD targets. A token is kept as soon as one target is at or below
# 0.05. Comparisons with NaN are False, so a token with a NaN entry is also
# kept.
# Each token's row is selected once with a single .loc[row, columns] lookup
# instead of four chained lookups.
to_remove = [
    token
    for token in tokens
    if (df_pval.loc[token, ['IMD', 'IMD_Edu', 'IMD_Emp', 'IMD_Inc']] > 0.05).all()
]

### Output to .csv file

In [None]:
# Columns excluded from the export: the raw text, the four IMD targets and
# every token filtered out above. The 'ward' column is kept.
cols_out = ['full_description', 'IMD', 'IMD_Edu', 'IMD_Emp', 'IMD_Inc', *to_remove]

# drop() already returns a new frame, so the source frame is left untouched.
final_dict = (
    dict_imds_per_ward
    .drop(columns=cols_out)
    .reset_index(drop=True)
)

In [None]:
# Persist the filtered per-ward dictionary features, without the row index.
final_dict.to_csv(
    path_or_buf="../../data/london_dict.csv",
    index=False,
)