# Notebook to engineer Style features

In [None]:
import pandas as pd
import csv
import seaborn as sns
import matplotlib.pyplot as plt
import nltk

from scripts import get_basic_metrics, get_all_readability_metrics, get_pos_metrics, get_wl_metrics, get_spelling_mistakes_metrics

import sys
sys.path.insert(0, '../')
from utils import get_corr_pval

### Importing descriptions and IMDs

In [None]:
# Keep only the listing text and its ward, then collapse the listings into one
# row per ward whose 'full_description' cell is the list of that ward's texts.
descriptions = pd.read_csv(
    "../data/airbnb_listings_description/london_listings_description_ward.csv"
)[['full_description', 'ward']]
descriptions_per_ward = descriptions.groupby('ward', as_index=False).agg(
    lambda ward_texts: list(ward_texts)
)

In [None]:
# Discard wards that have fewer than MIN_LISTINGS_PER_WARD descriptions and
# renumber the remaining rows from 0.
# A boolean mask is used instead of a positional loop so the filter does not
# rely on the frame carrying a 0..n-1 index.
MIN_LISTINGS_PER_WARD = 5
has_enough_listings = (
    descriptions_per_ward['full_description'].map(len) >= MIN_LISTINGS_PER_WARD
)
descriptions_per_ward = descriptions_per_ward[has_enough_listings].reset_index(drop=True)

In [None]:
# Load the ward code plus four deprivation scores and give the score columns
# short names that the correlation cells refer to.
imd_per_ward = pd.read_csv("../data/imd_per_ward.csv")[
    [
        'WD17CD',
        'Index of Multiple Deprivation (IMD) Score',
        'Education, Skills and Training Score',
        'Employment Score (rate)',
        'Income Score (rate)',
    ]
].rename(
    columns={
        "Index of Multiple Deprivation (IMD) Score": "IMD",
        "Education, Skills and Training Score": "IMD_Edu",
        'Employment Score (rate)': 'IMD_Emp',
        'Income Score (rate)': 'IMD_Inc',
    }
)

### Get basic metrics for each ward

In [None]:
# Work on a copy so descriptions_per_ward stays available, unchanged, as the
# starting point for the other metric sections.
basic_metrics_per_ward = descriptions_per_ward.copy()

In [None]:
# Compute the basic metrics with the project helper from `scripts`.
# NOTE(review): presumably this adds the "char_len", "sent_count" and
# "word_count" columns read by the correlation cell — confirm in scripts.
basic_metrics_per_ward = get_basic_metrics(basic_metrics_per_ward)

In [None]:
# Join the deprivation scores onto the basic metrics by ward code, then remove
# the IMD key column: after the join it repeats the value already in "ward".
basic_metrics_imds_per_ward = (
    basic_metrics_per_ward
    .merge(imd_per_ward, left_on="ward", right_on="WD17CD")
    .drop(columns=['WD17CD'])
)

In [None]:
# Relate the basic metrics to the four deprivation scores.
# NOTE(review): get_corr_pval is a project helper; it presumably returns a
# correlation table and a matching p-value table — confirm in utils.
df_corr, df_pval = get_corr_pval(
    basic_metrics_imds_per_ward,
    ["char_len", "sent_count", "word_count"],
    ["IMD", "IMD_Edu", "IMD_Emp", "IMD_Inc"],
)

In [None]:
# Draw both tables on the same axes. The p-value layer is drawn first without
# a colorbar; its annotations use va='top' and the correlation annotations use
# va='bottom', so the two numbers in a cell do not sit on top of each other.
fig, ax = plt.subplots(figsize=(10, 3))
sns.heatmap(
    df_pval,
    annot=True,
    annot_kws={'va': 'top', 'fontsize': 'small', 'c': 'white'},
    cbar=False,
    ax=ax,
)
sns.heatmap(df_corr, annot=True, annot_kws={'va': 'bottom'}, ax=ax)

### Get readability metrics for each ward

In [None]:
# Start the readability section from a fresh copy of the per-ward descriptions
# so descriptions_per_ward itself is left as it was.
read_metrics_per_ward = descriptions_per_ward.copy()

In [None]:
# Compute the readability metrics with the project helper from `scripts`.
# NOTE(review): presumably this adds the "CLI", "ARI", "GFI", "SMOG", "DCRI"
# and "FKRI" columns read by the correlation cell — confirm in scripts.
read_metrics_per_ward = get_all_readability_metrics(read_metrics_per_ward)

In [None]:
# Join the deprivation scores onto the readability metrics by ward code, then
# remove the IMD key column: after the join it repeats the value in "ward".
read_metrics_imds_per_ward = (
    read_metrics_per_ward
    .merge(imd_per_ward, left_on="ward", right_on="WD17CD")
    .drop(columns=['WD17CD'])
)

In [None]:
# Relate the readability metrics to the four deprivation scores.
# NOTE(review): get_corr_pval is a project helper; it presumably returns a
# correlation table and a matching p-value table — confirm in utils.
df_corr, df_pval = get_corr_pval(
    read_metrics_imds_per_ward,
    ["CLI", "ARI", "GFI", "SMOG", "DCRI", "FKRI"],
    ["IMD", "IMD_Edu", "IMD_Emp", "IMD_Inc"],
)

In [None]:
# Draw both tables on the same axes. The p-value layer is drawn first without
# a colorbar; its annotations use va='top' and the correlation annotations use
# va='bottom', so the two numbers in a cell do not sit on top of each other.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    df_pval,
    annot=True,
    annot_kws={'va': 'top', 'fontsize': 'small', 'c': 'white'},
    cbar=False,
    ax=ax,
)
sns.heatmap(df_corr, annot=True, annot_kws={'va': 'bottom'}, ax=ax)

### Get POS metrics for each ward

In [None]:
# Start the POS section from a fresh copy of the per-ward descriptions so
# descriptions_per_ward itself is left as it was.
pos_metrics_per_ward = descriptions_per_ward.copy()

In [None]:
# Compute the part-of-speech metrics with the project helper from `scripts`.
# NOTE(review): presumably this adds the "*_freq" tag columns (CC, DT, IN, JJ,
# VB, NN, RB, EX, PO, CD) read by the correlation cell — confirm in scripts.
pos_metrics_per_ward = get_pos_metrics(pos_metrics_per_ward)

In [None]:
# Join the deprivation scores onto the POS metrics by ward code, then remove
# the IMD key column: after the join it repeats the value already in "ward".
pos_metrics_imds_per_ward = (
    pos_metrics_per_ward
    .merge(imd_per_ward, left_on="ward", right_on="WD17CD")
    .drop(columns=['WD17CD'])
)

In [None]:
# Relate the POS tag frequencies to the four deprivation scores.
# NOTE(review): get_corr_pval is a project helper; it presumably returns a
# correlation table and a matching p-value table — confirm in utils.
df_corr, df_pval = get_corr_pval(
    pos_metrics_imds_per_ward,
    [
        "CC_freq",
        "DT_freq",
        "IN_freq",
        "JJ_freq",
        "VB_freq",
        "NN_freq",
        "RB_freq",
        "EX_freq",
        "PO_freq",
        "CD_freq",
    ],
    ["IMD", "IMD_Edu", "IMD_Emp", "IMD_Inc"],
)

In [None]:
# Draw both tables on the same axes. The p-value layer is drawn first without
# a colorbar; its annotations use va='top' and the correlation annotations use
# va='bottom', so the two numbers in a cell do not sit on top of each other.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    df_pval,
    annot=True,
    annot_kws={'va': 'top', 'fontsize': 'small', 'c': 'white'},
    cbar=False,
    ax=ax,
)
sns.heatmap(df_corr, annot=True, annot_kws={'va': 'bottom'}, ax=ax)

### Get word-level metrics for each ward

In [None]:
# Start the word-level section from a fresh copy of the per-ward descriptions
# so descriptions_per_ward itself is left as it was.
wl_metrics_per_ward = descriptions_per_ward.copy()

In [None]:
# Compute the word-level metrics with the project helper from `scripts`.
# NOTE(review): presumably this adds "hapax_freq", "hapax_dis_freq", "yules_k",
# "brunet_w", "honore_r" and "simpson", which the correlation cell reads —
# confirm in scripts.
wl_metrics_per_ward = get_wl_metrics(wl_metrics_per_ward)

In [None]:
# Join the deprivation scores onto the word-level metrics by ward code, then
# remove the IMD key column: after the join it repeats the value in "ward".
wl_metrics_imds_per_ward = (
    wl_metrics_per_ward
    .merge(imd_per_ward, left_on="ward", right_on="WD17CD")
    .drop(columns=['WD17CD'])
)

In [None]:
# Relate the word-level metrics to the four deprivation scores.
# NOTE(review): get_corr_pval is a project helper; it presumably returns a
# correlation table and a matching p-value table — confirm in utils.
df_corr, df_pval = get_corr_pval(
    wl_metrics_imds_per_ward,
    ["hapax_freq", "hapax_dis_freq", "yules_k", "brunet_w", "honore_r", "simpson"],
    ["IMD", "IMD_Edu", "IMD_Emp", "IMD_Inc"],
)

In [None]:
# Draw both tables on the same axes. The p-value layer is drawn first without
# a colorbar; its annotations use va='top' and the correlation annotations use
# va='bottom', so the two numbers in a cell do not sit on top of each other.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    df_pval,
    annot=True,
    annot_kws={'va': 'top', 'fontsize': 'small', 'c': 'white'},
    cbar=False,
    ax=ax,
)
sns.heatmap(df_corr, annot=True, annot_kws={'va': 'bottom'}, ax=ax)

### Get spelling mistakes metrics for each ward

In [None]:
# Start the spelling section from a fresh copy of the per-ward descriptions so
# descriptions_per_ward itself is left as it was.
spe_metrics_per_ward = descriptions_per_ward.copy()

In [None]:
# Compute the spelling-mistake metrics with the project helper from `scripts`.
# NOTE(review): the second argument "../" is presumably a base path the helper
# uses to locate its resources, and the call presumably adds the
# "spelling_mistakes_freq" column read below — confirm both in scripts.
spe_metrics_per_ward = get_spelling_mistakes_metrics(spe_metrics_per_ward, "../")

In [None]:
# Join the deprivation scores onto the spelling metrics by ward code, then
# remove the IMD key column: after the join it repeats the value in "ward".
spe_metrics_imds_per_ward = (
    spe_metrics_per_ward
    .merge(imd_per_ward, left_on="ward", right_on="WD17CD")
    .drop(columns=['WD17CD'])
)

In [None]:
# Relate the spelling-mistake frequency to the four deprivation scores.
# NOTE(review): get_corr_pval is a project helper; it presumably returns a
# correlation table and a matching p-value table — confirm in utils.
df_corr, df_pval = get_corr_pval(
    spe_metrics_imds_per_ward,
    ["spelling_mistakes_freq"],
    ["IMD", "IMD_Edu", "IMD_Emp", "IMD_Inc"],
)

In [None]:
# Draw both tables on the same axes. The p-value layer is drawn first without
# a colorbar; its annotations use va='top' and the correlation annotations use
# va='bottom', so the two numbers in a cell do not sit on top of each other.
fig, ax = plt.subplots(figsize=(10, 1))
sns.heatmap(
    df_pval,
    annot=True,
    annot_kws={'va': 'top', 'fontsize': 'small', 'c': 'white'},
    cbar=False,
    ax=ax,
)
sns.heatmap(df_corr, annot=True, annot_kws={'va': 'bottom'}, ax=ax)

### Output a selection of the metrics to .csv

In [None]:
# Combine all five metric tables into one row per ward. Each table carries its
# own 'full_description' column, so it is removed from the right-hand tables
# before each merge (avoiding suffixed duplicates) and from the final result.
total_metrics = (
    basic_metrics_per_ward
    .merge(read_metrics_per_ward.drop('full_description', axis=1), on="ward")
    .merge(pos_metrics_per_ward.drop('full_description', axis=1), on="ward")
    .merge(wl_metrics_per_ward.drop('full_description', axis=1), on="ward")
    .merge(spe_metrics_per_ward.drop('full_description', axis=1), on="ward")
    .drop('full_description', axis=1)
)

In [None]:
# Keep the ward code plus the subset of features to export; edit this list to
# change which metrics are written out.
final_metrics = total_metrics[
    [
        "ward",
        "sent_count",
        "word_count",
        "CLI",
        "ARI",
        "GFI",
        "SMOG",
        "DCRI",
        "FKRI",
        "CC_freq",
        "DT_freq",
        "IN_freq",
        "JJ_freq",
        "VB_freq",
        "NN_freq",
        "RB_freq",
        "EX_freq",
        "PO_freq",
        "CD_freq",
        "hapax_freq",
        "hapax_dis_freq",
        "yules_k",
        "brunet_w",
        "honore_r",
        "spelling_mistakes_freq",
    ]
]

In [None]:
# Write the selected per-ward features to disk; index=False keeps the row
# index out of the CSV.
final_metrics.to_csv("../data/london_metrics.csv", index=False)