In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.inspection import permutation_importance
import shap

# Load the raw ESS7.csv file straight from GitHub (requires network access on every fresh run).
data_raw = pd.read_csv('https://raw.githubusercontent.com/Thomas-Richardson/Blog_post_data/main/ESS7.csv')

## Clean data

In [None]:
# Chuck out useless columns: identifier / region codes and the stray 'Unnamed: 63' column.
data = data_raw.drop(
    columns=[
        'ess7_id',
        'nuts1',
        'nuts2',
        'nuts3',
        'ess7_reg',
        'Unnamed: 63',
    ]
)

In [None]:
# NOTE(review): this bare expression is not the last statement of the cell, so Jupyter shows no output for it.
data.columns

# Replace the terse source column codes with descriptive snake_case names.
# Columns that are not keys of this mapping keep their original name.
data = data.rename(columns = {'cntry':'country', 'tvpol':'daily_tv_consumption','ppltrst':'people_trustworthy', 'pplfair':'people_fair','pplhlp':'people_helpful','lrscale':'political_orientation_lr',
    'stfeco':'satisfaction_economy','stfgov':'satisfaction_government','stfdem':'satisfaction_democracy_here','stfedu':'satisfaction_education_system','stfhlth':'satisfaction_healthcare_system',
    'sclmeet':'meet_friends&family_often','inprdsc':'people_to_confide_in','sclact':'social_life','crmvct':'burglary_assault_victim_5y','aesfdrk':'fear_area_after_dark','hlthhmp':'disability',
    'rlgblg':'religious','dscrgrp':'oppressed_group','ctzcntr':'citizen','blgetmg':'minority','etfruit':'eat_fruit','eatveg':'eat_veg','dosprt':'sport','cgtsmke':'smoker','alcfreq':'alcohol_often',
    'alcbnge':'binge_drinking','slprl':'sleep_restless_past_week','fltlnl':'lonely_past_week','cnfpplh':'family_conflict_childhood','fnsdfml':'childhood_financial_problems','gndr':'sex','agea':'age',
    'hincfel':'feeling_about_income','atncrse':'improved_knowledge','maritalb':'marital_status','dvrcdeva':'ever_divorced','chldhm':'kids_at_home','domicil':'area_type','eduyrs':'years_education',
    'wkhct':'hours_overtime_excl', 'wkhtot':'hours_overtime_incl','nacer2':'industry','uemp3m':'ever_unemployed','hinctnta':'income_decile','psppsgv':'have_say_in_politics',
    'psppipl':'have_influence_in_politics','cptppol':'confident_participate_politics', 'ptcpplt':'politicians_listen','trstprl':'trust_parliament','trstlgl':'trust_legal_system',
    'trstplc':'trust_police','trstplt':'trust_politicans','trstprt':'trust_political_parties'})

#data.political_orientation_lr.value_counts(normalize=True,dropna=False).round(2).to_frame().reset_index().sort_values('index') # % of a column that is taken up by each category

In [None]:
#plt.rcParams['figure.figsize'] = [30, 30] # set figure size parameters
#data.hist(grid=False, bins = 10)

In [None]:
#plt.rcParams['figure.figsize'] = [5, 5] # set figure size parameters
#data.daily_tv_consumption.hist(grid = False, bins = 10)

In [None]:
# Non-answers (refusal / don't know / no answer / not applicable) are stored as numeric sentinel
# codes above each item's valid range. Replace them with NaN so they are not treated as answers.
# The value ranges quoted below follow the ESS round 7 codebook -- re-check them there if a
# variable is added or moved between groups.

# Items whose valid answers are at most 5; their sentinels are single digits from 6 upwards.
# The cut-off is 6 rather than 5: `< 5` also erased the legitimate top category 5 of the
# five-level items in this list (e.g. 'social_life', 'area_type').
five_cols = ['social_life','citizen','binge_drinking','sleep_restless_past_week','lonely_past_week','family_conflict_childhood','childhood_financial_problems','sex','ever_divorced','kids_at_home',
    'area_type','ever_unemployed','feeling_about_income','improved_knowledge']
data[five_cols] = data[five_cols].where(data[five_cols] < 6, np.nan)

# Items whose valid answers are at most 6; 7 and above are sentinels.
# 'citizen' is no longer repeated here because it is already handled in five_cols.
seven_cols = ['marital_status','people_to_confide_in','burglary_assault_victim_5y','fear_area_after_dark','health','disability','religious',
    'oppressed_group','minority','smoker']
data[seven_cols] = data[seven_cols].where(data[seven_cols] < 7, np.nan)

# Items with valid answers up to 10 and two-digit sentinels (66/77/88/99).
# 'meet_friends&family_often' and 'daily_tv_consumption' belong here, not in seven_cols:
# 7 is a valid answer for both, so `< 7` silently discarded their top category.
ten_cols = ['happy','people_trustworthy','people_fair','people_helpful','political_orientation_lr','satisfaction_education_system','satisfaction_healthcare_system','satisfaction_economy','satisfaction_government',
    'satisfaction_democracy_here','eat_fruit','eat_veg','sport','alcohol_often','income_decile','have_say_in_politics','have_influence_in_politics','confident_participate_politics','politicians_listen',
    'trust_parliament','trust_legal_system','trust_police','trust_politicans','trust_political_parties',
    'meet_friends&family_often','daily_tv_consumption']
data[ten_cols] = data[ten_cols].where(data[ten_cols] < 11, np.nan)

# Continuous / wide-range items: anything above the stated bound is a sentinel (777, 999, 77, 666, ...).
data.loc[data.height > 776,'height'] = np.nan
data.loc[data.weight > 776,'weight'] = np.nan
data.loc[data.age > 998,'age'] = np.nan
data.loc[data.years_education > 76,'years_education'] = np.nan
data.loc[data.hours_overtime_incl > 665,'hours_overtime_incl'] = np.nan
data.loc[data.hours_overtime_excl > 665,'hours_overtime_excl'] = np.nan
data.loc[data.industry > 665,'industry'] = np.nan

In [None]:
#data.isna().mean().round(3).sort_values(ascending = False)*100 # calculate % of data missing for each column

In [None]:
# Remove items that are not carried forward as features.
# NOTE(review): presumably chosen because of high missingness (see the commented-out
# missing-% check in the cell above) -- confirm.
data = data.drop(
    columns=[
        'binge_drinking',
        'childhood_financial_problems',
        'family_conflict_childhood',
        'hours_overtime_incl',
        'hours_overtime_excl',
    ]
)

In [None]:
# Fill gaps in these two items with the constant 5 instead of losing the rows to a later dropna().
data = data.fillna(
    {
        'income_decile': 5,
        'political_orientation_lr': 5,
    }
)

#data.loc[data.income_decile.isna(),'income_decile'] = 5
#data.loc[data.political_orientation_lr.isna(),'political_orientation_lr'] = 5

In [None]:
def invert_binary_variable(col_name):
    """Recode columns of the global ``data`` frame in place from 1/2 to 1/0.

    Each value ``v`` becomes ``(3 - v) - 1``, so 1 stays 1 and 2 becomes 0;
    NaN stays NaN.

    Parameters
    ----------
    col_name : label or list of labels
        Column(s) of ``data`` to recode.

    Notes
    -----
    Mutates the module-level ``data`` and returns nothing. Calling it a second
    time on the same column maps 0 back to 2, so re-running the cell undoes
    the recoding.
    """
    current = data.loc[:, col_name]
    flipped = 3 - current
    data.loc[:, col_name] = flipped - 1

def reverse_variable(col_name):
    """Flip the direction of ordinal columns of the global ``data`` frame in place.

    Every value ``v`` is replaced by ``min + max - v``, where ``min`` and
    ``max`` are the smallest and largest observed (non-NaN) values of that
    column. The lowest level becomes the highest and vice versa; NaN stays NaN.

    The earlier implementation mirrored around ``nunique() + 1``. That is only
    correct when the scale starts at 1 and every level occurs in the data. A
    scale starting at 0, or one with an unobserved middle level, came out
    shifted (possibly outside the original range). ``min + max`` gives the
    same result in the complete 1..k case and the right one otherwise.

    Parameters
    ----------
    col_name : label or list of labels
        Column(s) of ``data`` to reverse. With a list, each column is mirrored
        around its own min/max.

    Notes
    -----
    Mutates the module-level ``data`` and returns nothing. Applying it twice
    restores the original values, so re-running the cell undoes the reversal.
    """
    values = data.loc[:, col_name]
    # For a list of columns this is a Series indexed by column name; the subtraction
    # below aligns it with the matching columns of `values`.
    pivot = values.min() + values.max()
    data.loc[:, col_name] = pivot - values

# 1/2-coded items -> 1/0.
invert_binary_variable(
    [
        'burglary_assault_victim_5y',
        'religious',
        'oppressed_group',
        'citizen',
        'minority',
        'ever_divorced',
        'improved_knowledge',
        'kids_at_home',
        'sex',
    ]
)

# Ordinal items whose direction is flipped.
reverse_variable(
    [
        'fear_area_after_dark',
        'eat_fruit',
        'eat_veg',
        'feeling_about_income',
        'health',
        'disability',
        'alcohol_often',
    ]
)

In [None]:
#plt.rcParams['figure.figsize'] = [30, 30] # set figure size parameters
#data.hist(grid=False, bins = 10)

In [None]:
# One-hot encode the nominal columns; dummy_na=False means missing values get no indicator column.
data2 = pd.get_dummies(
    data,
    columns=['marital_status', 'country', 'industry', 'area_type'],
    dummy_na=False,
)

In [None]:
# (rows, columns) of the frame produced by pd.get_dummies.
data2.shape

In [None]:
# Keep only rows with no missing values. The result must be bound to `data3`, the name that the
# next line and the later cells (CSV export, correlation) read. Binding it to `data` left
# `data3` undefined and raised a NameError on the shape check.
data3 = data2.dropna()
data3.shape

In [None]:
# Write data3 to the current working directory. index=False is not passed, so the row index
# is saved as an extra, unnamed first column.
data3.to_csv('ESS7_cleaned.csv')

Looking at the individual variables, loneliness shows a clear separation: for respondents with low loneliness it consistently added about 0.25 points of happiness, whereas for those who scored high on loneliness the size of the effect varied. Some lost 0.1 happiness points, others as much as 1.5!

## Misc

In [None]:
# Pairwise correlation (pandas default method) between happiness and satisfaction with healthcare.
data3[['happy', 'satisfaction_healthcare_system']].corr()

In [None]:
#data.loc[:'overtime'] = data.loc[:'hours_overtime_incl'] - data.loc[:'hours_overtime_excl']

#data.happy.value_counts(normalize=True,dropna=False).round(2).to_frame().reset_index().sort_values('index')

#data['political_orientation_binned']  = pd.cut(x = data['political_orientation_lr'], bins=[0,4,6, 10]) # 11% of people did give political orientation, perhaps bin it?

#plt.rcParams['figure.figsize'] = [10, 5] # set figure size parameters
#sns.countplot(y = 'political_orientation_binned', data = data)
#plt.show()

#data.isna().mean().round(3).sort_values(ascending = False)*100 # calculate % of data missing for each column
#data2 = data.drop(columns = ['political_orientation_lr']).dropna()

In [None]:
import lime
from lime import lime_tabular

# NOTE(review): X_train, X_test and model are not defined anywhere in the visible part of this
# notebook, so this cell fails on a fresh kernel unless they are created elsewhere.
# NOTE(review): the explainer is set up for classification (class_names, mode, predict_proba),
# while the only estimator imported at the top of the notebook is RandomForestRegressor --
# confirm which kind of model this cell is meant for.
explainer = lime_tabular.LimeTabularExplainer(
    training_data=np.array(X_train),
    feature_names=X_train.columns,
    class_names=['bad', 'good'],
    mode='classification'
)

# Explain the prediction for the second row of the test set (positional index 1).
exp = explainer.explain_instance(
    data_row=X_test.iloc[1], 
    predict_fn=model.predict_proba
)

exp.show_in_notebook(show_table=True)

In [None]:
# NOTE(review): `variable_to_correlate` has no live definition in the visible notebook (the only
# one is commented out in the cell below), so this cell needs it defined first.
dfCorr = data[variable_to_correlate].corr(method='spearman')

# Keep only strong correlations (|rho| >= .6); all other cells become NaN.
filteredDf = dfCorr.where(dfCorr.abs() >= .6)

# Mask the upper triangle, diagonal included, so each pair is drawn once.
mask = np.zeros_like(dfCorr)
upper_rows, upper_cols = np.triu_indices_from(mask)
mask[upper_rows, upper_cols] = True

fig, ax = plt.subplots(figsize=(30, 10))
sns.heatmap(filteredDf, annot=True, mask=mask, cmap="BuPu", ax=ax)
plt.show()

In [None]:
#variable_to_correlate = data.columns[data.nunique()>3]
#variable_to_correlate= variable_to_correlate.drop(['country','marital_status','industry'])
#
#plt.rcParams['figure.figsize'] = [30, 30] # set figure size parameters
#
#correlation_matrix = data.loc[:,variable_to_correlate].corr(method = 'spearman').round(1)
#mask = np.zeros_like(correlation_matrix)
#mask[np.triu_indices_from(mask)] = True
#sns.heatmap(correlation_matrix, annot = True,vmin = -1, vmax=1,mask=mask, square=True, cmap="BuPu")
#plt.show()
#variable_to_correlatedata.loc[:,variable_to_correlate].corr(method = 'spearman').abs().unstack().sort_values(ascending=False).drop_duplicates()