
# Dental Health Prediction Data Wrangling


In [3]:
import numpy as np 
import pandas as pd 
import seaborn as sns
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from scipy import stats


### Load and clean Literacy data

In [4]:
# Read the adult-literacy table. Its first column arrives under a long
# indicator-title header; relabel it 'Country'. Index labels are cast to str.
literacy = pd.read_csv('adultliteracy.csv').rename(
    index=str,
    columns={'Adult..15...literacy.rate......Total': 'Country'},
)

# Discard rows that have no value in any column after 'Country'.
literacy_value_cols = literacy.columns[1:].tolist()
literacy.dropna(subset=literacy_value_cols, how='all', inplace=True)

# Row-wise mean over column positions 1..29, ignoring missing entries.
literacy['literacy_avg'] = literacy.iloc[:, 1:30].mean(axis=1, skipna=True)

# Keep only the first column ('Country') and the freshly added average.
literacy.drop(columns=literacy.columns[1:-1].tolist(), inplace=True)

for literacy_view in (literacy.head(), literacy.describe()):
    print(literacy_view)

       Country  literacy_avg
0  Afghanistan     18.157681
1      Albania     98.712978
2      Algeria     59.752193
4       Angola     67.405416
5     Anguilla     95.407098
       literacy_avg
count    147.000000
mean      73.945211
std       23.471539
min        9.391331
25%       60.319077
50%       80.105363
75%       94.873104
max       99.746818


### Load and clean dental records

In [5]:
# Load the dental records, discard the three unused 'NA..N' columns, remove
# every row containing a missing value, and give the two remaining columns
# readable names ('Country' and 'badteeth').
badteeth = (
    pd.read_csv('badteeth.csv')
    .drop(columns=['NA..1', 'NA..2', 'NA..3'])
    .dropna()
    .rename(columns={'NA.': 'Country', 'X2004': 'badteeth'})
)

for badteeth_view in (badteeth.head(), badteeth.describe()):
    print(badteeth_view)

       Country  badteeth
0  Afghanistan      2.90
1      Albania      3.02
2      Algeria      2.30
3       Angola      1.70
4     Anguilla      2.50
         badteeth
count  190.000000
mean     2.132947
std      1.292823
min      0.200000
25%      1.142500
50%      1.800000
75%      2.975000
max      6.300000


### Load and clean GDP info

In [6]:
# Read the income-per-person table. Its first column arrives under the
# indicator-title header; relabel it 'Country'. Index labels are cast to str.
GDP = pd.read_csv('gdp.csv').rename(
    index=str,
    columns={'Income.per.person..fixed.2000.US..': 'Country'},
)

# Discard rows that have no value in any column after 'Country'.
gdp_value_cols = GDP.columns[1:].tolist()
GDP.dropna(subset=gdp_value_cols, how='all', inplace=True)

# Row-wise mean over column positions 1..43, ignoring missing entries.
GDP['GDP_avg'] = GDP.iloc[:, 1:44].mean(axis=1, skipna=True)

# Keep only the first column ('Country') and the freshly added average.
GDP.drop(columns=GDP.columns[1:-1].tolist(), inplace=True)

for gdp_view in (GDP.head(), GDP.describe()):
    print(gdp_view)

               Country       GDP_avg
3              Albania   1011.752695
4              Algeria   1625.140989
6              Andorra  16623.189750
7               Angola    318.739949
9  Antigua and Barbuda   7874.822798
            GDP_avg
count    200.000000
mean    6051.864501
std     9577.248902
min      124.949106
25%      565.390304
50%     1719.609992
75%     6775.361786
max    65922.599960


### Load and clean Health Expenditure data

In [7]:
# Read the government health-expenditure table and relabel its first column
# (which arrives under the indicator-title header) as 'Country'.
healthexp = pd.read_csv('healthexpend.csv').rename(
    columns={
        'Per.capita.government.expenditure.on.health.at.average.exchange.rate..US..': 'Country',
    },
)

# Discard rows that have no value in any column after 'Country'.
healthexp_value_cols = healthexp.columns[1:].tolist()
healthexp.dropna(subset=healthexp_value_cols, how='all', inplace=True)

# Row-wise mean over column positions 1..9, ignoring missing entries.
healthexp['health_avg'] = healthexp.iloc[:, 1:10].mean(axis=1, skipna=True)

# Keep only the first column ('Country') and the freshly added average.
healthexp.drop(columns=healthexp.columns[1:-1].tolist(), inplace=True)

# Note: this cell shows the *tail* of the frame rather than the head.
for healthexp_view in (healthexp.tail(), healthexp.describe()):
    print(healthexp_view)

       Country  health_avg
246  Venezuela   76.519263
250    Vietnam    6.336942
254      Yemen   12.509311
256     Zambia   13.064692
257   Zimbabwe   27.343722
        health_avg
count   192.000000
mean    327.700596
std     620.587169
min       0.372222
25%      12.450760
50%      65.954780
75%     267.774253
max    2890.305843


### Load and clean Sugar Consumption info

In [8]:
# Read the sugar-consumption table and relabel its first column ('NA.') as 'Country'.
sugar = pd.read_csv('sugar_consumption.csv').rename(columns={'NA.': 'Country'})

# Discard rows that have no value in any column after 'Country'.
sugar_value_cols = sugar.columns[1:].tolist()
sugar.dropna(subset=sugar_value_cols, how='all', inplace=True)

# Row-wise mean over every column after 'Country', ignoring missing entries.
sugar['sugar_avg'] = sugar.iloc[:, 1:].mean(axis=1, skipna=True)

# Keep only the first column ('Country') and the freshly added average.
sugar.drop(columns=sugar.columns[1:-1].tolist(), inplace=True)

for sugar_view in (sugar.head(), sugar.describe()):
    print(sugar_view)

                Country   sugar_avg
3               Albania   49.504318
4               Algeria   67.995227
7                Angola   33.129091
9   Antigua and Barbuda   98.443182
10            Argentina  111.458182
        sugar_avg
count  179.000000
mean    74.313813
std     42.231186
min      2.677727
25%     37.234187
50%     80.135909
75%    108.406818
max    165.130455


### Load and clean Water Sanitation info

In [9]:
# Read the water-sanitation table; its first column is unnamed in the CSV,
# so pandas labels it 'Unnamed: 0' -- rename it to 'Country'.
cleanwater = pd.read_csv('cleanwater.csv')
cleanwater.rename(columns={'Unnamed: 0': 'Country'}, inplace=True)

# NOTE: the original cell also called `cleanwater.set_index('Country')` here
# and discarded the result, so it never had any effect. It is removed rather
# than "fixed": 'Country' must stay a regular column for the positional
# column slicing below and for merging on 'Country' later in the notebook.

# Discard rows that have no value in any column after 'Country'.
cleanwater.dropna(how='all', subset=list(cleanwater.columns)[1:], inplace=True)

# Skip the first two remaining rows (presumably non-data header rows -- the
# printed output starts at index 2 with Afghanistan; confirm against the CSV).
# The explicit .copy() makes this an independent frame, so the column
# assignments below do not trigger SettingWithCopyWarning.
cleanwater = cleanwater.iloc[2:, :].copy()

# These year columns need an explicit conversion to numeric before averaging.
for water_year in ['2004', '2003', '2002', '2001', '2000']:
    cleanwater[water_year] = pd.to_numeric(cleanwater[water_year])

# Row-wise mean over every numeric column.
# NOTE(review): this averages *all* numeric columns, not only the five
# converted above -- confirm the CSV holds no other numeric columns.
cleanwater['water_avg'] = cleanwater.mean(numeric_only=True, axis=1)

# Keep only the first column ('Country') and the freshly added average.
cleanwater.drop(list(cleanwater.columns)[1:cleanwater.shape[1]-1], axis=1, inplace=True)
print(cleanwater.head())
print(cleanwater.describe())

       Country  water_avg
2  Afghanistan       30.2
3      Albania       88.0
4      Algeria       90.4
5      Andorra      100.0
6       Angola       38.4
        water_avg
count  188.000000
mean    81.165603
std     21.293685
min     17.000000
25%     68.550000
50%     89.500000
75%     98.050000
max    100.000000


### Merge all data and view basic correlations

In [12]:
# Inner-join every cleaned table onto the dental records by 'Country', so only
# countries present in all six tables remain. The join order is unchanged
# (literacy, cleanwater, GDP, sugar, healthexp), which fixes the column order.
demographics = badteeth
for demographic_table in (literacy, cleanwater, GDP, sugar, healthexp):
    demographics = demographics.merge(demographic_table, on='Country', how='inner')

# Persist the merged table (the row index is written as the first, unnamed column).
demographics.to_csv('demographics.csv')

# Spearman rank correlation between the numeric columns. 'Country' holds
# strings, and pandas >= 2.0 no longer drops non-numeric columns silently in
# DataFrame.corr (it raises ValueError), so select the numeric columns
# explicitly; the result matches what older pandas produced.
dem_corr = demographics.select_dtypes(include='number').corr(method='spearman')
dem_corr

Unnamed: 0,badteeth,literacy_avg,water_avg,GDP_avg,sugar_avg,health_avg
badteeth,1.0,0.398946,0.376938,0.347446,0.351951,0.310519
literacy_avg,0.398946,1.0,0.624686,0.570191,0.492863,0.580152
water_avg,0.376938,0.624686,1.0,0.81285,0.66353,0.820977
GDP_avg,0.347446,0.570191,0.81285,1.0,0.763991,0.947432
sugar_avg,0.351951,0.492863,0.66353,0.763991,1.0,0.750462
health_avg,0.310519,0.580152,0.820977,0.947432,0.750462,1.0
