In [8]:
# Specific Libraries
import datawig

# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

In [9]:
# Import into pandas dataframe
df = pd.read_excel('16raw.xlsx')
        
df_train = df.loc[df['Missing'] == 0]
df_test = df.loc[df['Missing'] == 1]

In [11]:
# Uses values provided from World Happiness Report
predictors = ["Life Ladder","Log GDP per capita (PPP)", "Social Support", "Log Healthy life expectancy at birth", "Freedom to make life choices", "Generosity", "Perceptions of corruption", "Happiness"]

# 1. `Log GDP (PPP)` Imputation

In [7]:
#Initialize a SimpleImputer model
imputer = datawig.SimpleImputer(
    input_columns=predictors, # column(s) containing information about the column we want to impute
    output_column='Log GDP per capita (PPP)', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer.fit(train_df=df_train, num_epochs=50)

#Impute missing values and return original dataframe with predictions
imputed = imputer.predict(df_test)

outputdf = imputed[['Country','Log GDP per capita (PPP)','Log GDP per capita (PPP)_imputed']]

# Code to generate error between `Log GDP per capita` and the `imputed Log GDP per capita`.
y_errs = 100 * abs(imputed["Log GDP per capita (PPP)"] - imputed["Log GDP per capita (PPP)_imputed"]) / imputed["Log GDP per capita (PPP)"]
# Code to generate dataframe to the existing data frame
y_errs = pd.DataFrame(y_errs, columns = ["Error"], index = outputdf.index)
# Code to concat error dataframe to the existing data frame
outputdf_final = pd.concat([outputdf, y_errs], axis = 1)

#Print output
outputdf_final

#Outputs to excel with imputed values
export_excel = outputdf_final.to_excel (r'C:\Users\SAMARITAN\Documents\Jupyter_Thunder\Final_Project\1. Log GDP per capita (PPP)_imputed.xlsx', index = None, header=True) #Don't forget to add '.xlsx' at the end of the path

2019-04-15 16:18:09,614 [INFO]  Assuming 8 numeric input columns: Life Ladder, Log GDP per capita (PPP), Social Support, Healthy life expectancy at birth, Freedom to make life choices, Generosity, Perceptions of corruption, Happiness
2019-04-15 16:18:09,615 [INFO]  Assuming 0 string input columns: 
2019-04-15 16:18:09,618 [INFO]  No output column name provided for ColumnEncoder using Log GDP per capita (PPP)
2019-04-15 16:18:09,620 [INFO]  Assuming numeric output column: Log GDP per capita (PPP)
2019-04-15 16:18:09,623 [INFO]  Using [[cpu(0)]] as the context for training
2019-04-15 16:18:09,628 [INFO]  Fitting label encoder <class 'datawig.column_encoders.NumericalEncoder'> on 108 rows                             of training data
2019-04-15 16:18:09,635 [INFO]  Detected 0 rows with missing labels                         for column Log GDP per capita (PPP)
2019-04-15 16:18:09,637 [INFO]  Dropping 0/108 rows
2019-04-15 16:18:09,640 [INFO]  Detected 0 rows with missing labels             

2019-04-15 16:18:10,135 [INFO]  Epoch[5] Time cost=0.032
2019-04-15 16:18:10,157 [INFO]  Saved checkpoint to "imputer_model\model-0005.params"
2019-04-15 16:18:10,161 [INFO]  Epoch[5] Validation-cross-entropy=0.795112
2019-04-15 16:18:10,163 [INFO]  Epoch[5] Validation-Log GDP per capita (PPP)-accuracy=0.000000
2019-04-15 16:18:10,179 [INFO]  Epoch[6] Batch [0-4]	Speed: 8001.30 samples/sec	cross-entropy=0.421695	Log GDP per capita (PPP)-accuracy=0.000000
2019-04-15 16:18:10,188 [INFO]  Epoch[6] Train-cross-entropy=0.356839
2019-04-15 16:18:10,190 [INFO]  Epoch[6] Train-Log GDP per capita (PPP)-accuracy=0.000000
2019-04-15 16:18:10,194 [INFO]  Epoch[6] Time cost=0.027
2019-04-15 16:18:10,211 [INFO]  Saved checkpoint to "imputer_model\model-0006.params"
2019-04-15 16:18:10,217 [INFO]  Epoch[6] Validation-cross-entropy=0.511512
2019-04-15 16:18:10,218 [INFO]  Epoch[6] Validation-Log GDP per capita (PPP)-accuracy=0.000000
2019-04-15 16:18:10,233 [INFO]  Epoch[7] Batch [0-4]	Speed: 7112.95 

2019-04-15 16:18:10,943 [INFO]  Epoch[18] Validation-cross-entropy=0.161745
2019-04-15 16:18:10,944 [INFO]  Epoch[18] Validation-Log GDP per capita (PPP)-accuracy=0.000000
2019-04-15 16:18:10,963 [INFO]  Epoch[19] Batch [0-4]	Speed: 5332.87 samples/sec	cross-entropy=0.051719	Log GDP per capita (PPP)-accuracy=0.000000
2019-04-15 16:18:10,972 [INFO]  Epoch[19] Train-cross-entropy=0.044119
2019-04-15 16:18:10,974 [INFO]  Epoch[19] Train-Log GDP per capita (PPP)-accuracy=0.000000
2019-04-15 16:18:10,976 [INFO]  Epoch[19] Time cost=0.030
2019-04-15 16:18:11,008 [INFO]  Saved checkpoint to "imputer_model\model-0019.params"
2019-04-15 16:18:11,013 [INFO]  Epoch[19] Validation-cross-entropy=0.152849
2019-04-15 16:18:11,018 [INFO]  Epoch[19] Validation-Log GDP per capita (PPP)-accuracy=0.000000
2019-04-15 16:18:11,036 [INFO]  Epoch[20] Batch [0-4]	Speed: 5815.33 samples/sec	cross-entropy=0.046988	Log GDP per capita (PPP)-accuracy=0.000000
2019-04-15 16:18:11,048 [INFO]  Epoch[20] Train-cross-en

2019-04-15 16:18:11,714 [INFO]  Epoch[31] Validation-Log GDP per capita (PPP)-accuracy=0.000000
2019-04-15 16:18:11,729 [INFO]  Epoch[32] Batch [0-4]	Speed: 9143.52 samples/sec	cross-entropy=0.016993	Log GDP per capita (PPP)-accuracy=0.000000
2019-04-15 16:18:11,738 [INFO]  Epoch[32] Train-cross-entropy=0.014584
2019-04-15 16:18:11,739 [INFO]  Epoch[32] Train-Log GDP per capita (PPP)-accuracy=0.000000
2019-04-15 16:18:11,741 [INFO]  Epoch[32] Time cost=0.024
2019-04-15 16:18:11,759 [INFO]  Saved checkpoint to "imputer_model\model-0032.params"
2019-04-15 16:18:11,764 [INFO]  Epoch[32] Validation-cross-entropy=0.090726
2019-04-15 16:18:11,766 [INFO]  Epoch[32] Validation-Log GDP per capita (PPP)-accuracy=0.000000
2019-04-15 16:18:11,782 [INFO]  Epoch[33] Batch [0-4]	Speed: 7985.82 samples/sec	cross-entropy=0.015766	Log GDP per capita (PPP)-accuracy=0.000000
2019-04-15 16:18:11,789 [INFO]  Epoch[33] Train-cross-entropy=0.013580
2019-04-15 16:18:11,791 [INFO]  Epoch[33] Train-Log GDP per c

2019-04-15 16:18:12,480 [INFO]  Epoch[45] Batch [0-4]	Speed: 9146.32 samples/sec	cross-entropy=0.007168	Log GDP per capita (PPP)-accuracy=0.000000
2019-04-15 16:18:12,493 [INFO]  Epoch[45] Train-cross-entropy=0.006470
2019-04-15 16:18:12,494 [INFO]  Epoch[45] Train-Log GDP per capita (PPP)-accuracy=0.000000
2019-04-15 16:18:12,496 [INFO]  Epoch[45] Time cost=0.028
2019-04-15 16:18:12,515 [INFO]  Saved checkpoint to "imputer_model\model-0045.params"
2019-04-15 16:18:12,520 [INFO]  Epoch[45] Validation-cross-entropy=0.064705
2019-04-15 16:18:12,523 [INFO]  Epoch[45] Validation-Log GDP per capita (PPP)-accuracy=0.000000
2019-04-15 16:18:12,541 [INFO]  Epoch[46] Batch [0-4]	Speed: 5335.95 samples/sec	cross-entropy=0.006781	Log GDP per capita (PPP)-accuracy=0.000000
2019-04-15 16:18:12,551 [INFO]  Epoch[46] Train-cross-entropy=0.006137
2019-04-15 16:18:12,555 [INFO]  Epoch[46] Train-Log GDP per capita (PPP)-accuracy=0.000000
2019-04-15 16:18:12,557 [INFO]  Epoch[46] Time cost=0.031
2019-04-

2019-04-15 16:18:13,075 [INFO]  Top-k only for CategoricalEncoder, dropping Log GDP per capita (PPP), <class 'datawig.column_encoders.NumericalEncoder'>
2019-04-15 16:18:13,076 [INFO]  Precision filtering only for CategoricalEncoder returning                             Log GDP per capita (PPP) unfiltered


# 2. `Social Support` Imputation
Not required, because all data is available.

# 3. Log Healthy life expectancy at birth Imputation

In [12]:
#Initialize a SimpleImputer model
imputer = datawig.SimpleImputer(
    input_columns=predictors, # column(s) containing information about the column we want to impute
    output_column='Log Healthy life expectancy at birth', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer.fit(train_df=df_train, num_epochs=50)

#Impute missing values and return original dataframe with predictions
imputed = imputer.predict(df_test)

outputdf = imputed[['Country','Log Healthy life expectancy at birth','Log Healthy life expectancy at birth_imputed']]

# Code to generate error between `Healthy life expectancy at birth` and the `imputed Healthy life expectancy at birth`.
y_errs = 100 * abs(imputed["Log Healthy life expectancy at birth"] - imputed["Log Healthy life expectancy at birth_imputed"]) / imputed["Healthy life expectancy at birth"]
y_errs = pd.DataFrame(y_errs, columns = ["Error"], index = outputdf.index)
# Code to concat error dataframe to the existing data frame
outputdf_final = pd.concat([outputdf, y_errs], axis = 1)

#Print output
outputdf_final

#Outputs to excel with imputed values
export_excel = outputdf_final.to_excel (r'C:\Users\SAMARITAN\Documents\Jupyter_Thunder\Final_Project\3. Log Healthy life expectancy at birth_imputed.xlsx', index = None, header=True) #Don't forget to add '.xlsx' at the end of the path

2019-04-15 16:56:22,572 [INFO]  Assuming 8 numeric input columns: Life Ladder, Log GDP per capita (PPP), Social Support, Log Healthy life expectancy at birth, Freedom to make life choices, Generosity, Perceptions of corruption, Happiness
2019-04-15 16:56:22,577 [INFO]  Assuming 0 string input columns: 
2019-04-15 16:56:22,585 [INFO]  No output column name provided for ColumnEncoder using Log Healthy life expectancy at birth
2019-04-15 16:56:22,586 [INFO]  Assuming numeric output column: Log Healthy life expectancy at birth
2019-04-15 16:56:22,604 [INFO]  Using [[cpu(0)]] as the context for training
2019-04-15 16:56:22,614 [INFO]  Fitting label encoder <class 'datawig.column_encoders.NumericalEncoder'> on 108 rows                             of training data
2019-04-15 16:56:22,633 [INFO]  Detected 0 rows with missing labels                         for column Log Healthy life expectancy at birth
2019-04-15 16:56:22,635 [INFO]  Dropping 0/108 rows
2019-04-15 16:56:22,638 [INFO]  Detected

2019-04-15 16:56:23,112 [INFO]  Epoch[5] Batch [0-4]	Speed: 8002.97 samples/sec	cross-entropy=0.892533	Log Healthy life expectancy at birth-accuracy=0.000000
2019-04-15 16:56:23,119 [INFO]  Epoch[5] Train-cross-entropy=0.746002
2019-04-15 16:56:23,120 [INFO]  Epoch[5] Train-Log Healthy life expectancy at birth-accuracy=0.000000
2019-04-15 16:56:23,122 [INFO]  Epoch[5] Time cost=0.023
2019-04-15 16:56:23,143 [INFO]  Saved checkpoint to "imputer_model\model-0005.params"
2019-04-15 16:56:23,148 [INFO]  Epoch[5] Validation-cross-entropy=0.645883
2019-04-15 16:56:23,151 [INFO]  Epoch[5] Validation-Log Healthy life expectancy at birth-accuracy=0.000000
2019-04-15 16:56:23,166 [INFO]  Epoch[6] Batch [0-4]	Speed: 8001.53 samples/sec	cross-entropy=0.586197	Log Healthy life expectancy at birth-accuracy=0.000000
2019-04-15 16:56:23,172 [INFO]  Epoch[6] Train-cross-entropy=0.475609
2019-04-15 16:56:23,173 [INFO]  Epoch[6] Train-Log Healthy life expectancy at birth-accuracy=0.000000
2019-04-15 16:5

# 4. `Freedom to make life choices` Imputation

In [None]:
#Initialize a SimpleImputer model
imputer = datawig.SimpleImputer(
    input_columns=predictors, # column(s) containing information about the column we want to impute
    output_column='Freedom to make life choices', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer.fit(train_df=df_train, num_epochs=50)

#Impute missing values and return original dataframe with predictions
imputed = imputer.predict(df_test)

outputdf = imputed[['Country','Freedom to make life choices','Freedom to make life choices_imputed']]

# Code to generate error between `Freedom to make life choices` and the `imputed Freedom to make life choices`.
y_errs = 100 * abs(imputed["Freedom to make life choices"] - imputed["Freedom to make life choices_imputed"]) / imputed["Freedom to make life choices"]
y_errs = pd.DataFrame(y_errs, columns = ["Error"], index = outputdf.index)
# Code to concat error dataframe to the existing data frame
outputdf_final = pd.concat([outputdf, y_errs], axis = 1)

#Print output
outputdf_final

#Outputs to excel with imputed values
export_excel = outputdf_final.to_excel (r'C:\Users\SAMARITAN\Documents\Jupyter_Thunder\Final_Project\4. Freedom to make life choices_imputed.xlsx', index = None, header=True) #Don't forget to add '.xlsx' at the end of the path

# 5. 'Generosity' Imputation

In [None]:
#Initialize a SimpleImputer model
imputer = datawig.SimpleImputer(
    input_columns=predictors, # column(s) containing information about the column we want to impute
    output_column='Generosity', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer.fit(train_df=df_train, num_epochs=50)

#Impute missing values and return original dataframe with predictions
imputed = imputer.predict(df_test)

outputdf = imputed[['Country','Generosity','Generosity_imputed']]

# Code to generate error between `Generosity` and the `imputed Generosity`.
y_errs = 100 * abs(imputed["Generosity"] - imputed["Generosity_imputed"]) / imputed["Generosity"]
y_errs = pd.DataFrame(y_errs, columns = ["Error"], index = outputdf.index)
# Code to concat error dataframe to the existing data frame
outputdf_final = pd.concat([outputdf, y_errs], axis = 1)

#Print output
outputdf_final

#Outputs to excel with imputed values
export_excel = outputdf_final.to_excel (r'C:\Users\SAMARITAN\Documents\Jupyter_Thunder\Final_Project\5. Generosity_imputed.xlsx', index = None, header=True) #Don't forget to add '.xlsx' at the end of the path

# 6. 'Perceptions of corruption' Imputation

In [None]:
#Initialize a SimpleImputer model
imputer = datawig.SimpleImputer(
    input_columns=predictors, # column(s) containing information about the column we want to impute
    output_column='Perceptions of corruption', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer.fit(train_df=df_train, num_epochs=50)

#Impute missing values and return original dataframe with predictions
imputed = imputer.predict(df_test)

outputdf = imputed[['Country','Perceptions of corruption','Perceptions of corruption_imputed']]

# Code to generate error between `Perceptions of corruption` and the `imputed Perceptions of corruption`.
y_errs = 100 * abs(imputed["Perceptions of corruption"] - imputed["Perceptions of corruption_imputed"]) / imputed["Perceptions of corruption"]
y_errs = pd.DataFrame(y_errs, columns = ["Error"], index = outputdf.index)
# Code to concat error dataframe to the existing data frame
outputdf_final = pd.concat([outputdf, y_errs], axis = 1)

#Print output
outputdf_final

#Outputs to excel with imputed values
export_excel = outputdf_final.to_excel (r'C:\Users\SAMARITAN\Documents\Jupyter_Thunder\Final_Project\6. Perceptions of corruption_imputed.xlsx', index = None, header=True) #Don't forget to add '.xlsx' at the end of the path

# 7. 'Happiness' Imputation
For `Hong Kong`.

In [None]:
#Initialize a SimpleImputer model
imputer = datawig.SimpleImputer(
    input_columns=predictors, # column(s) containing information about the column we want to impute
    output_column='Happiness', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer.fit(train_df=df_train, num_epochs=50)

#Impute missing values and return original dataframe with predictions
imputed = imputer.predict(df_test)

outputdf = imputed[['Country','Happiness','Happiness_imputed']]

# Code to generate error between `Happiness` and the `imputed Happiness`.
y_errs = 100 * abs(imputed["Happiness"] - imputed["Happiness_imputed"]) / imputed["Happiness"]
y_errs = pd.DataFrame(y_errs, columns = ["Error"], index = outputdf.index)
# Code to concat error dataframe to the existing data frame
outputdf_final = pd.concat([outputdf, y_errs], axis = 1)

#Print output
outputdf_final

#Outputs to excel with imputed values
export_excel = outputdf_final.to_excel (r'C:\Users\SAMARITAN\Documents\Jupyter_Thunder\Final_Project\7. Happiness_imputed.xlsx', index = None, header=True) #Don't forget to add '.xlsx' at the end of the path