In [1]:
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import linear_model
from sklearn import preprocessing
import numpy as np
import statsmodels.api as sm

from stats_utils import MLR

# Location of the WHO-SIMPLE CSV, relative to the notebook's working directory.
WHO_CSV_PATH = "../data/WHO-SIMPLE.csv"
df = pd.read_csv(WHO_CSV_PATH)

In [2]:
# Display the column labels of the loaded table.
df.columns

Index(['Country_Year', 'MDG_0000000001', 'PCV3', 'ROTAC', 'WHS4_100',
       'WHS4_117', 'WHS4_129', 'WHS4_543', 'WHS4_544', 'WHS8_110', 'MCV2',
       'NUTRITION_564', 'WHS4_128', 'LBW_NUMBER', 'LBW_PREVALENCE',
       'NUTRITION_HA_2', 'NUTRITION_WA_2', 'NUTRITION_WH2', 'NUTRITION_WH_2',
       'WHOSIS_000005', 'WHOSIS_000006', 'MDG_0000000026', 'WHS9_95',
       'WHS_PBR', 'WSH_2', 'WSH_3', 'WSH_SANITATION_SAFELY_MANAGED',
       'M_Est_smk_curr', 'M_Est_smk_daily', 'TOBACCO_0000000192',
       'GHED_CHEGDP_SHA2011', 'WHS9_85', 'Country Code', 'Year', 'GDP'],
      dtype='object')

In [3]:
# Keep only rows whose Country Code is "USA" and whose Year is 2000 or later.
usa = df[(df['Country Code'] == "USA") & (df['Year'] >= 2000)]

In [6]:
# Drop every column that has fewer than 5 non-null values in the USA subset.
usa = usa.dropna(axis="columns", thresh=5)

In [9]:
# Remove the PCV3 and ROTAC columns from the USA subset.
usa = usa.drop(columns=['PCV3', 'ROTAC'])

In [11]:
# Keep only the rows in which every remaining column is populated.
usa = usa.dropna(axis="index", how="any")

In [13]:
# Display the columns that remain in the USA subset after the cleaning steps.
usa.columns

Index(['Country_Year', 'MDG_0000000001', 'WHS4_100', 'WHS4_117', 'WHS4_129',
       'WHS4_544', 'WHS8_110', 'MCV2', 'LBW_NUMBER', 'LBW_PREVALENCE',
       'MDG_0000000026', 'WSH_SANITATION_SAFELY_MANAGED',
       'GHED_CHEGDP_SHA2011', 'Country Code', 'Year', 'GDP'],
      dtype='object')

In [15]:
# Predictors: every column left in the cleaned USA frame apart from
# Country_Year, Country Code, Year and the response column itself.
explanatory_vars = [
    'WHS4_100', 'WHS4_117', 'WHS4_129', 'WHS4_544', 'WHS8_110',
    'MCV2', 'LBW_NUMBER', 'LBW_PREVALENCE', 'MDG_0000000026',
    'WSH_SANITATION_SAFELY_MANAGED', 'GHED_CHEGDP_SHA2011', 'GDP',
]
response_var = 'MDG_0000000001'

# MLR is the project helper imported from stats_utils; its return value is
# left as the cell's last expression so the notebook echoes it.
MLR(usa, response_var, explanatory_vars)

MLR Results using Sci-kit Learn:
Intercept: 
 84.34212770208713
Coefficients: 
 [ 7.38891990e-03 -3.23413748e-03 -7.13132612e-03 -8.72731954e-03
  1.29458569e-02 -1.54420746e-04  4.35818395e-04  1.32798734e-01
 -1.36600617e-02 -8.88507285e-01 -2.91914218e-02  4.02581614e-14]

                            OLS Regression Results                            
Dep. Variable:         MDG_0000000001   R-squared:                       0.999
Model:                            OLS   Adj. R-squared:                  0.998
Method:                 Least Squares   F-statistic:                     626.9
Date:                Fri, 26 Mar 2021   Prob (F-statistic):           4.20e-07
Time:                        11:48:45   Log-Likelihood:                 48.430
No. Observations:                  16   AIC:                            -74.86
Df Residuals:                       5   BIC:                            -66.36
Df Model:                          10                                         
Covariance T

  "anyway, n=%i" % int(n))


(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1e9a7bb8d30>,
 10659    7.102501
 10660    7.048594
 10661    6.928501
 10662    6.885447
 10663    6.811521
 10664    6.743192
 10665    6.677381
 10666    6.569558
 10667    6.469457
 10668    6.361755
 10669    6.233547
 10670    6.146232
 10671    6.034202
 10672    5.974738
 10673    5.876853
 10674    5.807188
 dtype: float64)

## The code below eliminates columns with fewer than 500 non-null values, and rows with fewer than 52.7% of their cells populated

In [None]:
# Examine the data frame and determine which year has the most complete data set.
# A notebook cell only renders its LAST expression, so each intermediate
# inspection is shown explicitly with display() (an IPython built-in).
display(df.head())
display(df.describe())
display(df.groupby(['Year']).count().sum(axis=1))  # total non-null cells per year

# Keep only the columns that have at least 500 non-null values.
df = df.dropna(thresh=500, axis=1)

# Keep only the rows that have at least 16 non-null cells. The helper column
# 'SUM' is excluded from its own count so that re-running this cell does not
# inflate every row's total by one.
df['SUM'] = df.drop(columns='SUM', errors='ignore').count(axis=1)
df = df[df['SUM'] >= 16]

## Below is the MLR code; we must first preprocess the data

In [None]:
# def scatterplot_2vars(df, x, y):
#     plt.scatter(df[x], df[y], color='green')
#     plt.title('{} Vs {}'.format(x,y), fontsize=14)
#     plt.xlabel(x, fontsize=14)
#     plt.ylabel(y, fontsize=14)
#     plt.grid(True)
#     plt.show()

# def histogram(df, x):
#     plt.hist(df[x], bins="auto", range=(0,df[x].max()))
#     plt.title('{}'.format(x), fontsize=14)
#     plt.xlabel(x, fontsize=14)
#     plt.grid(True)
#     plt.show()

In [None]:
# # This is to examine whether the relationships are linear. Not all are, but many look good. A few may need a
# # transformation (e.g. GDP), and a few may ultimately not work.
# columns_of_interest = ['PCV3',#'ROTAC','NUTRITION_564',
#                        'WHS4_100','WHS4_117','WHS4_129','WHS4_543','WHS4_544','WHS8_110',
#                        'MCV2','WHS4_128','LBW_NUMBER','LBW_PREVALENCE',
#                        #'NUTRITION_HA_2','NUTRITION_WA_2',#'NUTRITION_WH2','NUTRITION_WH_2','WHOSIS_000006',
#                        'MDG_0000000026','WSH_SANITATION_SAFELY_MANAGED',
#                        #'WHS9_95','WHS_PBR','WSH_2','WSH_3',
#                        #'M_Est_smk_curr','M_Est_smk_daily','TOBACCO_0000000192',
#                        'GHED_CHEGDP_SHA2011','GDP']

# df_transformed = df.copy()
# log_col_transform = ['GDP','LBW_NUMBER','LBW_PREVALENCE','MDG_0000000026','GHED_CHEGDP_SHA2011']

# for col in columns_of_interest:
#     df_transformed[col] = np.log(df[col])
#     scatterplot_2vars(df_transformed, col, 'MDG_0000000001')
#     histogram(df_transformed, col)

# histogram(df_transformed, 'MDG_0000000001')

In [None]:
# Count the non-null values in each column and display the result.
counts = df.count()
counts

In [None]:
# Labels of every column that contains at least one null value.
haveNAN = df.columns[df.isnull().any(axis=0)]


In [None]:
# Display the column labels collected in haveNAN.
haveNAN

In [None]:
# Summary statistics for df's columns.
df.describe()

In [None]:
# Process through all the columns with NaNs: bin each one into 100 left-closed
# intervals and prepend one extra interval that holds the missing values.
MISSING_SENTINEL = -1  # placeholder written into NaN cells before binning

for feature in haveNAN:
    print(feature)
    bins = df[feature].value_counts(bins=100, sort=False, dropna=False)
    bins_index = bins.index.set_closed("left")

    # A left-closed interval excludes its right edge, so the column maximum --
    # which lies on (or, after label rounding, just above) the last edge --
    # would match no bin and pd.cut would turn it back into NaN. Stretch the
    # last bin just past the maximum so every observed value stays covered.
    col_max = df[feature].max()
    last_bin = bins_index[-1]
    if col_max >= last_bin.right:
        widened = pd.Interval(last_bin.left, np.nextafter(col_max, np.inf), closed="left")
        bins_index = bins_index[:-1].insert(len(bins_index) - 1, widened)

    lowest_edge = bins_index[0].left
    print("first_interval should be ({},{}, closed='left')\n".format(MISSING_SENTINEL, lowest_edge))

    # The sentinel must lie strictly below every real bin. Otherwise the
    # missing-value interval is empty or invalid, and the filled-in sentinel
    # cannot be told apart from genuine data.
    if lowest_edge <= MISSING_SENTINEL:
        raise ValueError(
            "{}: lowest bin edge {} is not above the missing-value sentinel {}; "
            "choose a smaller sentinel for this column".format(
                feature, lowest_edge, MISSING_SENTINEL
            )
        )
    first_interval = pd.Interval(MISSING_SENTINEL, lowest_edge, closed='left')
    bins_index = bins_index.insert(0, first_interval)

    # `right` is ignored by pd.cut when bins is an IntervalIndex, so it is omitted.
    df[feature] = pd.cut(df[feature].fillna(MISSING_SENTINEL), bins=bins_index)

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Frequency of each distinct entry in the PCV3 column (these are interval bins
# if PCV3 was among the columns processed by the NaN-binning loop).
df['PCV3'].value_counts()

In [None]:
# Experiment only: factorize assigns one integer code per distinct value, and
# the codes carry no ordering, so the encoding is of little use beyond (per the
# original author's note) showing that -1 stands for missing values.
codes, uniques = pd.factorize(df['PCV3'])

In [None]:
# Display the integer codes returned by pd.factorize for PCV3.
codes

In [None]:
# Display the distinct PCV3 values that the codes index into.
uniques

In [None]:
# Assemble the modelling table: predictors plus the response, complete rows only.
# NOTE(review): `columns_of_interest` and `df_transformed` are only assigned inside
# a commented-out exploration cell earlier in this notebook, so this cell raises
# NameError on a fresh kernel unless they are defined somewhere else -- confirm.
x_and_y_cols = [*columns_of_interest, 'MDG_0000000001']
df_no_nan = df_transformed[x_and_y_cols].copy().dropna()

# Split into the predictor matrix and the response vector.
X = df_no_nan[columns_of_interest]
Y = df_no_nan['MDG_0000000001']

In [None]:
# Number of complete rows available for the regression.
len(X)


In [None]:
# Fit the same multiple linear regression twice: scikit-learn for the raw
# intercept/coefficients, statsmodels for the full inferential summary.
regr = linear_model.LinearRegression()
# X is overwritten further down with the constant-augmented design matrix, so
# on a re-run of this cell it already carries the 'const' column. Dropping it
# here (a no-op on the first run) keeps the scikit-learn fit limited to the
# real predictors every time the cell is executed.
regr.fit(X.drop(columns='const', errors='ignore'), Y)

print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)

# with statsmodels
# add_constant leaves X unchanged if it already contains a constant column,
# so this line is safe to execute repeatedly.
X = sm.add_constant(X) # adding a constant

model = sm.OLS(Y, X).fit()
predictions = model.predict(X)  # in-sample fitted values

print_model = model.summary()
print(print_model)

In [None]:
# Confidence intervals for the fitted OLS parameters (statsmodels' default level).
model.conf_int()