In [19]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.utils import resample

from imblearn.over_sampling import SMOTE

In [20]:
X = pd.read_csv('references/training_set_values.csv', index_col = 'id')
y = pd.read_csv('references/training_set_labels.csv', index_col = 'id')['status_group']
df = X.copy()
df['target'] = y.copy()

In [21]:
df['target'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: target, dtype: int64

In [22]:
# df.drop[col = ['num_private'], inplace = True]

In [23]:
df.head()

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


# Basic FSM

A basic logistic regression model that does not have anything applied to it other than a max_iter to keep errors from popping up.

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.25)


In [25]:
X_train_num = X_train.select_dtypes(exclude = ['object'])
lg = LogisticRegression(max_iter = 1000)
lg.fit(X_train_num, y_train)
cross_val_score(lg, X_train_num, y_train, cv = 5)

array([0.55566779, 0.55185185, 0.54848485, 0.54893378, 0.55252525])

The basic FSM has an r<sup>2</sup> score of around 0.55

In [9]:
X['construction_year'].replace({0: np.nan}, inplace = True)

In [10]:
X.isna().sum()

amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year        20709
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_group                0
quantity

In [11]:
# keep nans in construction year; create a new column that marks if it's nan or not
# decision tree

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.25)
X_train_num = X.select_dtypes(exclude = ['object'])

In [13]:
si = SimpleImputer()
X_train_num_si = pd.DataFrame(si.fit_transform(X_train_num), index = X_train_num.index, columns = X_train_num.columns)
X_train_num_si['construction_year'].value_counts()

1996.814686    20709
2010.000000     2645
2008.000000     2613
2009.000000     2533
2000.000000     2091
2007.000000     1587
2006.000000     1471
2003.000000     1286
2011.000000     1256
2004.000000     1123
2012.000000     1084
2002.000000     1075
1978.000000     1037
1995.000000     1014
2005.000000     1011
1999.000000      979
1998.000000      966
1990.000000      954
1985.000000      945
1996.000000      811
1980.000000      811
1984.000000      779
1982.000000      744
1994.000000      738
1972.000000      708
1974.000000      676
1997.000000      644
1992.000000      640
1993.000000      608
2001.000000      540
1988.000000      521
1983.000000      488
1975.000000      437
1986.000000      434
1976.000000      414
1970.000000      411
1991.000000      324
1989.000000      316
1987.000000      302
1981.000000      238
1977.000000      202
1979.000000      192
1973.000000      184
2013.000000      176
1971.000000      145
1960.000000      102
1967.000000       88
1963.000000  

# Upsampling our minority targets

In [14]:
# df_0 = df[df.target == 'functional']
# df_1 = df[df.target == 'functional needs repair']
# df_2 = df[df.target == 'non functional']

# df_1_upsample = resample(df_1, replace = True, n_samples = 32259, random_state = 42)
# df_2_upsample = resample(df_2, replace = True, n_samples = 32259, random_state = 42)
# df_up = pd.concat([df_0, df_1_upsample, df_2_upsample])

In [15]:
# df['target'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: target, dtype: int64

In [16]:
# df_up['target'].value_counts()

functional needs repair    32259
non functional             32259
functional                 32259
Name: target, dtype: int64

In [17]:
# X_train, X_test, y_train, y_test = train_test_split(df_up.drop(columns = ['target']), df_up['target'], random_state = 42, test_size = 0.25)
# X_train_num = X_train.select_dtypes(exclude = ['object'])
# lg = LogisticRegression(max_iter = 1000)
# lg.fit(X_train_num, y_train)
# cross_val_score(lg, X_train_num, y_train, cv = 5)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


KeyboardInterrupt: 

In [28]:
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_sample(X_train_num, y_train)

In [30]:
X_train_num.shape

(44550, 9)

In [31]:
X_train.shape

(44550, 39)

In [32]:
X_train_resampled.shape

(72483, 9)

In [33]:
y_train.shape

(44550,)

In [34]:
y_train_resampled.shape

(72483,)

In [36]:
y_train.value_counts()

functional                 24161
non functional             17146
functional needs repair     3243
Name: status_group, dtype: int64

In [37]:
y_train_resampled.value_counts()

functional needs repair    24161
non functional             24161
functional                 24161
Name: status_group, dtype: int64