In [265]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# from sklearn.preprocessing import Imputer # this one is old, now use SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# from sklearn.feature_selection import RFE
# from sklearn.linear_model import BayesianRidge
# import statsmodels.api as sm
from warnings import simplefilter
# Ignore every FutureWarning raised from here on.
# NOTE(review): presumably this hides scikit-learn's default-change warnings
# (the recorded estimator reprs show solver='warn' / multi_class='warn') — confirm.
simplefilter(action='ignore', category=FutureWarning)

In [266]:
# Read both CSV files, using their first column as the row index:
# the complete data set and the variant that contains missing values.
df_orig, df_miss = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('rain.csv', 'RAIN_MISS.csv')
)

In [267]:
# Remove the extra columns ('id' and the two indicator columns) that were
# created in SAS; they are not part of the original feature set.
df_miss = df_miss.drop(columns=['id', 'Missing1', 'Missing2'])

In [268]:
# Number of columns holding at least one missing value: first for the
# complete data, then for the data with gaps (one count per line).
print(
    *(frame.isnull().any().sum() for frame in (df_orig, df_miss)),
    sep='\n',
)

0
4


In [269]:
# Count the missing entries in each of the four listed columns. The final
# expression is left bare so the notebook renders the resulting Series.
print('Number of missing values in columns\n')
(
    df_miss
    .loc[:, ['Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm']]
    .isnull()
    .sum()
)

Number of missing values in columns



Humidity9am    22785
Humidity3pm    22785
Pressure9am    34035
Pressure3pm    34035
dtype: int64

Data preparation

In [270]:
# One-hot encode each frame with pandas' default get_dummies settings.
df_dummies_orig, df_dummies_miss = [
    pd.get_dummies(frame) for frame in (df_orig, df_miss)
]

In [271]:
# Separate the dependent variable ('RainTomorrow') from the predictors,
# once for the complete data and once for the data with missing values.
y_orig = df_dummies_orig['RainTomorrow']
X_orig = df_dummies_orig.drop(columns='RainTomorrow')

y_miss = df_dummies_miss['RainTomorrow']
X_miss = df_dummies_miss.drop(columns='RainTomorrow')

In [272]:
# Hold out 30% of the rows for testing, using the same seed for both data sets.
# NOTE(review): presumably both frames share the same rows in the same order,
# so the two partitions line up — verify.
(X_train_orig, X_test_orig,
 y_train_orig, y_test_orig) = train_test_split(
    X_orig, y_orig, test_size=0.3, random_state=42,
)
(X_train_miss, X_test_miss,
 y_train_miss, y_test_miss) = train_test_split(
    X_miss, y_miss, test_size=0.3, random_state=42,
)

Logistic regression on the original dataset

In [273]:
# Baseline classifier for the complete data, created with library defaults.
# NOTE(review): the recorded repr shows solver='warn', i.e. the solver is left
# to the library default; consider pinning it so results do not shift between
# scikit-learn versions.
logreg_orig = LogisticRegression()

In [274]:
# Train the baseline model on the training split of the complete data.
logreg_orig.fit(X_train_orig, y_train_orig)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [275]:
# Class predictions for the held-out split of the complete data.
# NOTE(review): y_pred_orig is not read again in the cells visible here.
y_pred_orig = logreg_orig.predict(X_test_orig)

In [276]:
# Score of the baseline on the held-out split (for scikit-learn classifiers
# .score() reports mean accuracy). The value is stored for the summary cell
# and echoed by the bare trailing expression.
logreg_orig_score = logreg_orig.score(X_test_orig, y_test_orig)
logreg_orig_score

0.8521459354153138

Logistic regression on the imputed dataset
- Imputing with the mean

In [277]:
# Two-step pipeline: replace every NaN with the mean of its column, then
# fit a logistic regression on the completed features.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
logreg_miss_mean = LogisticRegression()
steps_mean = [
    ('imputation', imp_mean),
    ('logistic_regression', logreg_miss_mean),
]
pipeline_mean = Pipeline(steps=steps_mean)

In [278]:
# Fit the imputer and the classifier together on the training split only,
# so the imputation statistics are learned without seeing the test rows.
pipeline_mean.fit(X_train_miss, y_train_miss)

Pipeline(memory=None,
         steps=[('imputation',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('logistic_regression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [279]:
# Predictions for the held-out split; the pipeline applies the already-fitted
# mean imputer to the test features before classifying them.
y_pred_miss_mean = pipeline_mean.predict(X_test_miss)

In [280]:
# Held-out score of the mean-imputation pipeline (mean accuracy for
# scikit-learn classifiers); stored for the summary cell and echoed below.
logreg_miss_mean_score = pipeline_mean.score(X_test_miss, y_test_miss)
logreg_miss_mean_score

0.840722592833107

- Multivariate feature imputation 

In [281]:
# Two-step pipeline: fill missing values with IterativeImputer (at most 25
# rounds, fixed seed), then fit a logistic regression on the result.
imp_multivar = IterativeImputer(random_state=0, max_iter=25)
logreg_miss_multivar = LogisticRegression()
steps_multivar = [
    ('imputation', imp_multivar),
    ('logistic_regression', logreg_miss_multivar),
]
pipeline_multivar = Pipeline(steps=steps_multivar)

In [282]:
# Fit the iterative imputer and the classifier together on the training split only.
pipeline_multivar.fit(X_train_miss, y_train_miss)

Pipeline(memory=None,
         steps=[('imputation',
                 IterativeImputer(add_indicator=False, estimator=None,
                                  imputation_order='ascending',
                                  initial_strategy='mean', max_iter=25,
                                  max_value=None, min_value=None,
                                  missing_values=nan, n_nearest_features=None,
                                  random_state=0, sample_posterior=False,
                                  tol=0.001, verbose=0)),
                ('logistic_regression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbos

In [283]:
# Predictions for the held-out split from the multivariate-imputation model.
# This must call pipeline_multivar (not pipeline_mean) so that these labels
# come from the same pipeline whose score is reported in the next cell.
y_pred_miss_multivar = pipeline_multivar.predict(X_test_miss)

In [284]:
# Held-out score of the multivariate-imputation pipeline (mean accuracy for
# scikit-learn classifiers); stored for the summary cell and echoed below.
logreg_miss_multivar_score = pipeline_multivar.score(X_test_miss, y_test_miss)
logreg_miss_multivar_score

0.8468917881811205

Summary of models

In [288]:
# Side-by-side report of the three held-out scores: complete data,
# mean imputation, and multivariate imputation (one per output line).
print(
    'logreg score of \n original dataset:', logreg_orig_score,
    '\n', 'imputed with mean:', logreg_miss_mean_score,
    '\n', 'multivariate feature imputation:', logreg_miss_multivar_score,
)

logreg score of 
 original dataset: 0.8521459354153138 
 imputed with mean: 0.840722592833107 
 multivariate feature imputation: 0.8468917881811205


In [286]:
# from sklearn.metrics import classification_report
# print(classification_report(y_test_miss, y_pred_miss_multivar))