<a href="https://colab.research.google.com/github/ShreyasJothish/taarifa_water_pumps/blob/master/DS1_Predictive_Modeling_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Load the data from Kaggle
!pip install kaggle



In [2]:
# Mount the drive to download the data from Kaggle
from google.colab import drive
drive.mount('/content/drive')
%env KAGGLE_CONFIG_DIR=/content/drive/My Drive

!kaggle competitions download -c ds1-predictive-modeling-challenge

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
env: KAGGLE_CONFIG_DIR=/content/drive/My Drive
sample_submission.csv: Skipping, found more recently modified local copy (use --force to force download)
test_features.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
train_labels.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
train_features.csv.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
# Extract the csv files
!unzip train_features.csv.zip 
!unzip train_labels.csv.zip 
!unzip test_features.csv.zip

Archive:  train_features.csv.zip
replace train_features.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N
Archive:  train_labels.csv.zip
replace train_labels.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N
Archive:  test_features.csv.zip
replace test_features.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [4]:
!pip install -U seaborn

Requirement already up-to-date: seaborn in /usr/local/lib/python3.6/dist-packages (0.9.0)


In [0]:
# Generic imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [0]:
# Loading the independent features as X and
# dependent variable as y
"""nan_values_dict = {'funder': ['Not Known', 'Unknown', 'None'],
                   'installer': ['Not known', 'not known', '-',
                                 'unknown', 'Unknown', 'Unknown Installer'],
                   'subvillage': ['##'],
                   'scheme_management': ['None'],
                   'scheme_name': ['None', 'not known', 'none']    
}"""

nan_values_list = ['Not Known', 'Unknown', 'None', 'Not known', 'not known', 
                   '-', 'unknown', 'Unknown Installer', '##', 'none']

train_features_df = pd.read_csv('train_features.csv', na_values=nan_values_list)
train_labels_df = pd.read_csv('train_labels.csv')

In [0]:
# Finding other possible NaN values 
null_cols = []

for col in train_features_df.columns:
  null_count = train_features_df[col].isnull().sum()
  null_cols.append(col)
  print(f'Column name {col}:')
  print(f'\tType {train_features_df[col].dtype}')
  print(f'\tNull count {null_count}')
  unique_values = train_features_df[col].unique()
  print(f'\tValue count {len(unique_values)}')
  #for value in unique_values:
    #print(f'\t\t{value}')

In [0]:
# Selecting independent and dependent variables.
# Iteration 1:
# X = train_features_df.select_dtypes(include=np.number)

# Iteration 2:
X = train_features_df.drop(columns =['id', 'date_recorded', 
                                                   'funder', 'installer', 
                                                   'wpt_name', 'subvillage', 
                                                   'ward', 'scheme_name'])
y = train_labels_df.status_group

In [0]:
# Split data into train and test using k-fold cross-validation
# with independent test data set.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25,
                                                    shuffle=True,
                                                    random_state=42
                                                   )

In [11]:
# Get quick initial metrics estimate.

# Using sklearn accuracy_score
import numpy as np
from sklearn.metrics import accuracy_score

majority_class = y_train.mode()[0]
print(majority_class)
prediction = np.full(shape=y_train.shape, 
                     fill_value=majority_class)

accuracy_score(y_train, prediction)

functional


0.542334455667789

In [12]:
# Using simple pandas value counts method
print(y_train.value_counts(normalize=True))

functional                 0.542334
non functional             0.384871
functional needs repair    0.072795
Name: status_group, dtype: float64


In [13]:
!pip install category_encoders



In [0]:
# Data pre-processing, Feature selection and Model selection.

# Imports for pipeline
from sklearn.pipeline import make_pipeline

import category_encoders as ce
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression

# Create pipeline
pipeline = make_pipeline(\
                         ce.OneHotEncoder(use_cat_names=True),
                         RobustScaler(),
                         SelectKBest(f_classif),
                         LogisticRegression(multi_class='multinomial',
                                            solver='saga'))

In [0]:
# Model validation.
from sklearn.model_selection import GridSearchCV

param_grid = {
    'selectkbest__k': range(1, len(X_train.columns)+1),
    'logisticregression__class_weight': [None,'balanced']
}

gridsearch = GridSearchCV(pipeline, param_grid=param_grid, cv=3,
                         scoring='accuracy', verbose=1)

gridsearch.fit(X_train, y_train)

In [20]:
# Interpret the results.

# Best cross validation score
print('Cross Validation Score:', gridsearch.best_score_)

# Best parameters which resulted in the best score
print('Best Parameters:', gridsearch.best_params_)

"""# Which features were selected?
selector = gridsearch.best_estimator_.named_steps['selectkbest']
all_names = X_train.columns
selected_mask = selector.get_support()
selected_names = all_names[selected_mask]
unselected_names = all_names[~selected_mask]

print('Features selected:')
for name in selected_names:
    print(name)

print()
print('Features not selected:')
for name in unselected_names:
    print(name)"""

Cross Validation Score: 0.7127497194163861
Best Parameters: {'logisticregression__class_weight': None, 'selectkbest__k': 31}


"# Which features were selected?\nselector = gridsearch.best_estimator_.named_steps['selectkbest']\nall_names = X_train.columns\nselected_mask = selector.get_support()\nselected_names = all_names[selected_mask]\nunselected_names = all_names[~selected_mask]\n\nprint('Features selected:')\nfor name in selected_names:\n    print(name)\n\nprint()\nprint('Features not selected:')\nfor name in unselected_names:\n    print(name)"

In [22]:
#Get the best model and check it against test data set.

# Predict with X_test features
y_pred = gridsearch.predict(X_test)

# Compare predictions to y_test labels
test_score = accuracy_score(y_test, y_pred)
print('Accuracy Score on test data set:', test_score)

Accuracy Score on test data set: 0.708013468013468


In [0]:
test_features_df = pd.read_csv('test_features.csv', na_values=nan_values_list)

# Iteration 1
# X_submission = test_features_df.select_dtypes(include=np.number)


# Iteration 2
X_submission = test_features_df.drop(columns =['id', 'date_recorded', 
                                                   'funder', 'installer', 
                                                   'wpt_name', 'subvillage', 
                                                   'ward', 'scheme_name'])

# Predict with X_submission features
y_submission = gridsearch.predict(X_submission)

y_submission_df = pd.DataFrame(y_submission, columns=['status_group'])
output_for_submission = test_features_df.join(y_submission_df).loc[:, ['id','status_group']]

In [25]:
output_for_submission.head()

Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional
2,17168,non functional
3,45559,non functional
4,49871,functional


In [29]:
print(output_for_submission.status_group.value_counts())
print(output_for_submission.shape)


functional                 10554
non functional              3670
functional needs repair      134
Name: status_group, dtype: int64
(14358, 2)


In [0]:
print(output_for_submission.to_csv(index=False))