<a href="https://colab.research.google.com/github/ShreyasJothish/taarifa_water_pumps/blob/master/DS1_Predictive_Modeling_Challenge_Day_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Load the data from Kaggle
!pip install kaggle

# Upgrade the version of Seaborn
!pip install -U seaborn

# Install category_encoders
!pip install category_encoders

Requirement already up-to-date: seaborn in /usr/local/lib/python3.6/dist-packages (0.9.0)


In [2]:
# Mount the drive to download the data from Kaggle
from google.colab import drive
drive.mount('/content/drive')
%env KAGGLE_CONFIG_DIR=/content/drive/My Drive

!kaggle competitions download -c ds1-predictive-modeling-challenge

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
env: KAGGLE_CONFIG_DIR=/content/drive/My Drive
sample_submission.csv: Skipping, found more recently modified local copy (use --force to force download)
test_features.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
train_labels.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
train_features.csv.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
# Extract the csv files
!unzip train_features.csv.zip 
!unzip train_labels.csv.zip 
!unzip test_features.csv.zip

Archive:  train_features.csv.zip
replace train_features.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N
Archive:  train_labels.csv.zip
replace train_labels.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N
Archive:  test_features.csv.zip
replace test_features.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [0]:
# Generic imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [0]:
# Loading the independent features as X and
# dependent variable as y
nan_values_list = ['Not Known', 'Unknown', 'None', 'Not known', 'not known', 
                   '-', 'unknown', 'Unknown Installer', '##', 'none']

train_features_df = pd.read_csv('train_features.csv', na_values=nan_values_list)
train_labels_df = pd.read_csv('train_labels.csv')



In [6]:
"""# Test Code to be removed later
train_features_df = train_features_df.sample(5000, random_state=42)
train_labels_df = train_labels_df.sample(5000, random_state=42)
train_labels_df.status_group.value_counts()"""

'# Test Code to be removed later\ntrain_features_df = train_features_df.sample(5000, random_state=42)\ntrain_labels_df = train_labels_df.sample(5000, random_state=42)\ntrain_labels_df.status_group.value_counts()'

In [0]:
def atleast(row, value_count_series, count=5):
  # Identify items who have funded atleast 5 pumps
  if str(row) == "nan":
    return np.nan
  
  value_count = value_count_series.get(row)
  
  if value_count < count:
    return 0
  else:
    return 1

def character_grouping(row):
  # Reduce the dimension based on 1st character else return *
  if str(row) == "nan":
    return np.nan
  
  if row[0].isalpha():
    return row[0].lower()
  else:
    return "*"
  
def classify_lga(row):
  # Classify lga into Rural, Urban and others
  if str(row) == "nan":
    return np.nan
  
  if row.lower().find('rural'):
    return "rural"
  elif row.lower().find('urban'):
    return "urban"
  else:
    return "other"
  
def prefix_grouping(row, prefix_count=3):
  # Reduce the dimension based on 1st character else return *
  if str(row) == "nan":
    return np.nan
  
  if prefix_count > len(row):
    return "#"
  
  if row[0:prefix_count].isalpha():
    return row[0:prefix_count].lower()
  else:
    return "*"

In [0]:
def feature_engineering(df):
  # Create a column to indicate funder with atleast 5 pumps maintained.
  value_count_funder = df.funder.value_counts()
  df['funder_aleast_5'] = df['funder'].apply(atleast, 
                                            args=(value_count_funder,))
  
  # Create a column to indicate installer with atleast 5 pumps maintained.
  value_count_installer = df.installer.value_counts()
  df['installer_aleast_5'] = df['installer'].apply(atleast, 
                                            args=(value_count_installer,))
  
  # Apply mean for missing values of latitude and longitude
  mean_longitude = df['longitude'].mean()
  df['longitude'] = df['longitude'].apply(lambda x: mean_longitude \
                                          if round(x, 2) == 0 else x)
  mean_latitude = df['latitude'].mean()
  df['latitude'] = df['latitude'].apply(lambda x: mean_latitude \
                                        if round(x, 2) == 0 else x)
  
  # Grouping wpt_name, subvillage based on 1st alphabet
  df['wpt_name_character_grouping'] = df['wpt_name'].apply(character_grouping)
  df['subvillage_character_grouping'] = \
    df['subvillage'].apply(character_grouping)
  
  # Classify lga based on Rural, Urban and others
  df['lga_classify'] = df['lga'].apply(classify_lga)
  
  # Grouping ward, scheme_name based on 1st alphabet
  df['ward_character_grouping'] = df['ward'].apply(character_grouping)
  df['scheme_name_character_grouping'] = \
    df['scheme_name'].apply(character_grouping)
  
  # Grouping based on prefix
  df['funder_prefix_grouping'] = df['funder'].apply(prefix_grouping)
  df['installer_prefix_grouping'] = df['installer'].apply(prefix_grouping)
  df['wpt_name_prefix_grouping'] = df['wpt_name'].apply(prefix_grouping)
  df['subvillage_prefix_grouping'] = df['subvillage'].apply(prefix_grouping)
  df['lga_prefix_grouping'] = df['lga'].apply(prefix_grouping)
  df['ward_prefix_grouping'] = df['ward'].apply(prefix_grouping)
  df['scheme_name_prefix_grouping'] = df['scheme_name'].apply(prefix_grouping)

feature_engineering(train_features_df)

In [0]:
# Selecting independent and dependent variables.

X = train_features_df.drop(columns=['id', 'funder', 'installer', 'wpt_name', 
                                    'subvillage', 'lga','ward','scheme_name'])
y = train_labels_df.status_group

In [0]:
# Split data into train and test using k-fold cross-validation
# with independent test data set.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25,
                                                    shuffle=True,
                                                    random_state=42
                                                   )

In [11]:
# Get quick initial metrics estimate.

# Using sklearn accuracy_score
import numpy as np
from sklearn.metrics import accuracy_score

majority_class = y_train.mode()[0]
prediction = np.full(shape=y_train.shape, 
                     fill_value=majority_class)

print(f'accuracy score {accuracy_score(y_train, prediction)}')


# Using simple pandas value counts method
print(y_train.value_counts(normalize=True))

accuracy score 0.542334455667789
functional                 0.542334
non functional             0.384871
functional needs repair    0.072795
Name: status_group, dtype: float64


In [0]:
# Data pre-processing, Feature selection and Model selection.

# Imports for pipeline
from sklearn.pipeline import make_pipeline

import category_encoders as ce
from sklearn.preprocessing import RobustScaler
from xgboost import XGBClassifier

In [0]:
# Create pipeline
pipeline = make_pipeline(\
                         ce.BinaryEncoder(),
                         RobustScaler(),
                         XGBClassifier())

In [18]:
# Model validation.
from sklearn.model_selection import GridSearchCV 

param_grid = {
    'xgbclassifier__max_depth': [3, 4, 5, 6],
    'xgbclassifier__learning_rate': [0.1, 0.2, 0.3, 0.5]
}

"""param_grid = {
    'xgbclassifier__max_depth': [5],
    'xgbclassifier__learning_rate': [0.2]
}"""

gridsearch = GridSearchCV(pipeline, param_grid=param_grid, cv=3,
                         scoring='accuracy', verbose=20)

gridsearch.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=3 ....


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=3, score=0.7400848427715305, total= 1.3min
[CV] xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=3 ....


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.3min remaining:    0.0s


[CV]  xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=3, score=0.7456565656565657, total= 1.3min
[CV] xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=3 ....


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.6min remaining:    0.0s


[CV]  xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=3, score=0.7410600040406762, total= 1.3min
[CV] xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=4 ....


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.9min remaining:    0.0s


[CV]  xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=4, score=0.7565147128139519, total= 1.5min
[CV] xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=4 ....


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  5.4min remaining:    0.0s


[CV]  xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=4, score=0.7597306397306397, total= 1.5min
[CV] xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=4 ....


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  6.9min remaining:    0.0s


[CV]  xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=4, score=0.7583002222371877, total= 1.5min
[CV] xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=5 ....


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  8.4min remaining:    0.0s


[CV]  xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=5, score=0.7699144838731399, total= 1.7min
[CV] xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=5 ....


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 10.2min remaining:    0.0s


[CV]  xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=5, score=0.7703703703703704, total= 1.7min
[CV] xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=5 ....


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 11.9min remaining:    0.0s


[CV]  xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=5, score=0.7700181830426291, total= 1.7min
[CV] xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=6 ....


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 13.6min remaining:    0.0s


[CV]  xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=6, score=0.7815635310753485, total= 1.9min
[CV] xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=6 ....


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 15.6min remaining:    0.0s


[CV]  xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=6, score=0.781077441077441, total= 1.9min
[CV] xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=6 ....


[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed: 17.5min remaining:    0.0s


[CV]  xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=6, score=0.7779648461175837, total= 1.9min
[CV] xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=3 ....


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed: 19.5min remaining:    0.0s


[CV]  xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=3, score=0.7550333310888155, total= 1.3min
[CV] xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=3 ....


[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed: 20.8min remaining:    0.0s


[CV]  xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=3, score=0.7615488215488215, total= 1.3min
[CV] xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=3 ....


[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed: 22.1min remaining:    0.0s


[CV]  xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=3, score=0.7561451949626238, total= 1.3min
[CV] xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=4 ....


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 23.4min remaining:    0.0s


[CV]  xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=4, score=0.7685004376809642, total= 1.5min
[CV] xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=4 ....


[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed: 24.9min remaining:    0.0s


[CV]  xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=4, score=0.7696969696969697, total= 1.5min
[CV] xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=4 ....


[Parallel(n_jobs=1)]: Done  17 out of  17 | elapsed: 26.4min remaining:    0.0s


[CV]  xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=4, score=0.7696141154286484, total= 1.5min
[CV] xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=5 ....


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed: 27.9min remaining:    0.0s


[CV]  xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=5, score=0.7824388930038382, total= 1.7min
[CV] xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=5 ....


[Parallel(n_jobs=1)]: Done  19 out of  19 | elapsed: 29.7min remaining:    0.0s


[CV]  xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=5, score=0.7833670033670034, total= 1.7min
[CV] xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=5 ....
[CV]  xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=5, score=0.78146676543875, total= 1.7min
[CV] xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=6 ....
[CV]  xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=6, score=0.7906538280250488, total= 1.9min
[CV] xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=6 ....
[CV]  xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=6, score=0.791043771043771, total= 1.9min
[CV] xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=6 ....
[CV]  xgbclassifier__learning_rate=0.2, xgbclassifier__max_depth=6, score=0.7879318472624419, total= 1.9min
[CV] xgbclassifier__learning_rate=0.3, xgbclassifier__max_depth=3 ....
[CV]  xgbclassifier__learning_rate=0.3, xgbclassifier__max_depth=3, score=0.7603528381927143, total= 1.3min


[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed: 77.8min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('binaryencoder', BinaryEncoder(cols=None, drop_invariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, verbose=0)), ('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('xgbclassifier', XGBClassi...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'xgbclassifier__max_depth': [3, 4, 5, 6], 'xgbclassifier__learning_rate': [0.1, 0.2, 0.3, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=20)

In [20]:
# Interpret the results.

# Best cross validation score
print('Cross Validation Score:', gridsearch.best_score_)

# Best parameters which resulted in the best score
print('Best Parameters:', gridsearch.best_params_)

Cross Validation Score: 0.7940965207631874
Best Parameters: {'xgbclassifier__learning_rate': 0.5, 'xgbclassifier__max_depth': 6}


Cross Validation Score: 0.7533333333333333
Best Parameters: {'xgbclassifier__learning_rate': 0.2, 'xgbclassifier__max_depth': 5}

Cross Validation Score: 0.7576
Best Parameters: {'xgbclassifier__learning_rate': 0.2, 'xgbclassifier__max_depth': 5}


In [21]:
#Get the best model and check it against test data set.

# Predict with X_test features
y_pred = gridsearch.predict(X_test)

# Compare predictions to y_test labels
test_score = accuracy_score(y_test, y_pred)
print('Accuracy Score on test data set:', test_score)

Accuracy Score on test data set: 0.7999326599326599


In [0]:
test_features_df = pd.read_csv('test_features.csv', na_values=nan_values_list)
feature_engineering(test_features_df)

X_submission = test_features_df.drop(columns =['id', 'funder', 'installer', 'wpt_name', 'subvillage', 
                                    'lga','ward','scheme_name'])

# Predict with X_submission features
y_submission = gridsearch.predict(X_submission)

y_submission_df = pd.DataFrame(y_submission, columns=['status_group'])
output_for_submission = test_features_df.join(y_submission_df).loc[:, ['id','status_group']]

In [23]:
output_for_submission.head()

Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional


In [24]:
print(output_for_submission.status_group.value_counts())
print(output_for_submission.shape)

functional                 8740
non functional             5107
functional needs repair     511
Name: status_group, dtype: int64
(14358, 2)


In [25]:
print(output_for_submission.to_csv(index=False))

id,status_group
50785,non functional
51630,functional
17168,functional
45559,non functional
49871,functional
52449,functional
24806,functional
28965,non functional
36301,non functional
54122,functional
419,functional
45750,non functional
653,non functional
14017,non functional
40228,functional
27714,functional
28785,non functional
18532,non functional
69961,functional
55083,non functional
8691,non functional
30331,non functional
70970,functional
61136,functional
28799,non functional
46825,non functional
44718,functional needs repair
37350,functional
65738,functional
3271,non functional
42658,functional
67523,functional
37034,functional
42432,functional
27420,functional
707,functional
48489,non functional
51008,non functional
72562,non functional
10275,non functional
53540,non functional
43291,non functional
62343,non functional
1649,functional
8822,functional
10352,non functional
50878,functional
15266,functional
51956,functional
44099,non functional
59,non functional
51359,non functio

In [0]:
print(output_for_submission.status_group.value_counts())
print(output_for_submission.shape)

functional                 9578
non functional             4484
functional needs repair     296
Name: status_group, dtype: int64
(14358, 2)
