In [None]:
 from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_score, KFold
# from sklearn.metrics import accuracy_score
import datetime
# from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# from sklearn.feature_selection import mutual_info_regression

# Mount Google Drive inside the Colab runtime; the CSV paths used by this
# notebook all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Load dataset

In [None]:
# Read the competition CSVs from the mounted Drive folder.
# Training features, training labels and the submission template are indexed by 'id'.
X_full = pd.read_csv('/content/drive/My Drive/PUMPITUP/TrainingSetValues.csv').set_index('id')
Y_full = pd.read_csv('/content/drive/My Drive/PUMPITUP/TrainingSetLabels.csv').set_index('id')
submission_file = pd.read_csv("/content/drive/My Drive/PUMPITUP/Format.csv").set_index('id')

# The test features keep 'id' as an ordinary (leading) column: it is moved into
# the index and straight back out again, so the later cleaning step can drop it
# by name.
X_test_full = (
    pd.read_csv('/content/drive/My Drive/PUMPITUP/TestSetValues.csv')
    .set_index('id')
    .reset_index()
)

Handle missing values

In [None]:
# Inner-join labels and features on their shared 'id' index, then turn 'id'
# back into a regular column.
df_merged = Y_full.merge(
    X_full,
    how='inner',
    left_index=True,
    right_index=True,
).reset_index()

In [None]:
def date_parser(df):
    """Add calendar features derived from ``date_recorded`` and an ``age`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date_recorded`` (values whose string form is
        ``YYYY-MM-DD``) and ``construction_year``.

    Returns
    -------
    pandas.DataFrame
        The same object, modified in place, with these columns added:
        ``year_recorder``, ``weekday_recorder`` (``%w``: 0 = Sunday .. 6 = Saturday),
        ``yearly_week_recorder`` (``%W``: week of year, Monday-based, days
        before the first Monday fall in week 0), ``month_recorder`` and ``age``
        (``year_recorder - construction_year``).

    Raises
    ------
    ValueError
        If a ``date_recorded`` value cannot be parsed with ``%Y-%m-%d``.
    """
    # Parse the whole column at once instead of one strptime call per row.
    recorded = pd.to_datetime(df['date_recorded'].astype(str), format='%Y-%m-%d')

    # Cast to int64 explicitly: the .dt accessors may return narrower integer
    # types, and this notebook later selects numeric features by the dtype
    # names 'int64' / 'float64'.
    df['year_recorder'] = recorded.dt.year.astype('int64')
    df['weekday_recorder'] = recorded.dt.strftime('%w').astype('int64')
    df['yearly_week_recorder'] = recorded.dt.strftime('%W').astype('int64')
    df['month_recorder'] = recorded.dt.month.astype('int64')

    # NOTE(review): construction_year is used exactly as stored. Rows where it
    # is 0 therefore get an age equal to the recording year (about 2011-2013).
    # Another step of this notebook replaces 0 with 2000, but only after this
    # function has run. Confirm whether that age value is intended.
    df['age'] = df['year_recorder'].to_numpy() - df['construction_year'].to_numpy()
    return df

In [None]:
def createDecadeColumn(df):
  """Replace the 0 placeholder in ``construction_year`` and add a ``decade`` label.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain ``construction_year``.

  Returns
  -------
  pandas.DataFrame
      The same object, modified in place. ``construction_year`` has every 0
      replaced by 2000. ``decade`` holds '60s', '70s', '80s', '90s', '00s'
      (1960-2009) or '10s' (2010-2013); any other year is copied through
      unchanged.
  """
  # Assign results back to the frame instead of calling an in-place method on
  # the df[...] selection: that chained form warns on recent pandas and stops
  # updating the frame once copy-on-write is active.
  # NOTE(review): 2000 is the fill-in the original code uses for year 0;
  # the reason for choosing 2000 is not visible here.
  years = df['construction_year'].replace(to_replace=0, value=2000)
  df['construction_year'] = years

  # One year -> label table, applied in a single pass.
  decade_labels = {1960: '60s', 1970: '70s', 1980: '80s', 1990: '90s', 2000: '00s'}
  label_by_year = {
      start + offset: label
      for start, label in decade_labels.items()
      for offset in range(10)
  }
  label_by_year.update({year: '10s' for year in range(2010, 2014)})

  df['decade'] = years.map(lambda year: label_by_year.get(year, year))
  return df

In [None]:
# Canonical installer name -> the spellings that are rewritten to it.
_INSTALLER_ALIASES = {
    'Unknown': ('0',),
    'District water department': (
        'District Water Department', 'District water depar', 'Distric Water Department',
    ),
    'Fini Water': ('FinW', 'Fini water', 'FINI WATER'),
    'Jaica': ('JAICA',),
    'District council': (
        'COUN', 'District COUNCIL', 'DISTRICT COUNCIL', 'District Counci',
        'District Council', 'Council', 'Counc', 'District  Council', 'Distri',
    ),
    'RC Church': (
        'RC CHURCH', 'RC Churc', 'RC', 'RC Ch', 'RC C', 'RC CH', 'RC church',
        'RC CATHORIC',
    ),
    'Central government': (
        'Central Government', 'Tanzania Government',
        'central government', 'Cental Government', 'Cebtral Government',
        'Tanzanian Government', 'Tanzania government', 'Centra Government',
        'CENTRAL GOVERNMENT', 'TANZANIAN GOVERNMENT', 'Central govt', 'Centr',
        'Centra govt',
    ),
    'world vision': ('World vision', 'World Division', 'World Vision'),
    'Unicef': ('Unisef', 'UNICEF'),
    'DANIDA': ('DANID',),
    'villagers': (
        'villigers', 'villager', 'Villagers', 'Villa', 'Village', 'Villi',
        'Village Council', 'Village Counil', 'Villages', 'Vill', 'Village community',
        'Villaers', 'Village Community', 'Villag', 'Villege Council', 'Village council',
        'Village  Council', 'Villagerd', 'Villager', 'Village Technician',
        'Village Office', 'Village community members',
    ),
    'Community': ('Commu', 'Communit', 'commu', 'COMMU', 'COMMUNITY'),
    'Government': (
        'GOVERNMENT', 'GOVER', 'GOVERNME', 'GOVERM', 'GOVERN', 'Gover', 'Gove',
        'Governme', 'Governmen',
    ),
    'HESAWA': ('Hesawa',),
    'Colonial government': ('Colonial Government',),
    'Misri Government': ('Government of Misri',),
    'Italian government': ('Italy government',),
    'British government': ('British colonial government',),
    'Concern/Government': ('Concern /government',),
    'Village government': ('Village Government',),
    'Government /Community': ('Government and Community',),
    'RC church/Central Gover': ('Cetral government /RC',),
    'TCRS /Government': ('Government /TCRS', 'Government/TCRS'),
    'ADRA/Government': ('ADRA /Government',),
}


def reOrderInstallerColumn(df):
  """Normalise the spelling of values in the ``installer`` column.

  Missing values become 'Unknown'; every spelling listed in
  ``_INSTALLER_ALIASES`` is rewritten to its canonical name; all other values
  are left untouched. Matching is exact and case-sensitive.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain ``installer``.

  Returns
  -------
  pandas.DataFrame
      The same object, with ``installer`` replaced by the cleaned column.
  """
  # No canonical name is itself listed as a variant, so a single lookup pass
  # gives the same result as applying the groups one after another.
  canonical_for = {
      variant: canonical
      for canonical, variants in _INSTALLER_ALIASES.items()
      for variant in variants
  }

  # Assign back to the frame rather than mutating the df[...] selection in
  # place; the chained in-place form is not reliable on recent pandas.
  installers = df['installer'].fillna('Unknown')
  df['installer'] = installers.map(lambda name: canonical_for.get(name, name))
  return df

In [None]:
def dropnAddMissingValuesforSpecialColums(df):
  """Fill selected missing/placeholder values and drop columns that are not used as features.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain ``funder``, ``public_meeting``, ``permit``, ``longitude``,
      ``population`` and every column named in the drop list below.

  Returns
  -------
  pandas.DataFrame
      The same object, modified in place:
      * ``funder``: missing values and the string '0' become 'Unknown'
      * ``public_meeting`` / ``permit``: missing values become True
      * ``longitude``: 0 becomes 35.15
      * ``population``: 0 becomes 281
      * the listed columns are removed

  Raises
  ------
  KeyError
      If any required column is absent.
  """
  # Every result is assigned back to the frame; calling an in-place method on
  # a df[...] selection is chained assignment and is not reliable on recent pandas.
  df['funder'] = df['funder'].fillna('Unknown').replace('0', 'Unknown')

  for flag_col in ('public_meeting', 'permit'):
    # Cast to plain bool so the resulting dtype does not depend on whether the
    # installed pandas downcasts object columns after fillna.
    df[flag_col] = df[flag_col].fillna(True).astype(bool)

  # NOTE(review): 35.15 and 281 are the fill-in values from the original code;
  # their derivation is not shown here (presumably typical values of the
  # columns - confirm).
  df['longitude'] = df['longitude'].replace(0, 35.15)
  df['population'] = df['population'].replace(0, 281)

  df.drop(columns=['subvillage', 'num_private', 'date_recorded', 'amount_tsh', 'wpt_name', 'scheme_name', 'id', 'region_code',
                   'management_group', 'scheme_management', 'quantity_group', 'source_class', 'source_type', 'quality_group',
                   'payment_type', 'extraction_type_class', 'extraction_type', 'waterpoint_type_group', 'recorded_by'],
          inplace=True)
  return df

In [None]:
# Run the cleaning steps on a copy of the merged training data, in this order:
# date features -> decade column -> installer clean-up -> fill/drop.
# NOTE(review): date_parser computes 'age' before createDecadeColumn replaces
# construction_year == 0, so those rows get an age equal to the recording year.
df = (
    df_merged.copy()
    .pipe(date_parser)
    .pipe(createDecadeColumn)
    .pipe(reOrderInstallerColumn)
    .pipe(dropnAddMissingValuesforSpecialColums)
)

In [None]:
# Discard rows without a label, then separate the target (Y) from the features (X).
df = df.dropna(axis=0, subset=["status_group"])
Y = df[["status_group"]].copy()
df = df.drop(columns=["status_group"])
X = df.copy()

In [None]:
# Apply the same cleaning steps, in the same order, to a copy of the test features.
X_testNew = (
    X_test_full.copy()
    .pipe(date_parser)
    .pipe(createDecadeColumn)
    .pipe(reOrderInstallerColumn)
    .pipe(dropnAddMissingValuesforSpecialColums)
)

In [None]:
# Categorical features: every column of X stored with dtype object
# (no cardinality filter is applied).
categorical_cols = [name for name, dtype in X.dtypes.items() if dtype == "object"]

# Numerical features: columns whose dtype is exactly int64 or float64.
# NOTE(review): columns of any other dtype (for example bool) match neither
# filter and are therefore left out of my_cols - confirm this is intended.
numerical_cols = [name for name, dtype in X.dtypes.items() if dtype in ['int64', 'float64']]

# Feature order used for both matrices: categorical columns first, then numerical.
my_cols = categorical_cols + numerical_cols

def encoding(df):
  """Impute missing values and integer-encode the categorical columns of ``df``.

  Uses the module-level ``numerical_cols`` and ``categorical_cols`` lists.

  * numerical columns: ``SimpleImputer(strategy='constant')`` with its default
    fill value (presumably 0 for numeric data - see the scikit-learn docs)
  * categorical columns: ``SimpleImputer(strategy='most_frequent')``, then one
    fresh ``LabelEncoder`` per column

  Every imputer and encoder is fitted on the frame passed in, on every call.
  Integer codes are therefore only comparable between rows that were encoded
  in the same call.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing all of ``numerical_cols`` and ``categorical_cols``.

  Returns
  -------
  pandas.DataFrame
      The same object, modified in place.
  """
  # Numerical columns: constant fill.
  df[numerical_cols] = SimpleImputer(strategy='constant').fit_transform(df[numerical_cols])

  # Categorical columns: fill with each column's most frequent value ...
  df[categorical_cols] = SimpleImputer(strategy='most_frequent').fit_transform(df[categorical_cols])

  # ... then map each column's values to integer codes.
  for column in categorical_cols:
    column_encoder = LabelEncoder()
    df[column] = column_encoder.fit_transform(df[column].to_numpy())

  return df


# # Preprocessing for numerical data
# numerical_transformer = SimpleImputer(strategy='constant')

# # Preprocessing for categorical data
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])


# # Bundle preprocessing for numerical and categorical data
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_transformer, numerical_cols),
#         ('cat', categorical_transformer, categorical_cols)
#     ])

In [None]:
# Encode the status_group strings as integer class ids for use as the y vector.
# label_encoder stays in scope so predictions can be mapped back to the
# original strings when the submission is written.
label_encoder = LabelEncoder()
encoded_status = label_encoder.fit_transform(Y["status_group"].to_numpy())
Y["status_group"] = encoded_status
Y.head()

Unnamed: 0,status_group
0,0
1,0
2,0
3,2
4,0


Encode train and test features

In [None]:
# Encode train and test in ONE call. encoding() fits a fresh LabelEncoder per
# column on whatever frame it receives, so encoding the two sets separately
# can give the same category different integer codes in X_train and X_test
# whenever their sets of values differ.
# Side effect to be aware of: the imputers inside encoding() are now fitted on
# train + test rows together.
n_train_rows = len(X)
combined_features = pd.concat([X[my_cols], X_testNew[my_cols]], ignore_index=True)
combined_encoded = encoding(combined_features)

# Split back by position and restore each frame's original index.
X_train = combined_encoded.iloc[:n_train_rows].copy()
X_train.index = X.index
Y_train = Y.copy()
X_test = combined_encoded.iloc[n_train_rows:].copy()
X_test.index = X_testNew.index

# X_train = X[my_cols].copy()
# Y_train = Y.copy()
# X_test = X_testNew[my_cols].copy()

# X_test.head()

**Hyperparameter search: cross-validated accuracy for each (n_estimators, max_depth) combination**

In [None]:
def modelTraining(n_est, depth, n_splits=10, random_state=42):
  """Return the mean k-fold cross-validation score of an XGBClassifier.

  Uses the module-level ``X_train`` and ``Y_train``.

  Parameters
  ----------
  n_est : int
      Passed to ``XGBClassifier`` as ``n_estimators``.
  depth : int
      Passed to ``XGBClassifier`` as ``max_depth``.
  n_splits : int, default 10
      Number of folds.
  random_state : int or None, default 42
      Seed for the fold shuffle. With a fixed seed every (n_est, depth)
      setting is scored on the same folds, so the scores are comparable and
      repeatable. Pass None for a fresh shuffle on each call.

  Returns
  -------
  float
      Mean of the per-fold scores returned by ``cross_val_score``.
  """
  model = XGBClassifier(n_estimators=n_est, max_depth=depth)

  # No separate model.fit() here: cross_val_score fits its own copy of the
  # estimator on each fold, so a fit on the full data beforehand would only
  # add training time without affecting the score.
  kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
  kf_cv_scores = cross_val_score(model, X_train, Y_train.values.ravel(), cv=kfold)
  return kf_cv_scores.mean()

    

In [None]:
# Grid search over n_estimators (50, 100, ..., 500) and max_depth (3, 4, 5).
cols = ['n_estimates', 'depth', 'accuracy']

# Collect one record per setting and build the frame once at the end:
# DataFrame.append no longer exists in current pandas, and growing a frame
# row by row copies it on every iteration.
records = []
for n_est in range(1, 11):
  for depth in range(3, 6):
    n_estimators = n_est * 50
    acc = modelTraining(n_estimators, depth)
    print(n_estimators, depth, acc)
    records.append({'n_estimates': n_estimators, 'depth': depth, 'accuracy': acc})

trainingResults = pd.DataFrame(records, columns=cols)
trainingResults.to_csv(path_or_buf="/content/drive/My Drive/PUMPITUP/accuracyStack.csv")

# Show the best-scoring row. (.loc must be indexed with [], and needs a row
# label - idxmax - rather than the maximum accuracy value itself.)
trainingResults.loc[trainingResults['accuracy'].idxmax()]

In [None]:

# Preview the first rows of the search results. This needs the cell that
# builds trainingResults to have been run first.
trainingResults.head(n=5)


NameError: ignored

**Hyperparameter search done: train the final model**

In [None]:
# Final model settings: 500 trees of depth 5, the largest values covered by
# the search loop in this notebook.
final_params = {'n_estimators': 500, 'max_depth': 5}
model = XGBClassifier(**final_params)
print(model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)


In [None]:
# Bundle preprocessing and modeling code in a pipeline
# clf = Pipeline(steps=[('preprocessor', preprocessor),('model', model)])

In [None]:
# clf.fit(X_train, Y_train)
# clf.fit(X_train, Y_train.values.ravel())
# Train the final model on all training rows; the single-column label frame
# is flattened to a 1-D array first.
model.fit(X_train, Y_train.to_numpy().ravel())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
# 10-fold cross-validation of the final configuration. The shuffle is seeded
# so the folds, and therefore the reported score, are the same on every run.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
# kf_cv_scores = cross_val_score(clf, X_train, Y_train.values.ravel(), cv=kfold )
kf_cv_scores = cross_val_score(model, X_train, Y_train.values.ravel(), cv=kfold)
print("K-fold CV average score: %.4f" % kf_cv_scores.mean())

K-fold CV average score: 0.8011


In [None]:
preds = model.predict(X_test)
# preds = clf.predict(X_test)

# Map the integer class ids back to the original status_group strings.
submission = label_encoder.inverse_transform(preds)

# Attach each prediction to its test-set id and align on the submission
# template's id index, instead of pasting the array in by row position. The
# result is identical when both files list the ids in the same order, and
# still correct when they do not.
# Assumes the rows of X_test are in the same order as X_test_full (the
# cleaning functions applied to the test set do not drop or reorder rows).
predicted_by_id = pd.Series(submission, index=X_test_full['id'].to_numpy())
missing_ids = submission_file.index.difference(predicted_by_id.index)
if len(missing_ids) > 0:
  raise ValueError("No prediction for %d id(s) required by the submission format" % len(missing_ids))

submission_file['status_group'] = predicted_by_id.reindex(submission_file.index)
submission_file.to_csv(path_or_buf="/content/drive/My Drive/PUMPITUP/Submission.csv")