In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the cleaned train and test CSVs from a common directory.
# NOTE(review): `{root_dir}` looks like an unfilled template placeholder. As
# written it is a set literal around an undefined name, so this cell cannot run.
# It presumably should be the data directory as a string ending in a path
# separator, since the file names below are appended by plain string
# concatenation — TODO confirm and fill in.
root_dir = {root_dir}
train_dir = root_dir + 'clean.csv'
test_dir = root_dir + 'clean_test.csv'

# Both files are read with pandas' default settings (no index column specified).
train = pd.read_csv(train_dir)
test = pd.read_csv(test_dir)

In [None]:
# (rows, columns) of the train and test frames at this point in the notebook.
(train.shape, test.shape)

In [None]:
# Remove every column whose name contains "unnamed" (case-insensitive) from both
# frames — presumably index columns left over from an earlier CSV export.
for frame in (train, test):
    unnamed_cols = [col for col in frame.columns if 'unnamed' in col.lower()]
    frame.drop(columns=unnamed_cols, inplace=True)

# Detach the label column from the training features.
target = train.pop('status_group')

# Marker column so the two frames can be separated again after they are stacked.
train['is_test'], test['is_test'] = 0, 1

In [None]:
# (rows, columns) of the train and test frames at this point in the notebook.
(train.shape, test.shape)

In [None]:
# Stack the training rows on top of the test rows so both halves go through the
# same encoding steps. Row labels are kept as they are (no ignore_index), so the
# two halves can carry the same labels.
train_test = pd.concat([train, test], axis=0)

# Column dtypes and non-null counts of the combined frame.
train_test.info()

In [None]:
# Columns replaced by integer codes from pd.factorize (missing values become -1).
# payment_type, water_quality, quantity, waterpoint_type and basin are left out
# on purpose: their factorize lines were disabled and they are handled with
# pd.get_dummies in a later cell instead.
label_encoded_cols = [
    'funder',
    'installer',
    'scheme_management',
    'extraction_type',
    'management',
    'source',
    'region',
    'lga',
    'district_code',
    'operational_year',
]
for col in label_encoded_cols:
    train_test[col] = pd.factorize(train_test[col])[0]

# Number of distinct basin values (a missing value counts as one).
train_test['basin'].nunique(dropna=False)

In [None]:
# (source column, dummy-column prefix) pairs, in the order they map to y1..y5.
dummy_specs = [
    ('payment_type', 'payment'),
    ('water_quality', 'quality'),
    ('quantity', 'quantity'),
    ('waterpoint_type', 'waterpoint_type'),
    ('basin', 'basin'),
]

# One indicator frame per categorical column.
y1, y2, y3, y4, y5 = (
    pd.get_dummies(train_test[source_col], prefix=prefix)
    for source_col, prefix in dummy_specs
)

In [None]:
# Append the one-hot columns for payment_type, water_quality and quantity, then
# remove the raw categorical columns.
# NOTE(review): y4 (waterpoint_type) and y5 (basin) are computed but not joined
# here, while their source columns are dropped below, so neither feature reaches
# the model — confirm whether that is intentional.
train_test = pd.concat([train_test, y1, y2, y3], axis=1)
train_test = train_test.drop(
    columns=['payment_type', 'water_quality', 'quantity', 'waterpoint_type', 'basin']
)

# Split back into train and test using the marker column. Each frame is built
# with a non-inplace drop so it is an independent DataFrame; an in-place drop on
# a boolean-filtered slice triggers pandas' SettingWithCopyWarning.
df_train = train_test[train_test['is_test'] == 0].drop(columns=['is_test', 'id'])
# `id` is kept in df_test; only the marker column is removed.
df_test = train_test[train_test['is_test'] == 1].drop(columns=['is_test'])

train_test.info()

In [None]:
# Feature matrix and label vector used by every model below.
X, y = df_train, target

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 1000-tree random forest. A fixed random_state makes the forest, and every
# score or importance derived from it, reproducible across runs. n_jobs=-1
# builds the trees on all available cores and does not change the result.
random_fc = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1)

In [None]:
#from sklearn.svm import SVC
#clf = SVC()

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted tree classifier for the multi-class target.
# Removed from the original call:
#   * nrounds='min.error.idx' and maximize=False — neither is an XGBClassifier
#     hyperparameter, and the string is not a usable round count. Use
#     n_estimators to control the number of boosting rounds.
#   * num_class=4 — the scikit-learn wrapper sets the class count from the
#     labels passed to fit().
# `eta` is spelled `learning_rate`, the name the scikit-learn wrapper documents.
model = XGBClassifier(
    objective='multi:softmax',
    booster='gbtree',
    max_depth=14,
    eval_metric='merror',
    learning_rate=0.2,
    colsample_bytree=0.4,
)

In [None]:
# from sklearn.neighbors import KNeighborsClassifier
# knn=KNeighborsClassifier(n_neighbors=1000)

In [None]:
from sklearn.ensemble import VotingClassifier

# Weighted ensemble of the XGBoost model and the random forest (forest counts
# double). Soft voting averages the predicted class probabilities with these
# weights. With hard voting and weights [1, 2] over only two estimators, the
# forest's vote (weight 2) always outweighed XGBoost's (weight 1), so the
# ensemble's predictions were exactly the forest's.
ensembler = VotingClassifier(
    estimators=[('xg', model), ('rf', random_fc)],
    weights=[1, 2],
    voting='soft',
)

In [None]:
from sklearn.model_selection import cross_val_score

# 3-fold cross-validated score of the ensemble on the training data.
cv_scores = cross_val_score(estimator=ensembler, X=X, y=y, cv=3)
cv_scores

# Scores recorded from earlier runs (per-fold array, then a single figure noted next to it):
# array([0.80328283, 0.80328283, 0.80141414])  0.8173
# array([0.8040404 , 0.80479798, 0.80075758])  0.8165
# array([0.80838384, 0.8109596 , 0.80752525])  0.8153
# array([0.79767677, 0.79782828, 0.79590909])  0.8096
# array([0.79828283, 0.79767677, 0.79540404])

In [None]:
# Train the ensemble on the full training set.
ensembler.fit(X=X, y=y)

# Show the columns and dtypes the ensemble was trained on.
X.info()

In [None]:
# Fit the standalone forest to read its feature importances.
random_fc.fit(X, y)

importances = random_fc.feature_importances_
# Column positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

# Ranked list: position in the ranking, column position in X, importance.
for rank, col_pos in enumerate(indices, start=1):
    print("%d. feature %d (%f)" % (rank, col_pos, importances[col_pos]))

# Same order again, as a comma-separated list of column names.
for col_pos in indices:
    print(X.columns[col_pos], end=', ')

In [None]:
# #for xgboost
# from sklearn.model_selection import cross_val_score
# print(cross_val_score(model, X, y, cv=3))
# model.fit(X,y)
# importances = model.feature_importances_
# importances
# indices = np.argsort(importances)[::-1]

# # Print the feature ranking
# print("Feature ranking:")

# for f in range(X.shape[1]):
#     print(X.columns[indices[f]],end=', ')

In [None]:
####xg-nrounds=1000-array([ 0.80924242,  0.81217172,  0.81186869])
#xg-max_depth=5-array([ 0.76282828,  0.76823232,  0.76994949])
#xg-max_depth=30-array([ 0.80772727,  0.80782828,  0.805     ])
#random_fc-array([ 0.80479798,  0.80267677,  0.80207071])
#xg-nrounds=500-array([ 0.80939394,  0.80828283,  0.80777778])
#xg-nrounds=1000-array([ 0.80939394,  0.80828283,  0.80777778])
#xg-max_depth=20-array([ 0.8089899 ,  0.80893939,  0.80848485])
#xg-max_depth=16-array([ 0.80838384,  0.80984848,  0.80742424])
#xg-max_depth=12,nfold=5,array([ 0.80939394,  0.80828283,  0.80777778])
#xg-max_depth=12,nfold=6,array([ 0.80939394,  0.80828283,  0.80777778])
#xg-max_depth=12,nfold=6,nrounds=1200,early_stopping_rounds = 9-

In [None]:
# Use the encoded test split, not the raw `test` frame. `test` still holds the
# unencoded string columns and the `is_test` marker and lacks the dummy columns,
# so its features do not match the matrix the models were fitted on. The copy
# keeps later edits to X_test from touching df_test.
X_test = df_test.copy()

In [None]:
# Display the column labels of the frame that will be passed to predict().
X_test.columns

In [None]:
# Keep the row identifiers for the submission file.
idx = X_test['id']

# Predict on everything except `id`. The drop is not in-place, so the frame
# bound to X_test is left intact and this cell can be re-run; an in-place drop
# raises KeyError on the second run.
y_pred = ensembler.predict(X_test.drop(columns=['id']))

In [None]:
# Build the two-column submission frame (id first, then the predicted label).
# `idx.values` pairs ids with predictions by position. Assigning the Series
# directly would align on index labels and yield NaN ids whenever the labels of
# `idx` are not 0..n-1.
y_pred = pd.DataFrame({'id': idx.values, 'status_group': y_pred})
y_pred = y_pred[['id', 'status_group']]

In [None]:
# Print the column dtypes and non-null counts of the submission frame.
y_pred.info()

In [None]:
# Write the submission file. index=False keeps the row index out of the CSV;
# otherwise it is written as an extra unnamed first column ahead of
# id/status_group. y_pred is already a DataFrame, so no re-wrapping is needed.
y_pred.to_csv("submission_clf_4.csv", index=False)