In [212]:
#Importing Libraries
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score
from sklearn.model_selection import cross_val_score, cross_val_predict
import os

In [213]:
# Folders (relative to the parent directory) holding the raw input data and the results
raw_data_path, result_path = (
    os.path.join(os.path.pardir, 'Documents', folder_name)
    for folder_name in ('Dataset', 'novartis_results')
)
# Full paths of the training and test CSV files inside the raw data folder
train_data_path, test_data_path = (
    os.path.join(raw_data_path, file_name)
    for file_name in ('train.csv', 'test.csv')
)

In [214]:
# Load the training and test files, both indexed by INCIDENT_ID.
# The model is trained on the train file; predictions are made for the test file.
train_df, test_df = (
    pd.read_csv(csv_path, index_col='INCIDENT_ID')
    for csv_path in (train_data_path, test_data_path)
)

In [215]:
# Placeholder target (-999) for the test rows, so that train and test can be
# concatenated and processed together.
test_df.loc[:, 'MULTIPLE_OFFENSE'] = -999

In [216]:
# Stack the training rows and the test rows into one frame (row-wise concatenation)
df = pd.concat([train_df, test_df])

In [217]:
#Summary of the combined dataset: column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39759 entries, CR_102659 to CR_33545
Data columns (total 17 columns):
DATE                39759 non-null object
X_1                 39759 non-null int64
X_2                 39759 non-null int64
X_3                 39759 non-null int64
X_4                 39759 non-null int64
X_5                 39759 non-null int64
X_6                 39759 non-null int64
X_7                 39759 non-null int64
X_8                 39759 non-null int64
X_9                 39759 non-null int64
X_10                39759 non-null int64
X_11                39759 non-null int64
X_12                39450 non-null float64
X_13                39759 non-null int64
X_14                39759 non-null int64
X_15                39759 non-null int64
MULTIPLE_OFFENSE    39759 non-null int64
dtypes: float64(1), int64(15), object(1)
memory usage: 5.5+ MB


In [218]:
# Extract calendar features (month, year, day, day of week, week number)
# from the DATE column.
df['DATE_PARSED'] = pd.to_datetime(df['DATE'])
date_parts = df['DATE_PARSED'].dt
df['Month'] = date_parts.month
df['year'] = date_parts.year
df['day'] = date_parts.day
df['day_of_week'] = date_parts.dayofweek
# Series.dt.week was deprecated in pandas 1.1 and removed in pandas 2.0.
# isocalendar().week yields the same ISO week number; it is cast to int64 so
# the column keeps the plain integer dtype the old accessor produced.
# Older pandas versions without isocalendar() fall back to the original accessor.
if hasattr(date_parts, 'isocalendar'):
    df['week'] = date_parts.isocalendar().week.astype('int64')
else:
    df['week'] = date_parts.week

In [219]:
# The raw and parsed date columns are no longer needed once the parts are extracted
df.drop(['DATE', 'DATE_PARSED'], axis=1, inplace=True)

In [220]:
# Put the target column MULTIPLE_OFFENSE after all the other columns
df = df.loc[:, [name for name in df.columns if name != 'MULTIPLE_OFFENSE'] + ['MULTIPLE_OFFENSE']]
df.head(2)

Unnamed: 0_level_0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,...,X_12,X_13,X_14,X_15,Month,year,day,day_of_week,week,MULTIPLE_OFFENSE
INCIDENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CR_102659,0,36,34,2,1,5,6,1,6,1,...,1.0,92,29,36,7,2004,4,6,27,0
CR_189752,1,37,37,0,0,11,17,1,6,1,...,1.0,103,142,34,7,2017,18,1,29,1


In [221]:
# Show every row that has at least one missing value
df.loc[df.isna().any(axis='columns')]

Unnamed: 0_level_0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,...,X_12,X_13,X_14,X_15,Month,year,day,day_of_week,week,MULTIPLE_OFFENSE
INCIDENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CR_189499,0,36,34,2,1,1,0,1,5,1,...,,103,93,43,2,2017,10,4,6,1
CR_186311,0,9,10,7,3,2,7,2,5,1,...,,10,80,0,11,2017,22,2,47,1
CR_189730,5,36,34,2,1,18,13,1,5,1,...,,103,87,43,10,2017,7,5,40,1
CR_189713,5,36,34,2,1,13,18,1,5,1,...,,92,29,43,3,2017,4,5,9,1
CR_196035,0,33,32,2,1,7,1,0,5,1,...,,111,93,43,11,2018,29,3,48,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CR_196615,5,36,34,2,1,18,13,1,5,1,...,,92,93,43,1,2018,30,1,5,-999
CR_193485,1,9,10,7,3,15,10,0,5,1,...,,72,93,46,5,2018,7,0,19,-999
CR_188971,0,33,32,2,1,6,4,1,6,1,...,,112,93,43,5,2017,16,1,20,-999
CR_189567,0,36,34,2,1,1,0,1,5,1,...,,92,93,0,2,2017,22,2,8,-999


In [222]:
# Frequency of each distinct value in X_12
df['X_12'].value_counts()

1.0     26204
0.0      8517
2.0      3420
3.0       797
4.0       276
5.0       101
6.0        59
8.0        18
7.0        14
10.0       11
9.0         9
11.0        6
20.0        3
12.0        2
14.0        2
40.0        2
15.0        2
90.0        1
18.0        1
16.0        1
58.0        1
50.0        1
17.0        1
30.0        1
Name: X_12, dtype: int64

In [223]:
# After exploring the data: impute missing X_12 values with the X_10 value
# of the same row.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df.X_12: that chained in-place call acts on an
# intermediate Series, triggers a chained-assignment warning in recent pandas
# and leaves df unchanged when Copy-on-Write is enabled.
df['X_12'] = df['X_12'].fillna(df['X_10'])

In [224]:
# Reorder columns so that the target MULTIPLE_OFFENSE is the last one
columns = list(df.columns)
columns.remove('MULTIPLE_OFFENSE')
columns.append('MULTIPLE_OFFENSE')
df = df.loc[:, columns]

In [225]:
# After processing, separate the combined frame back into train and test.
# Test rows are the ones carrying the -999 placeholder target.
is_test_row = df['MULTIPLE_OFFENSE'] == -999
# train data: every row with a real target value (target column kept)
train = df.loc[~is_test_row]
# test data: placeholder rows, without the target column
columns = [name for name in df.columns if name != 'MULTIPLE_OFFENSE']
test = df.loc[is_test_row, columns]

In [226]:
# Split the training frame into the dependent variable (output) y
# and the independent variables (input) X
y = train['MULTIPLE_OFFENSE']
X = train.drop('MULTIPLE_OFFENSE', axis=1)

In [246]:
# Build the gradient-boosted tree classifier and fit it on the full training data
from xgboost import XGBClassifier

clf = XGBClassifier(
    booster='gbtree',
    max_depth=6,
    learning_rate=0.3,
    gamma=0.5,
    subsample=1,
    colsample_bytree=0.5,
    scale_pos_weight=0.9,
)
clf.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=0.9, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [247]:
# K-fold cross-validation of the classifier with K=6
scores = cross_val_score(estimator=clf, X=X, y=y, cv=6)
print("Cross-validated scores:", scores)

Cross-validated scores: [0.99924547 0.99949698 1.         1.         0.99924547 0.99924547]


In [248]:
# Cross-validated predictions for the training rows (6 folds)
k_predictions = cross_val_predict(estimator=clf, X=X, y=y, cv=6)
print(k_predictions)

[0 1 1 ... 1 1 1]


In [249]:
# Mean score over the k-fold models.
# `scores` already holds the result of the identical
# cross_val_score(clf, X, y, cv=6) call made in the K-fold cell above, so it is
# reused here instead of refitting six more models for the same numbers.
kfold_scores = scores
print(kfold_scores.mean())

0.999538900067069


In [250]:
# Accuracy of the cross-validated predictions against the true training labels
model_accuracy = accuracy_score(y_true=y, y_pred=k_predictions)
print('Accuracy of model is: {0:.3f}'.format(model_accuracy))

Accuracy of model is: 1.000


In [251]:
# Confusion matrix of the cross-validated predictions
model_confusion = confusion_matrix(y_true=y, y_pred=k_predictions)
print('confusion of model is: {0}'.format(model_confusion))

confusion of model is: [[ 1060     8]
 [    3 22785]]


In [252]:
# Precision and recall of the cross-validated predictions
model_precision = precision_score(y_true=y, y_pred=k_predictions)
print('precision of model is: {0}'.format(model_precision))
model_recall = recall_score(y_true=y, y_pred=k_predictions)
print('recall of model is: {0}'.format(model_recall))

precision of model is: 0.9996490150484798
recall of model is: 0.9998683517640864


In [253]:
#Predict the target for the test rows with clf, which was fitted on the full training data (X, y)
test_prediction = clf.predict(test)

In [254]:
# Submission layout: one row per test incident with its predicted MULTIPLE_OFFENSE
submission_columns = {
    'INCIDENT_ID': test.index,
    'MULTIPLE_OFFENSE': test_prediction,
}
df_submission = pd.DataFrame(submission_columns)

In [255]:
# Preview the first five rows to check the submission format
df_submission.head(n=5)

Unnamed: 0,INCIDENT_ID,MULTIPLE_OFFENSE
0,CR_195453,1
1,CR_103520,1
2,CR_196089,1
3,CR_112195,1
4,CR_149832,1


In [256]:
# Full path of the submission file inside the results folder
submission_file_name = 'xgboost8.csv'
submission_file_path = os.path.join(result_path, submission_file_name)

In [257]:
# Save the submission as CSV without the DataFrame index.
# The target directory is created first (no error if it already exists) so
# that the write does not fail on a machine where the results folder is missing.
output_dir = os.path.dirname(submission_file_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df_submission.to_csv(submission_file_path, index=False)

###### ----------------------------------------------------------------END---------------------------------------------------------

In [236]:
#Hyper-parameter optimization: base estimator (fixed random_state) that is passed to the grid search below
#NOTE(review): the trailing dict looks like a best-parameter result noted from an earlier search run -- confirm
xgb=XGBClassifier(random_state=0) #{'colsample_bytree': 0.5, 'gamma': 0.5, 'max_depth': 6, 'subsample': 1}|max_depth=5

In [237]:
from sklearn.model_selection import GridSearchCV

# Parameter grid: every setting is pinned to a single value except
# scale_pos_weight, which is searched over 0.8 and 0.9
parameters = dict(
    gamma=[0.5],
    max_depth=[6],
    learning_rate=[0.3],
    subsample=[1],
    colsample_bytree=[0.5],
    booster=['gbtree'],
    scale_pos_weight=[0.8, 0.9],
)
# Exhaustive search over the grid with 3-fold cross-validation
clf_new = GridSearchCV(estimator=xgb, param_grid=parameters, cv=3)

In [238]:
#Run the grid search on the training features X and target y
clf_new.fit(X,y)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constrai...
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=None,
                                     validate_parameters=None, verbosity=None),
             iid='warn', n_jobs=None,
             param_grid={'booster': ['gbtree'], 'colsam

In [239]:
#Parameter combination selected by the grid search
clf_new.best_params_

{'booster': 'gbtree',
 'colsample_bytree': 0.5,
 'gamma': 0.5,
 'learning_rate': 0.3,
 'max_depth': 6,
 'scale_pos_weight': 0.8,
 'subsample': 1}