In [1]:
## imports here !!!
import numpy as np
import pandas as pd
import xgboost as xgb


from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

from sklearn.impute import SimpleImputer


In [2]:
# Importing data: read the train and test CSV files from the working directory.
train_data = pd.read_csv('iith_foml_2020_train.csv')
test_data = pd.read_csv('iith_foml_2020_test.csv')

# `train` / `test` are DataFrame wrappers built from the raw frames read above.
train, test = pd.DataFrame(train_data), pd.DataFrame(test_data)

# Preview the first rows of the training split.
train.head()


Unnamed: 0,Feature 1 (Discrete),Feature 2 (Discrete),Feature 3 (Discrete),Feature 4 (Discrete),Feature 5 (Discrete),Feature 6 (Discrete),Feature 7 (Discrete),Feature 8 (Discrete),Feature 9,Feature 10,...,Feature 16,Feature 17,Feature 18,Feature 19 (Discrete),Feature 20 (Discrete),Feature 21 (Discrete),Feature 22 (Discrete),Feature 23 (Discrete),Feature 24,Target Variable (Discrete)
0,1404,12,64,14,3,1,1,1,110.502,35775.2,...,,,15.04,104,12,2,32,1409,37677.1,1
1,909,0,235,32,1,1,1,1,-40.448,35779.4,...,2200.3,4900.005,12.03,20,1,0,13,909,25239.1,1
2,654,3,175,2,1,1,1,1,-27.445,35770.4,...,1973.3,10000.004,13.01,1,1,0,13,654,27683.5,1
3,1372,12,382,14,2,0,1,0,0.001,509.2,...,,,,313,12,10,54,1377,39363.2,0
4,786,3,199,2,1,0,1,0,0.001,612.1,...,,,,171,1,5,11,786,40044.4,2


### Analysis of missing values...


In [3]:

def Summarise_missing_values(new_data):
    """Print the percentage of missing entries for every incomplete column.

    Only columns of ``new_data`` that contain at least one null value are
    reported. The percentage is the null count divided by the number of rows,
    rounded to two decimals. Nothing is returned; the report goes to stdout.
    """
    # Collect the names of the columns that hold at least one null.
    incomplete_columns = []
    for column_name in new_data.columns:
        if new_data[column_name].isnull().any():
            incomplete_columns.append(column_name)

    null_counts = new_data[incomplete_columns].isnull().sum()
    total_rows = new_data.shape[0]

    print("Missing values in percent")
    # Report the results as a percentage of all rows.
    percent_missing = (null_counts / total_rows) * 100
    print(round(percent_missing, 2))
    print("######################################################################################")

# Print the missing-value report for each split, preceded by a banner line.
for _banner, _frame in (
    ("****************************For TRAIN data", train),
    ("****************************For Test data", test),
):
    print(_banner)
    # The summary receives a copy of the split, not the split itself.
    Summarise_missing_values(_frame.copy())


****************************For TRAIN data
Missing values in percent
Feature 9      1.41
Feature 10     0.10
Feature 11     0.10
Feature 12     0.10
Feature 13     0.10
Feature 14     0.10
Feature 15     7.24
Feature 16    67.30
Feature 17    54.93
Feature 18    33.20
Feature 24     0.10
dtype: float64
######################################################################################
****************************For Test data
Missing values in percent
Feature 9      0.94
Feature 14     0.47
Feature 15     7.28
Feature 16    65.49
Feature 17    52.82
Feature 18    26.76
dtype: float64
######################################################################################


### Filling missing values with mean imputation

In [7]:

def imputing_missing_values(df):
    """Return a copy of ``df`` with missing values replaced by column means.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame that may contain NaN entries. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Frame with the same columns (same order) and the same index as
        ``df``. Every column is returned as float because the values pass
        through a single NumPy array. Columns that contain no observed value
        at all have no mean to impute and are returned as all-NaN.
    """
    # SimpleImputer drops columns without any observed value, which would
    # make the output narrower than df.columns; set those columns aside.
    has_observation = df.notna().any(axis=0).to_numpy()
    imputable = df.loc[:, has_observation]

    # Define the imputer, then fit and transform the dataset in one step.
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    transformed_values = imputer.fit_transform(imputable.to_numpy())

    # Keep the caller's index so the result stays aligned with the input.
    imputed_data = pd.DataFrame(
        data=transformed_values,
        columns=imputable.columns,
        index=df.index,
    )
    # Re-insert the all-NaN columns (as NaN) in their original positions.
    return imputed_data.reindex(columns=df.columns)


# Impute both splits. Each call fits its own imputer, so the test split is
# filled with test-set column means rather than training-set means.
imputed_train, imputed_test = map(imputing_missing_values, (train, test))

imputed_train.head()

Unnamed: 0,Feature 1 (Discrete),Feature 2 (Discrete),Feature 3 (Discrete),Feature 4 (Discrete),Feature 5 (Discrete),Feature 6 (Discrete),Feature 7 (Discrete),Feature 8 (Discrete),Feature 9,Feature 10,...,Feature 16,Feature 17,Feature 18,Feature 19 (Discrete),Feature 20 (Discrete),Feature 21 (Discrete),Feature 22 (Discrete),Feature 23 (Discrete),Feature 24,Target Variable (Discrete)
0,1404.0,12.0,64.0,14.0,3.0,1.0,1.0,1.0,110.502,35775.2,...,1329.779692,3814.420516,15.04,104.0,12.0,2.0,32.0,1409.0,37677.1,1.0
1,909.0,0.0,235.0,32.0,1.0,1.0,1.0,1.0,-40.448,35779.4,...,2200.3,4900.005,12.03,20.0,1.0,0.0,13.0,909.0,25239.1,1.0
2,654.0,3.0,175.0,2.0,1.0,1.0,1.0,1.0,-27.445,35770.4,...,1973.3,10000.004,13.01,1.0,1.0,0.0,13.0,654.0,27683.5,1.0
3,1372.0,12.0,382.0,14.0,2.0,0.0,1.0,0.0,0.001,509.2,...,1329.779692,3814.420516,2061.806852,313.0,12.0,10.0,54.0,1377.0,39363.2,0.0
4,786.0,3.0,199.0,2.0,1.0,0.0,1.0,0.0,0.001,612.1,...,1329.779692,3814.420516,2061.806852,171.0,1.0,5.0,11.0,786.0,40044.4,2.0


In [5]:
# --- Dividing into X, Y ------------------------------------------------------
# Mean imputation returns every column as float, so the discrete class labels
# are cast back to integers before being handed to the classifier.
Y = imputed_train['Target Variable (Discrete)'].astype(int)
X = imputed_train.drop('Target Variable (Discrete)', axis=1)

# --- Training with stratified K-fold -----------------------------------------
# Outer loop: 10 stratified, shuffled folds with a fixed seed.
kf = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

# Set of trial values for min_child_weight.
param_grid = [{'min_child_weight': np.arange(0.1, 10.1, 2)}]

# NOTE: every outer fold runs a full 10-fold grid search (one fit per grid
# value and inner fold, plus a refit), so this cell is expensive to run.
for i, (train_index, test_index) in enumerate(kf.split(X, Y), start=1):
    print('\n{} of kfold {}'.format(i, kf.n_splits))

    # kf.split yields positional indices, so rows are selected by position.
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = Y.iloc[train_index], Y.iloc[test_index]

    # The `iid` argument was removed from GridSearchCV in scikit-learn 0.24;
    # passing it raises a TypeError on current releases.
    model = GridSearchCV(XGBClassifier(), param_grid, cv=10, scoring='f1_macro')
    model.fit(xtr, ytr)

    print(model.best_params_)

    # predict() already returns class labels, so no rounding is needed.
    y_pred = model.predict(xvl)

    print('accuracy_score', accuracy_score(yvl, y_pred) * 100)

# After the loop `model` is the grid search fitted on the last outer fold;
# the submission cell below reuses it.





1 of kfold 10


KeyboardInterrupt: 

In [72]:
def predict_and_submission(test_data, model, title):
    """Predict labels for ``test_data`` and write them to a submission CSV.

    Parameters
    ----------
    test_data : pandas.DataFrame or anything ``pandas.DataFrame`` accepts
        Feature rows to predict on.
    model : object
        Fitted estimator exposing ``predict``; its output may be any
        array-like of labels.
    title : str
        Path of the CSV file to write.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``Id`` (1-based row number) and ``Category`` (the
        predicted label cast to int). The same table is written to ``title``
        without the index column.
    """
    features = pd.DataFrame(test_data)
    # np.asarray lets list-returning models work with .size / .astype below.
    predictions = np.asarray(model.predict(features))

    # Build the table directly in the column order written to the file.
    submission = pd.DataFrame({
        'Id': np.arange(1, predictions.size + 1),
        'Category': predictions.astype(int),
    })
    submission.to_csv(title, index=False)
    return submission

# Change `title` for every new submission so earlier files are not overwritten.
# The model was fitted on mean-imputed features, so predict on the imputed
# test split rather than on the raw frame that still contains NaNs.
output = predict_and_submission(imputed_test, model, title="S_002.csv")
output.head()

Unnamed: 0,Id,Category
0,1,6
1,2,4
2,3,1
3,4,1
4,5,1
