In [None]:
## import libraries

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt 

##graphics
import seaborn as sns 

## math and scikit-learn
import math
import sklearn

from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, balanced_accuracy_score, roc_auc_score, make_scorer, confusion_matrix, plot_confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score, RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from imblearn.over_sampling import SMOTE

import xgboost as xgb



**no need to run those cells each time**

In [None]:
# Mount Google Drive at /content/drive so the CSV files under
# /content/drive/MyDrive can be read by the cells below.
# The google.colab module is provided by the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Loading the data into separate arrays (features, labels and Ids)

In [None]:
## loading the data
def loadfile(name):
    """Read a comma-separated file and split it into features, labels and Ids.

    The file must contain the columns "Id", "label" and "Feature0".
    "Id" is parsed as a string. "Id", "label" and "Feature0" are removed
    from the feature matrix; every remaining column is kept as a feature.

    Parameters
    ----------
    name : str
        Path of the CSV file to read.

    Returns
    -------
    tuple
        (features, labels, Ids) as numpy arrays, one row/entry per CSV row.
    """
    with open(name) as handle:
        frame = pd.read_csv(handle, delimiter=",", dtype={"Id": str})

    labels = frame["label"].values
    Ids = frame["Id"].values
    # Everything that is not an identifier, the target or Feature0 is a feature.
    features = frame.drop(columns=["Id", "label", "Feature0"]).values
    return (features, labels, Ids)

# Both CSVs go through loadfile, which reads the "Id", "label" and "Feature0"
# columns, so the test file must contain a "label" column as well.
train_data, train_labels, train_Ids = loadfile("/content/drive/MyDrive/train.csv")
test_data, test_labels, test_Ids = loadfile("/content/drive/MyDrive/test.csv")


Loading the data into a DataFrame in order to better understand it



*  dropping Feature0 - all values are 0.




In [None]:
## describing the data
# Load both CSVs as DataFrames without Feature0, then compute the pairwise
# column correlations of the training frame for the heatmap.
train_Data_1 = pd.read_csv('/content/drive/MyDrive/train.csv').drop(columns='Feature0')
test_Data_1 = pd.read_csv('/content/drive/MyDrive/test.csv').drop(columns='Feature0')
corr = train_Data_1.corr()

In [None]:
# Correlation heatmap of the training columns; the colour scale is centred
# on 0 and capped at 0.3.
f, ax = plt.subplots(figsize=(12, 12))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(
    corr,
    ax=ax,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    cbar_kws={"shrink": .5},
    annot=False,
)
plt.show()

In [None]:
# Split the training arrays into train and validation sets: 20% of the rows
# are held out, stratified by label, with a fixed seed.
x_train, x_validation, y_train, y_validation = train_test_split(
    train_data,
    train_labels,
    test_size=0.2,
    stratify=train_labels,
    shuffle=True,
    random_state=42,
)
# (features, labels) pairs passed to XGBoost as eval_set during fitting.
evalset = [(x_train, y_train), (x_validation, y_validation)]


## Building the XGBoost model

In [None]:
# Build the XGBoost classifier used for fitting, grid search and submission.
# Earlier configuration, kept for reference:
#model = xgb.XGBClassifier(max_depth=10, eta=0.1, objective='binary:logistic',epochs=100, n_rounds=50, verbosity=1)
model = xgb.XGBClassifier(
    # NOTE(review): both `alpha` and `reg_alpha` are passed with different
    # values; XGBoost documents them as aliases — confirm which one is intended.
    alpha=0.5,
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=0.8,
    colsample_bynode=0.8,
    colsample_bytree=0.4,
    early_stopping_rounds=50,
    gamma=0.01,
    eta=0.2,
    max_delta_step=0,
    max_depth=10,
    min_child_weight=2,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    nthread=None,
    objective='binary:logistic',
    # NOTE(review): `random_state` and `seed` are both set, to different
    # values — confirm which one the installed xgboost version honours.
    random_state=500,
    reg_alpha=0.05,
    reg_lambda=2,
    scale_pos_weight=1,
    seed=0,
    silent=None,
    subsample=0.5,
    verbosity=1,
)

## Fitting the model to the data
Eventually I trained the model on the whole training set because it gave me much better results.

In [None]:
# Fit on the full training set while recording AUC on both entries of evalset.
# NOTE(review): x_validation was split off train_data, so the rows in the
# second evalset entry are also being fitted on; the AUC reported for it is
# not an out-of-sample estimate.
model.fit(train_data,train_labels, eval_metric='auc', verbose=True,eval_set=evalset)


## evaluating the model after tuning parameters

## Using GridSearchCV for optimizing parameters

## Round 1
*  'max_depth': [3, 5, 10]
*  'learning_rate':[0.1, 0.3, 0.5]
*  'colsample_bytree':[0.1, 0.3, 0.5]
*  'gamma':[0.001, 0.0001]
*  'alpha' :[3,5,10]

**output**: Best parameters set found on development set:
{'alpha': 3, 'colsample_bytree': 0.5, 'gamma': 0.001, 'learning_rate': 0.1, 'max_depth': 10}

## Round 2
*   'reg_lambda' : [0.0, 1.0 , 10.0]
*   'scale_pos_weight' : [1,3,5]
*   'alpha' : [1,3]
*   'gamma':[0.01, 0.001, 0.006]
*   'colsample_bytree':[0.5, 0.7]
*   'learning_rate':[0.1]
*   'max_depth': [3, 5, 10]
*   n_jobs = 10
*   cv = 3
*   'reg_alpha':[0.0, 1.0 , 10.0]

**output**: (alpha=3, base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=0.5, gamma=0.006,
learning_rate=0.1, max_delta_step=0, max_depth=3,
min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
nthread=None, objective='binary:logistic', random_state=0,
reg_alpha=0.0, reg_lambda=0.0, scale_pos_weight=1, seed=None,
silent=None, subsample=1, verbosity=1)

## Round 3
*   'reg_lambda' : [0.0, 1.0 , 10.0]
*   'scale_pos_weight' : [1, 10, 100]
*   'alpha' : [1,3]
*   'gamma':[ 0.006, 0.009]
*   'colsample_bytree':[0.5]
*   'learning_rate':[0.1]
*   'max_depth': [3, 5, 10]
*   n_jobs = 10
*   cv = 3

**output**: (alpha=1, base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.5, gamma=0.0009,
learning_rate=0.1, max_delta_step=0, max_depth=3,
min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
nthread=None, objective='binary:logistic', random_state=0,
reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=1, seed=None,
silent=None, subsample=1, verbosity=1)

## Round 4
*   'alpha' : [0.5,1]
*   'min_child_weight' : [1, 3, 5]
*   'max_delta_step' : [0, 1, 2]
*   n_jobs = -1
*   cv = cv_k
*   seed : [0, 7, 42]

**output:** (*alpha=0.5*, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.0009,
              learning_rate=0.1, *max_delta_step=0*, max_depth=3,
              *min_child_weight=3*, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=1, *seed=42*,
              silent=None, subsample=1, verbosity=1)

## Round 5
*    'n_estimators' = [100,500,1000]
*    'early_stopping_rounds' = [10, 20]
*    'random_state' = [0, 10, 100]

**output:** alpha=1, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5,
              early_stopping_rounds=10, gamma=0.0009, learning_rate=0.1,
              max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
              n_estimators=100, n_jobs=1, nthread=None,
              objective='binary:logistic', random_state=100, reg_alpha=0.0,
              reg_lambda=1.0, scale_pos_weight=1, seed=None, silent=None,
              subsample=1, verbosity=1)

## Round 6
*   'min_child_weight' : [2,3,4]
*   'gamma':[ 0.01, 0.009]
*   'colsample_bytree':[0.4,0.5]
*   'subsample':[0.5, 0.7,1]
*   'random_state' = [100, 1000, 500]
*   'reg_lambda' : [0.5, 1.0 , 5]

**output**: alpha=1, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4,
              early_stopping_rounds=10, gamma=0.01, learning_rate=0.1,
              max_delta_step=0, max_depth=3, min_child_weight=4, missing=None,
              n_estimators=100, n_jobs=1, nthread=None,
              objective='binary:logistic', random_state=500, reg_alpha=0.0,
              reg_lambda=1.0, scale_pos_weight=1, seed=None, silent=None,
              subsample=0.5, verbosity=1)

## for later
'colsample_bylevel':[1,0.5,0.8],
              'colsample_bynode':[1,0.5,0.8], 'colsample_bytree':[0.4,0.5],'subsample':[0.4,0.5]


In [None]:
# Hyper-parameter grid explored by GridSearchCV (every combination is fitted).
# The tree-count key must be spelled 'n_estimators': a misspelled key is not a
# real XGBClassifier parameter, so the search would never vary the number of
# trees and would only multiply the number of fits.
param_grid = {
    'gamma': [0.01, 0.05, 0.007],
    'early_stopping_rounds': [50, 70],
    'reg_alpha': [0.1, 0.05],
    'colsample_bylevel': [0.5, 0.8],
    'colsample_bynode': [0.5, 0.8],
    'colsample_bytree': [0.4, 0.5],
    'subsample': [0.6, 0.5],
    'max_depth': [3, 5, 6],
    'n_estimators': [100, 500],
}


## Tuning parameters using GridSearchCV - very slow

In [None]:
# Exhaustive 3-fold cross-validated search over param_grid, scored by ROC AUC.
search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
search.fit(x_train, y_train)
# Last expression of the cell: display the best estimator found.
search.best_estimator_

# Creating the submission file
The submission file contains a table with the proteins (Id) and the predicted probability from the XGBoost classifier.

In [None]:

def create_submission_file(filename, model, test_data, Ids):
    """Write a two-column submission CSV ("Id", "prediction").

    Parameters
    ----------
    filename : str
        Path of the CSV file to write.
    model : object
        Classifier exposing ``predict_proba``; column 1 of its output is
        used as the prediction.
    test_data : array-like
        Feature matrix passed to ``model.predict_proba``.
    Ids : array-like
        Identifiers, one per row of ``test_data``.
    """
    class_one_proba = model.predict_proba(test_data)[:, 1]
    pd.DataFrame({'Id': Ids, 'prediction': class_one_proba}).to_csv(filename, index=False)

# Write `model`'s predictions for the test set to the submission CSV.
# NOTE(review): the "1NN" in the file name does not match the XGBoost model
# used in this notebook — confirm the intended name.
create_submission_file("mysubmission-1NN.csv", model, test_data, test_Ids)

In [None]:
# Evaluation history recorded for each eval_set entry during model.fit.
results = model.evals_result()

# plot learning curves
fig, ax = plt.subplots()
# 'validation_0' is the first eval_set entry, i.e. (x_train, y_train).
ax.plot(results['validation_0']['auc'], label='train')
#ax.plot(results['validation_1']['auc'], label='test')
ax.legend()
plt.show()
