In [1]:
print('[Info]: Importing Libraries')

import numpy as np
import pandas as pd
import lightgbm as lgb
from pandas import DataFrame
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

[Info]: Importing Libraries


In [2]:
# Show every column when a frame is displayed. The row limit keeps its default;
# uncomment the next line to lift it as well.
# pd.set_option('display.max_rows', None)
pd.options.display.max_columns = None

In [3]:
# Helper Functions

def display_missing_data(data_frame):
    """Return, per column, the percentage of rows whose value is null.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column label; each value is ``null_count / row_count * 100``.
    """
    row_count = len(data_frame)
    null_counts = data_frame.isnull().sum()
    return null_counts / row_count * 100

def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    The pairwise correlation matrix of ``dataset`` is computed with
    ``DataFrame.corr()``. A column is reported when the absolute correlation
    between it and any column positioned before it is strictly greater than
    ``threshold``; the first column of a correlated pair is never reported
    on account of that pair.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared.
    threshold : float
        Absolute-correlation cut-off (exclusive).

    Returns
    -------
    set
        Labels of the columns flagged as correlated.
    """
    corr_matrix = dataset.corr()
    # |r| > threshold, restricted to the strict lower triangle: every pair
    # (row i, column j) with j < i is considered once and the diagonal is
    # skipped. NaN entries compare as False and are therefore never flagged.
    above_cutoff = np.tril(corr_matrix.abs().values > threshold, k=-1)
    flagged_rows = above_cutoff.any(axis=1)
    return set(corr_matrix.columns[flagged_rows])

In [4]:
print('[Info]: Reading the Data\n')

# Load both splits from the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Cell output: (rows, columns) of each split.
train_data.shape, test_data.shape

[Info]: Reading the Data



((85741, 133), (28580, 132))

In [5]:
print('[Info]: Separating Depending and Independent Varibales')

# Training split: the label is kept aside, and both the label and the
# identifier column are removed from the feature frame.
y = train_data['target_label']
X_train = train_data.drop(columns=['target_label', 'IDs'])

# Test split: only the identifier column is removed from the features.
IDs = test_data['IDs']
X_tst = test_data.drop(columns=['IDs'])

# The test identifiers are written to disk (without the index).
IDs.to_csv('./ID.csv', index=False)

X_train.shape, y.shape, X_tst.shape

[Info]: Separating Depending and Independent Varibales


((85741, 131), (85741,), (28580, 131))

In [6]:
print('[Info]: Separating Numerical Data \n')

# Keep only the numeric-dtype columns of each feature frame.
numerical_data_trn = X_train.select_dtypes(include='number')
numerical_data_tst = X_tst.select_dtypes(include='number')

numerical_data_trn.shape, numerical_data_tst.shape

[Info]: Separating Numerical Data 



((85741, 112), (28580, 112))

### Removing similar features based on correlation

In [7]:
# Flag every numeric training column whose absolute correlation with an
# earlier column exceeds 0.8.
corr_features = correlation(dataset=numerical_data_trn, threshold=0.8)

In [8]:
# Remove the flagged columns from both splits. The flags were derived from the
# training data only and are applied unchanged to the test data.
X_train = numerical_data_trn.drop(columns=corr_features)
X_test = numerical_data_tst.drop(columns=corr_features)

(X_train.shape, X_test.shape)

((85741, 54), (28580, 54))

In [9]:
# Missing numeric values are replaced by the sentinel -999 in both splits.
X_train, X_test = (frame.fillna(value=-999) for frame in (X_train, X_test))

### Categorical Data Creation

In [10]:
# Split off the non-numeric columns of each raw frame.
category_trn = train_data.select_dtypes(exclude='number')
category_tst = test_data.select_dtypes(exclude='number')

print('Shape of category train :', category_trn.shape)
print('Shape of category test :', category_tst.shape)

Shape of category train : (85741, 19)
Shape of category test : (28580, 19)


In [11]:
# x52, x54 and x56 are left out of the categorical features in both splits.
category_trn, category_tst = (
    frame.drop(columns=['x52', 'x54', 'x56'])
    for frame in (category_trn, category_tst)
)

In [12]:
# Most frequent value of every training column (first one when several tie).
# Filling the training gaps with it leaves that value the most frequent, so
# the same row of modes is valid for imputing the test split afterwards.
train_modes = category_trn.mode().iloc[0]
category_trn = category_trn.fillna(train_modes)
category_tst = category_tst.fillna(train_modes)

category_trn.shape, category_tst.shape

((85741, 16), (28580, 16))

In [13]:
# Creating Dummy Variable for Training Data
# Each categorical column is one-hot encoded on its own (prefix = column name,
# first level dropped). The pieces are collected first and joined with a single
# concat, instead of re-copying a growing frame on every loop iteration.
dummy_trn_parts = [
    pd.get_dummies(data=category_trn[col], prefix=col, drop_first=True)
    for col in category_trn
]
# pd.concat rejects an empty list, so fall back to an empty frame when there
# are no categorical columns at all.
dummy_trn = pd.concat(dummy_trn_parts, axis=1) if dummy_trn_parts else pd.DataFrame()

In [14]:
# Creating Dummy Variable for Testing Data
# The column names are taken from the training frame so that both splits are
# encoded for the same set of categorical columns. The encoded pieces are
# joined with a single concat rather than by growing a frame inside a loop.
dummy_tst_parts = [
    pd.get_dummies(data=category_tst[col], prefix=col, drop_first=True)
    for col in category_trn
]
# pd.concat rejects an empty list, so fall back to an empty frame when there
# are no categorical columns at all.
dummy_tst = pd.concat(dummy_tst_parts, axis=1) if dummy_tst_parts else pd.DataFrame()

In [15]:
# Align the dummy columns of both splits. The column labels themselves are
# compared (not only their count), so frames that have equally many but
# different, or differently ordered, columns are aligned as well.
if list(dummy_trn.columns) != list(dummy_tst.columns):
    print(len(dummy_trn.columns), len(dummy_tst.columns))
    print('\ndummy variable length not equal' + '\n' +  'creating equal length\n')
    # Shared columns, kept in training order. A list is used because a set is
    # not accepted as a DataFrame indexer by pandas >= 2.0 and, where it is
    # accepted, yields a column order that can change between kernel sessions.
    tst_columns = set(dummy_tst.columns)
    intesection = [col for col in dummy_trn.columns if col in tst_columns]
    dummy_trn = dummy_trn[intesection]
    dummy_tst = dummy_tst[intesection]
    print(len(dummy_trn.columns), len(dummy_tst.columns))
else:
    print('Train and Test Sets have equal number of dummy Variables')

299 278

dummy variable length not equal
creating equal length

272 272


In [16]:
# Concatenating Numerical And Categorical Data

# The dummy columns are placed to the right of the numeric ones, split by split.
concat_trn = pd.concat((X_train, dummy_trn), axis='columns')
concat_tst = pd.concat((X_test, dummy_tst), axis='columns')

concat_trn.shape, concat_tst.shape

((85741, 326), (28580, 326))

#### Creating Model on the Combined Numerical and Categorical Data

In [17]:
# Submission File
class ClassPredictLGBM:
    """Train a LightGBM binary classifier and write its predictions to a CSV file."""

    def predict_lbm(self, df_train, df_test, y):
        """Fit a LightGBM model on ``df_train`` / ``y`` and score ``df_test``.

        A fixed 70/30 split (``random_state=8``) of the training data provides
        the validation set used for early stopping. The predictions for
        ``df_test`` are rounded to zero decimals and written, next to the
        contents of ``./ID.csv``, to ``./shekhar_submission.csv``.

        Parameters
        ----------
        df_train : feature table used for fitting and validation.
        df_test : feature table to predict on.
        y : labels matching the rows of ``df_train``.

        Returns
        -------
        None
            The result is the file written to disk.
        """
        # Hold out 30% of the training rows for validation.
        fit_X, holdout_X, fit_y, holdout_y = train_test_split(
            df_train, y, test_size=0.3, random_state=8)

        fit_set = lgb.Dataset(fit_X, label=fit_y)
        holdout_set = lgb.Dataset(holdout_X, label=holdout_y)

        # LightGBM settings: binary objective evaluated with log-loss.
        booster_params = dict(
            num_leaves=30,
            objective='binary',
            max_depth=7,
            learning_rate=.05,
            max_bin=50,
            metric='binary_logloss',
            verbose=-1,
            lambda_l1=0.01,
        )

        # Upper bound on boosting rounds; early stopping normally ends sooner.
        max_rounds = 5000

        print('[Info]: Model is getting Trained ...')

        # Training halts once the validation metric has not improved for 100
        # consecutive rounds.
        # NOTE(review): the verbose_eval / early_stopping_rounds keyword
        # arguments are not accepted by lgb.train in LightGBM 4.x, where the
        # callbacks API replaces them; confirm the installed version.
        booster = lgb.train(
            booster_params,
            train_set=fit_set,
            num_boost_round=max_rounds,
            valid_sets=holdout_set,
            verbose_eval=True,
            early_stopping_rounds=100,
        )

        print('[Info]: Model is predicting the Data ... \n')
        test_scores = booster.predict(df_test)

        # Submission: start from the saved identifiers and attach the rounded
        # predictions as the 'score' column.
        submission = pd.read_csv('./ID.csv')
        submission.index = submission['IDs']
        submission['score'] = test_scores.round(0)
        submission.to_csv('./shekhar_submission.csv', index=False)

        print('[Info]: Outputs are saved under shekhar_submission.csv file.')
        return None


# Instantiate the class and call its method: this trains the model on the
# concatenated training features and writes the submission file for the test set.
a = ClassPredictLGBM()
a.predict_lbm(concat_trn, concat_tst, y)

[Info]: Model is getting Trained ...
[1]	valid_0's binary_logloss: 0.542391
Training until validation scores don't improve for 100 rounds
[2]	valid_0's binary_logloss: 0.536374
[3]	valid_0's binary_logloss: 0.531028
[4]	valid_0's binary_logloss: 0.526264
[5]	valid_0's binary_logloss: 0.521964
[6]	valid_0's binary_logloss: 0.518069
[7]	valid_0's binary_logloss: 0.514567
[8]	valid_0's binary_logloss: 0.51122
[9]	valid_0's binary_logloss: 0.508179
[10]	valid_0's binary_logloss: 0.505388
[11]	valid_0's binary_logloss: 0.502929
[12]	valid_0's binary_logloss: 0.500678
[13]	valid_0's binary_logloss: 0.498562
[14]	valid_0's binary_logloss: 0.496593
[15]	valid_0's binary_logloss: 0.494781
[16]	valid_0's binary_logloss: 0.493181
[17]	valid_0's binary_logloss: 0.491745
[18]	valid_0's binary_logloss: 0.490353
[19]	valid_0's binary_logloss: 0.489024
[20]	valid_0's binary_logloss: 0.487835
[21]	valid_0's binary_logloss: 0.486696
[22]	valid_0's binary_logloss: 0.485645
[23]	valid_0's binary_logloss: 

[233]	valid_0's binary_logloss: 0.468359
[234]	valid_0's binary_logloss: 0.468288
[235]	valid_0's binary_logloss: 0.468297
[236]	valid_0's binary_logloss: 0.468297
[237]	valid_0's binary_logloss: 0.468285
[238]	valid_0's binary_logloss: 0.468325
[239]	valid_0's binary_logloss: 0.468282
[240]	valid_0's binary_logloss: 0.468281
[241]	valid_0's binary_logloss: 0.468265
[242]	valid_0's binary_logloss: 0.468264
[243]	valid_0's binary_logloss: 0.468265
[244]	valid_0's binary_logloss: 0.468285
[245]	valid_0's binary_logloss: 0.468287
[246]	valid_0's binary_logloss: 0.46827
[247]	valid_0's binary_logloss: 0.468249
[248]	valid_0's binary_logloss: 0.468233
[249]	valid_0's binary_logloss: 0.468231
[250]	valid_0's binary_logloss: 0.468228
[251]	valid_0's binary_logloss: 0.468241
[252]	valid_0's binary_logloss: 0.468213
[253]	valid_0's binary_logloss: 0.468224
[254]	valid_0's binary_logloss: 0.468214
[255]	valid_0's binary_logloss: 0.468221
[256]	valid_0's binary_logloss: 0.46816
[257]	valid_0's bi

[450]	valid_0's binary_logloss: 0.468049
[451]	valid_0's binary_logloss: 0.468046
[452]	valid_0's binary_logloss: 0.468041
[453]	valid_0's binary_logloss: 0.468054
[454]	valid_0's binary_logloss: 0.468025
[455]	valid_0's binary_logloss: 0.468029
[456]	valid_0's binary_logloss: 0.468026
[457]	valid_0's binary_logloss: 0.468013
[458]	valid_0's binary_logloss: 0.468033
[459]	valid_0's binary_logloss: 0.468055
[460]	valid_0's binary_logloss: 0.468064
[461]	valid_0's binary_logloss: 0.468053
[462]	valid_0's binary_logloss: 0.468058
[463]	valid_0's binary_logloss: 0.468068
[464]	valid_0's binary_logloss: 0.468089
[465]	valid_0's binary_logloss: 0.468077
[466]	valid_0's binary_logloss: 0.468057
[467]	valid_0's binary_logloss: 0.468063
[468]	valid_0's binary_logloss: 0.468062
[469]	valid_0's binary_logloss: 0.46805
[470]	valid_0's binary_logloss: 0.468057
[471]	valid_0's binary_logloss: 0.468066
[472]	valid_0's binary_logloss: 0.468074
[473]	valid_0's binary_logloss: 0.468066
[474]	valid_0's b