## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
train = train_data.copy()
test = test_data.copy()

In [4]:
def preprocessing(data):
    
    #Dropping null values
    data = data.dropna()
    
    #Dropping id
    data = data.drop('id', axis = 1)
    
    #Columns to get dummies
    cols = ['Gender', 'Vehicle_Damage', 'Vehicle_Age']
    
    #Changing categories into dummies
    data_dum = pd.get_dummies(data = data , columns = cols, drop_first = True )
    
    #We don't need this column as it has almost no correlation with our dependent variable
    data_dum = data_dum.drop('Vintage', axis = 1)
    
    return data_dum

In [5]:
#Preprocessing training data
train_dum = preprocessing(train)

#Preprocessing test data
test_dum = preprocessing(test)

### Feature Selection

We will do a chi-square test and remove all the variables columns with a significance score of less than 0.5.

**The Chi-Square Test of Independence determines whether there is an association between categorical variables (i.e., whether the variables are independent or related). It is a nonparametric test.**

Formula for chi-square:

![chi2.png](attachment:chi2.png)

In [6]:
from sklearn.feature_selection import SelectKBest, chi2

x = train_dum.drop('Response', axis = 1)
y = train_dum['Response']

best_feature = SelectKBest(score_func= chi2, k = 'all')
best_feature = best_feature.fit(x,y)

col_scores = pd.DataFrame((best_feature.scores_).round(2))
col_names = pd.DataFrame(x.columns)

feature_score = pd.concat([col_names, col_scores], axis=1)
feature_score.columns = ['attribute', 'score']
feature_score

Unnamed: 0,attribute,score
0,Age,29179.11
1,Driving_License,0.08
2,Region_Code,282.41
3,Previously_Insured,24033.83
4,Annual_Premium,1882769.28
5,Policy_Sales_Channel,193218.44
6,Gender_Male,481.3
7,Vehicle_Damage_Yes,23700.04
8,Vehicle_Age_< 1 Year,9528.78
9,Vehicle_Age_> 2 Years,4361.7


In [7]:
"""
First getting the attributes whose scores are less than 0.5
Then extracting that column name passing into a variable.

"""
cols_to_omit = (feature_score[feature_score['score']<0.5].values)[:,0]

In [8]:
train_dum = train_dum.drop(cols_to_omit, axis = 1)
test_dum = test_dum.drop(cols_to_omit, axis = 1)

In [9]:
train_dum.shape, test_dum.shape

((381109, 10), (127037, 9))

In [10]:
#Assigning inputs and targets

inputs = train_dum.drop('Response', axis = 1)
targets = train_dum['Response'] 

x_test = test_dum.copy()

#### We will use stratified K-Fold from sklearn as this is a highly imbalanced dataset

**Stratified K-Folds cross-validator. Provides train/test indices to split data in train/test sets. This cross-validation object is a variation of KFold that returns stratified folds. The folds are made by preserving the percentage of samples for each class. Parameters n_splitsint, default=5.**

In [11]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits = 10, random_state = 42)

for train_idx, val_idx in skf.split(inputs, targets):
    x_train, x_val = inputs.iloc[train_idx], inputs.iloc[val_idx]
    y_train, y_val = targets.iloc[train_idx], targets.iloc[val_idx]

In [12]:
#Scaling all input data
    
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

x_train_scaled = scaler.fit_transform(x_train)

#Will use only transform for validation and test data as we don't want any data leakage
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)

In [15]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

lgbm = LGBMClassifier(num_leaves = 30, max_depth = 5, n_estimators = 550, learning_rate = 0.05, objective = 'binary', 
                      lambda_l2 = 20,
                      max_bin = 100, metric = 'auc', is_unbalance = True, random_state = None, n_jobs = -1)
lgbm.fit(x_train_scaled,y_train)
y_val_pred = lgbm.predict_proba(x_val_scaled)[:,1]
print(roc_auc_score(y_val, y_val_pred))

0.8549038725147512


In [14]:
vdvdf

NameError: name 'vdvdf' is not defined

In [16]:
y_pred = lgbm.predict_proba(x_test_scaled)[:,1]

In [17]:
my_submission = pd.DataFrame({'id': test.id, 'Response': y_pred})
# you could use any filename. We choose submission here
my_submission.to_csv('submission_lgbm.csv', index=False)


In [18]:
my_submission['Response'].shape

(127037,)