In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [27]:
train = pd.read_csv(r"D:\Data Science Course\Projects\Analytics vidya\Cross Sales Prediction\train.csv")
test = pd.read_csv(r"D:\Data Science Course\Projects\Analytics vidya\Cross Sales Prediction\test.csv")
sample_sub = pd.read_csv(r"D:\Data Science Course\Projects\Analytics vidya\Cross Sales Prediction\sample_submission.csv")

In [28]:
print('Shape of train: {}'.format(train.shape))
print('Shape of test: {}'.format(test.shape))

Shape of train: (381109, 12)
Shape of test: (127037, 11)


---
# Data Merging

In [29]:
train['is_train'] = 1
test['is_train'] = 0
test['Response'] = None

In [30]:
data = pd.concat((train,test))
data.set_index('id',inplace=True)

In [31]:
data.shape

(508146, 12)

---
# Machine Learning Pipeline

### Missing Values

In [32]:
data.isna().sum()/data.shape[0]*100

Gender                   0.000000
Age                      0.000000
Driving_License          0.000000
Region_Code              0.000000
Previously_Insured       0.000000
Vehicle_Age              0.000000
Vehicle_Damage           0.000000
Annual_Premium           0.000000
Policy_Sales_Channel     0.000000
Vintage                  0.000000
Response                25.000098
is_train                 0.000000
dtype: float64

In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 508146 entries, 1 to 508146
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Gender                508146 non-null  object 
 1   Age                   508146 non-null  int64  
 2   Driving_License       508146 non-null  int64  
 3   Region_Code           508146 non-null  float64
 4   Previously_Insured    508146 non-null  int64  
 5   Vehicle_Age           508146 non-null  object 
 6   Vehicle_Damage        508146 non-null  object 
 7   Annual_Premium        508146 non-null  float64
 8   Policy_Sales_Channel  508146 non-null  float64
 9   Vintage               508146 non-null  int64  
 10  Response              381109 non-null  object 
 11  is_train              508146 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 50.4+ MB


In [34]:
data['Region_Code']=data['Region_Code'].astype(int)
data['Policy_Sales_Channel']=data['Policy_Sales_Channel'].astype(int)

---
# Encoding

### Label Encoding

In [35]:
label_cols = ['Gender','Vehicle_Age','Vehicle_Damage']

In [36]:
data[label_cols] = data[label_cols].apply(le.fit_transform)

In [37]:
features = ['Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage']

In [38]:
cat_col = ['Gender', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage',
       'Policy_Sales_Channel']

---
# Train Test Split for Validation

In [39]:
train_df = data[ data['is_train'] == 1].copy()
test_df = data[ data['is_train'] == 0].copy()

In [40]:
train_df.drop('is_train', axis=1,inplace=True)
test_df.drop('is_train', axis=1,inplace=True)
test_df.drop('Response', axis=1,inplace=True)

In [41]:
train_df['Response'] = train_df['Response'].astype(int)
train_df = train_df[ [col for col in train_df if col!='Response'] + ['Response']]

In [42]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_df.iloc[:,:-1], train_df['Response'], test_size = 0.30,random_state = 1999)

---
# Validation

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,BaggingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, confusion_matrix,roc_auc_score

## Model Dict

In [50]:
model_dict = {}

model_dict['LogisticRegression'] = LogisticRegression(max_iter = 200, solver='saga')
model_dict['DecisionTreeClassifier'] = DecisionTreeClassifier()
model_dict['RandomForestClassifier'] = RandomForestClassifier()
model_dict['AdaBoostClassifier'] = AdaBoostClassifier()
model_dict['BaggingClassifier'] = BaggingClassifier()
model_dict['XGBoost'] = XGBClassifier()
model_dict['LGBM'] = LGBMClassifier()
model_dict['Cat'] = CatBoostClassifier(verbose=False)

In [45]:
def model_test(X_train, X_test, y_train, y_test,model,model_name):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    proba1 = model.predict_proba(X_test)[:,1]
    print('======================================{}======================================='.format(model_name))
    print('Accuracy is : {}'.format(accuracy))
    print('ROC_AUC_SCORE is:{}'.format(roc_auc_score(y_test,proba1)))
    print()

In [46]:
for model_name,model in model_dict.items():
    model_test(X_train, X_test, y_train, y_test, model, model_name)



Accuracy is : 0.8770083877795563
ROC_AUC_SCORE is:0.5173841540541616

Accuracy is : 0.8227021070032274
ROC_AUC_SCORE is:0.5978041514323094

Accuracy is : 0.8657780343382925
ROC_AUC_SCORE is:0.8340700176423188

Accuracy is : 0.8769996413983714
ROC_AUC_SCORE is:0.8520878852813213

Accuracy is : 0.8608538217312587
ROC_AUC_SCORE is:0.7919947062930374

Accuracy is : 0.876326170047143
ROC_AUC_SCORE is:0.8562897959983147

Accuracy is : 0.8768247137746757
ROC_AUC_SCORE is:0.8575079150351076

Learning rate set to 0.111918
0:	learn: 0.5128497	total: 35.5ms	remaining: 35.4s
1:	learn: 0.4337789	total: 66.7ms	remaining: 33.3s
2:	learn: 0.3701818	total: 97.3ms	remaining: 32.3s
3:	learn: 0.3353979	total: 126ms	remaining: 31.5s
4:	learn: 0.3143692	total: 159ms	remaining: 31.6s
5:	learn: 0.3013165	total: 189ms	remaining: 31.4s
6:	learn: 0.2932312	total: 222ms	remaining: 31.5s
7:	learn: 0.2877139	total: 252ms	remaining: 31.2s
8:	learn: 0.2835222	total: 281ms	remaining: 31s
9:	learn: 0.2803415	total: 310

144:	learn: 0.2627589	total: 4.29s	remaining: 25.3s
145:	learn: 0.2627350	total: 4.32s	remaining: 25.2s
146:	learn: 0.2627009	total: 4.35s	remaining: 25.2s
147:	learn: 0.2626718	total: 4.38s	remaining: 25.2s
148:	learn: 0.2626514	total: 4.41s	remaining: 25.2s
149:	learn: 0.2626228	total: 4.44s	remaining: 25.2s
150:	learn: 0.2625982	total: 4.47s	remaining: 25.2s
151:	learn: 0.2625755	total: 4.51s	remaining: 25.1s
152:	learn: 0.2625516	total: 4.53s	remaining: 25.1s
153:	learn: 0.2625268	total: 4.57s	remaining: 25.1s
154:	learn: 0.2625018	total: 4.6s	remaining: 25.1s
155:	learn: 0.2624776	total: 4.63s	remaining: 25s
156:	learn: 0.2624507	total: 4.66s	remaining: 25s
157:	learn: 0.2624243	total: 4.69s	remaining: 25s
158:	learn: 0.2623957	total: 4.72s	remaining: 25s
159:	learn: 0.2623758	total: 4.75s	remaining: 24.9s
160:	learn: 0.2623545	total: 4.77s	remaining: 24.9s
161:	learn: 0.2623287	total: 4.8s	remaining: 24.8s
162:	learn: 0.2623091	total: 4.83s	remaining: 24.8s
163:	learn: 0.2622849	

308:	learn: 0.2593646	total: 9.16s	remaining: 20.5s
309:	learn: 0.2593527	total: 9.19s	remaining: 20.4s
310:	learn: 0.2593288	total: 9.22s	remaining: 20.4s
311:	learn: 0.2593085	total: 9.24s	remaining: 20.4s
312:	learn: 0.2592935	total: 9.28s	remaining: 20.4s
313:	learn: 0.2592736	total: 9.31s	remaining: 20.3s
314:	learn: 0.2592492	total: 9.34s	remaining: 20.3s
315:	learn: 0.2592295	total: 9.37s	remaining: 20.3s
316:	learn: 0.2592119	total: 9.4s	remaining: 20.3s
317:	learn: 0.2591953	total: 9.43s	remaining: 20.2s
318:	learn: 0.2591667	total: 9.46s	remaining: 20.2s
319:	learn: 0.2591426	total: 9.49s	remaining: 20.2s
320:	learn: 0.2591254	total: 9.53s	remaining: 20.2s
321:	learn: 0.2591036	total: 9.56s	remaining: 20.1s
322:	learn: 0.2590835	total: 9.59s	remaining: 20.1s
323:	learn: 0.2590712	total: 9.62s	remaining: 20.1s
324:	learn: 0.2590566	total: 9.65s	remaining: 20s
325:	learn: 0.2590330	total: 9.68s	remaining: 20s
326:	learn: 0.2590232	total: 9.7s	remaining: 20s
327:	learn: 0.259008

471:	learn: 0.2567009	total: 14.5s	remaining: 16.2s
472:	learn: 0.2566939	total: 14.5s	remaining: 16.2s
473:	learn: 0.2566780	total: 14.6s	remaining: 16.2s
474:	learn: 0.2566665	total: 14.6s	remaining: 16.1s
475:	learn: 0.2566491	total: 14.6s	remaining: 16.1s
476:	learn: 0.2566366	total: 14.7s	remaining: 16.1s
477:	learn: 0.2566213	total: 14.7s	remaining: 16.1s
478:	learn: 0.2566099	total: 14.8s	remaining: 16.1s
479:	learn: 0.2565918	total: 14.8s	remaining: 16s
480:	learn: 0.2565781	total: 14.8s	remaining: 16s
481:	learn: 0.2565638	total: 14.9s	remaining: 16s
482:	learn: 0.2565531	total: 14.9s	remaining: 16s
483:	learn: 0.2565361	total: 15s	remaining: 15.9s
484:	learn: 0.2565188	total: 15s	remaining: 15.9s
485:	learn: 0.2565025	total: 15s	remaining: 15.9s
486:	learn: 0.2564832	total: 15.1s	remaining: 15.9s
487:	learn: 0.2564606	total: 15.1s	remaining: 15.8s
488:	learn: 0.2564469	total: 15.1s	remaining: 15.8s
489:	learn: 0.2564270	total: 15.2s	remaining: 15.8s
490:	learn: 0.2564157	tota

634:	learn: 0.2543659	total: 20.4s	remaining: 11.7s
635:	learn: 0.2543532	total: 20.5s	remaining: 11.7s
636:	learn: 0.2543390	total: 20.5s	remaining: 11.7s
637:	learn: 0.2543245	total: 20.5s	remaining: 11.6s
638:	learn: 0.2543124	total: 20.6s	remaining: 11.6s
639:	learn: 0.2542984	total: 20.6s	remaining: 11.6s
640:	learn: 0.2542850	total: 20.6s	remaining: 11.5s
641:	learn: 0.2542752	total: 20.6s	remaining: 11.5s
642:	learn: 0.2542643	total: 20.7s	remaining: 11.5s
643:	learn: 0.2542446	total: 20.7s	remaining: 11.4s
644:	learn: 0.2542279	total: 20.7s	remaining: 11.4s
645:	learn: 0.2542172	total: 20.8s	remaining: 11.4s
646:	learn: 0.2542095	total: 20.8s	remaining: 11.4s
647:	learn: 0.2541983	total: 20.8s	remaining: 11.3s
648:	learn: 0.2541838	total: 20.9s	remaining: 11.3s
649:	learn: 0.2541695	total: 20.9s	remaining: 11.3s
650:	learn: 0.2541619	total: 20.9s	remaining: 11.2s
651:	learn: 0.2541478	total: 21s	remaining: 11.2s
652:	learn: 0.2541345	total: 21s	remaining: 11.2s
653:	learn: 0.25

796:	learn: 0.2521605	total: 25.7s	remaining: 6.54s
797:	learn: 0.2521371	total: 25.7s	remaining: 6.5s
798:	learn: 0.2521242	total: 25.7s	remaining: 6.47s
799:	learn: 0.2521071	total: 25.8s	remaining: 6.44s
800:	learn: 0.2520925	total: 25.8s	remaining: 6.41s
801:	learn: 0.2520816	total: 25.8s	remaining: 6.38s
802:	learn: 0.2520681	total: 25.9s	remaining: 6.35s
803:	learn: 0.2520564	total: 25.9s	remaining: 6.32s
804:	learn: 0.2520351	total: 25.9s	remaining: 6.28s
805:	learn: 0.2520237	total: 26s	remaining: 6.25s
806:	learn: 0.2520074	total: 26s	remaining: 6.22s
807:	learn: 0.2519960	total: 26s	remaining: 6.19s
808:	learn: 0.2519839	total: 26.1s	remaining: 6.16s
809:	learn: 0.2519731	total: 26.1s	remaining: 6.13s
810:	learn: 0.2519614	total: 26.1s	remaining: 6.09s
811:	learn: 0.2519526	total: 26.2s	remaining: 6.06s
812:	learn: 0.2519422	total: 26.2s	remaining: 6.03s
813:	learn: 0.2519285	total: 26.2s	remaining: 6s
814:	learn: 0.2519081	total: 26.3s	remaining: 5.96s
815:	learn: 0.2518945	

955:	learn: 0.2501145	total: 31.1s	remaining: 1.43s
956:	learn: 0.2501028	total: 31.1s	remaining: 1.4s
957:	learn: 0.2500917	total: 31.1s	remaining: 1.36s
958:	learn: 0.2500788	total: 31.2s	remaining: 1.33s
959:	learn: 0.2500599	total: 31.2s	remaining: 1.3s
960:	learn: 0.2500450	total: 31.3s	remaining: 1.27s
961:	learn: 0.2500263	total: 31.3s	remaining: 1.24s
962:	learn: 0.2500154	total: 31.3s	remaining: 1.2s
963:	learn: 0.2499987	total: 31.4s	remaining: 1.17s
964:	learn: 0.2499903	total: 31.4s	remaining: 1.14s
965:	learn: 0.2499769	total: 31.4s	remaining: 1.11s
966:	learn: 0.2499640	total: 31.5s	remaining: 1.07s
967:	learn: 0.2499510	total: 31.5s	remaining: 1.04s
968:	learn: 0.2499357	total: 31.5s	remaining: 1.01s
969:	learn: 0.2499179	total: 31.6s	remaining: 976ms
970:	learn: 0.2499052	total: 31.6s	remaining: 944ms
971:	learn: 0.2498930	total: 31.6s	remaining: 911ms
972:	learn: 0.2498802	total: 31.7s	remaining: 879ms
973:	learn: 0.2498666	total: 31.7s	remaining: 846ms
974:	learn: 0.2

---
# Final Prediction and Exporting Submission CSV

In [51]:
X_train = train_df.iloc[:,:-1]
y_train = train_df['Response']
X_test = test_df

In [52]:
def final_pred_with_csv(X_train, X_test, y_train,model,model_name,sample_sub):
    model.fit(X_train, y_train)
    proba1 = model.predict_proba(X_test)[:,1]
    submit = sample_sub.copy()
    submit['Response'] = proba1
    address = "D:\\Data Science Course\\Projects\\Analytics vidya\\Cross Sales Prediction\\" + model_name + '.csv'
    submit.to_csv(address, index=False)
    print(model_name, 'Done')

In [53]:
for model_name,model in model_dict.items():
    final_pred_with_csv(X_train, X_test, y_train,model, model_name,sample_sub)



LogisticRegression Done
DecisionTreeClassifier Done
RandomForestClassifier Done
AdaBoostClassifier Done
BaggingClassifier Done
XGBoost Done
LGBM Done
Cat Done
