In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e7/sample_submission.csv
/kaggle/input/playground-series-s4e7/train.csv
/kaggle/input/playground-series-s4e7/test.csv


### Getting aware of data

In [2]:
train = pd.read_csv("/kaggle/input/playground-series-s4e7/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s4e7/test.csv")

In [3]:
train = pd.get_dummies(train, columns=['Vehicle_Age', 'Gender', 'Vehicle_Damage'], drop_first=True, dtype=int)
test = pd.get_dummies(test, columns=['Vehicle_Age', 'Gender', 'Vehicle_Damage'], drop_first=True, dtype=int)

In [4]:
train['Age'] = train['Age'].astype('category')
test['Age'] = test['Age'].astype('category')

# List of columns to be converted to 'category' type
category_columns = ['Driving_License', 'Vehicle_Age_< 1 Year', 'Vehicle_Age_> 2 Years', 
                    'Region_Code', 'Vehicle_Damage_Yes', 'Gender_Male']

for col in category_columns:
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')

In [5]:
train.dtypes

id                          int64
Age                      category
Driving_License          category
Region_Code              category
Previously_Insured          int64
Annual_Premium            float64
Policy_Sales_Channel      float64
Vintage                     int64
Response                    int64
Vehicle_Age_< 1 Year     category
Vehicle_Age_> 2 Years    category
Gender_Male              category
Vehicle_Damage_Yes       category
dtype: object

In [6]:
# dictionary to map old column names to new column names
column_mapping = {
    'Vehicle_Age_< 1 Year': 'Vehicle_Age_less_than 1 Year',
    'Vehicle_Age_> 2 Years': 'Vehicle_Age_great_than 2 Year'
}

for old_col, new_col in column_mapping.items():
    train[new_col] = train[old_col]
    test[new_col] = test[old_col]

In [7]:
train_x = train.drop(columns=['id', 'Response', 'Vehicle_Age_< 1 Year', 'Vehicle_Age_> 2 Years'])
train_y = train.Response

test_orgl = test.drop(columns=['id', 'Vehicle_Age_< 1 Year', 'Vehicle_Age_> 2 Years'])

In [8]:
train_x

Unnamed: 0,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Gender_Male,Vehicle_Damage_Yes,Vehicle_Age_less_than 1 Year,Vehicle_Age_great_than 2 Year
0,21,1,35.0,0,65101.0,124.0,187,1,1,0,0
1,43,1,28.0,0,58911.0,26.0,288,1,1,0,1
2,25,1,14.0,1,38043.0,152.0,254,0,0,1,0
3,35,1,1.0,0,2630.0,156.0,76,0,1,0,0
4,36,1,15.0,1,31951.0,152.0,294,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
11504793,48,1,6.0,0,27412.0,26.0,218,1,1,0,0
11504794,26,1,36.0,0,29509.0,152.0,115,0,1,1,0
11504795,29,1,32.0,1,2630.0,152.0,189,0,0,1,0
11504796,51,1,28.0,0,48443.0,26.0,274,0,1,0,0


#### importing relevent library

In [9]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

In [10]:
train_x, test_x, train_y, test_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

In [11]:
scale_pos_weight = train_y.value_counts()[0] / train_y.value_counts()[1]

In [12]:
xgbc = XGBClassifier(scale_pos_weight = scale_pos_weight, enable_categorical=True, device='cuda')
xgbc.fit(train_x, train_y)



In [13]:
test_pred = xgbc.predict_proba(test_x)[:, 1]

In [14]:
test_pred

array([8.7148160e-01, 4.7253501e-01, 7.4819082e-01, ..., 4.5531220e-04,
       1.6281186e-03, 6.6928381e-01], dtype=float32)

In [15]:
# accuracy_score(test_y, test_pred)

In [16]:
roc_auc_score(test_y, test_pred)

0.8784851520461185

In [17]:
y_pred = xgbc.predict_proba(test_orgl)[:, 1]
print(y_pred)

[0.0403898  0.8364237  0.7538595  ... 0.00206943 0.91056436 0.00169365]


In [18]:
temp = pd.DataFrame(columns=['id', 'Response'])
temp['id'] = test.id
temp['Response'] = y_pred
temp

Unnamed: 0,id,Response
0,11504798,0.040390
1,11504799,0.836424
2,11504800,0.753860
3,11504801,0.000845
4,11504802,0.255106
...,...,...
7669861,19174659,0.627088
7669862,19174660,0.000852
7669863,19174661,0.002069
7669864,19174662,0.910564


In [19]:
temp.to_csv('submission.csv', index=False)