In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [2]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

In [3]:
train = pd.read_csv("./playground-series-s4e7/train.csv")
test = pd.read_csv("./playground-series-s4e7/test.csv")

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11504798 entries, 0 to 11504797
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   Gender                object 
 2   Age                   int64  
 3   Driving_License       int64  
 4   Region_Code           float64
 5   Previously_Insured    int64  
 6   Vehicle_Age           object 
 7   Vehicle_Damage        object 
 8   Annual_Premium        float64
 9   Policy_Sales_Channel  float64
 10  Vintage               int64  
 11  Response              int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 1.0+ GB


In [5]:
train.isna().sum()

id                      0
Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age             0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
Response                0
dtype: int64

In [6]:
x_train = train.drop(columns=["Response", "id"], axis=1)
y_train = train["Response"]
x_test = test.drop(columns=["id"], axis=1)

In [7]:
x_train

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187
1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288
2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254
3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76
4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294
...,...,...,...,...,...,...,...,...,...,...
11504793,Male,48,1,6.0,0,1-2 Year,Yes,27412.0,26.0,218
11504794,Female,26,1,36.0,0,< 1 Year,Yes,29509.0,152.0,115
11504795,Female,29,1,32.0,1,< 1 Year,No,2630.0,152.0,189
11504796,Female,51,1,28.0,0,1-2 Year,Yes,48443.0,26.0,274


In [8]:
category_columns = x_train.select_dtypes(include="object").columns
category_columns = category_columns.tolist()

def label_encoding(df):
    # Instance of LabelEncoder
    label_encoders = {col: LabelEncoder() for col in category_columns}

    # Label Encoding
    for col in category_columns:
    
        df[col] = label_encoders[col].fit_transform(df[col])

label_encoding(x_train)
label_encoding(x_test)

In [9]:
X_train, X_val, y_train, y_val= train_test_split(x_train,y_train, test_size = 0.2, random_state=42)

In [10]:
y_train.value_counts()/y_train.shape[0]

0    0.877021
1    0.122979
Name: Response, dtype: float64

In [12]:
y_val.value_counts()/y_val.shape[0]

0    0.876928
1    0.123072
Name: Response, dtype: float64

In [45]:
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_val_std =scaler.transform(X_val)
X_test_std = scaler.transform(x_test)

In [46]:
print('X_train shape : ',X_train_std.shape)
print('X_val shape : ',X_val_std.shape)
print('y_train shape : ',y_train.shape)
print('y_val shape : ',y_val.shape)
print('X_test shape : ',X_test_std.shape)

X_train shape :  (9203838, 10)
X_val shape :  (2300960, 10)
y_train shape :  (9203838,)
y_val shape :  (2300960,)
X_test shape :  (7669866, 10)


In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

# Define the parameter space
space = {
    'criterion': hp.choice('criterion', ['entropy', 'gini']),
    'max_depth': hp.quniform('max_depth', 10, 12, 1),
    'max_features': hp.choice('max_features', ['auto', 'sqrt', 'log2', None]),
    'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
    'min_samples_split': hp.uniform('min_samples_split', 0, 1),
    'n_estimators': hp.choice('n_estimators', [10, 50]),
    'class_weight': hp.choice('class_weight', [None, 'balanced'])
}

def objective(space):
    hopt = RandomForestClassifier(
        criterion=space['criterion'], 
        max_depth=int(space['max_depth']),  # Convert to int
        max_features=space['max_features'],
        min_samples_leaf=space['min_samples_leaf'],
        min_samples_split=space['min_samples_split'],
        n_estimators=space['n_estimators'],
        class_weight=space['class_weight']
    )
    
    # Use cross_val_predict to get predicted probabilities
    y_pred_prob = cross_val_predict(hopt, X_train[:20000], y_train[], cv=4, method='predict_proba')[:, 1]
    
    # Calculate the AUC
    auc = roc_auc_score(y_train, y_pred_prob)
    
    # Since we want to maximize AUC, we minimize the negative AUC
    return {'loss': -auc, 'status': STATUS_OK}
    
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=20,
            trials=trials)

print(best)


  0%|                                                                           | 0/20 [00:09<?, ?trial/s, best loss=?]


KeyboardInterrupt: 

In [None]:
best

In [17]:
from sklearn.metrics import cl

In [18]:
crit = {0: 'entropy', 1: 'gini'}
feat = {0: 'auto', 1: 'sqrt', 2: 'log2', 3: None}
est = {0: 10, 1: 50, 2: 75, 3: 100, 4: 125}

trainedforest = RandomForestClassifier(criterion = crit[best['criterion']], 
                                      max_depth = int(best['max_depth']), 
                                      max_features = feat[best['max_features']], 
                                      min_samples_leaf = best['min_samples_leaf'], 
                                      min_samples_split = best['min_samples_split'], 
                                      n_estimators = est[best['n_estimators']]
                                     ).fit(X_train,y_train)

In [22]:
y_val_pred = trainedforest.predict(X_val)

In [23]:
hopt_acc = accuracy_score(y_val,y_val_pred)
print(hopt_acc)

0.8769278909672484


In [24]:
a=trainedforest.predict(x_test)

In [34]:
submit = pd.read_csv("./playground-series-s4e7/sample_submission.csv")
pred = a 
submit["Response"] = pred
submit.to_csv("submission.csv", index=False)
submit.head()

Unnamed: 0,id,Response
0,11504798,0
1,11504799,0
2,11504800,0
3,11504801,0
4,11504802,0
