In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import fbeta_score
from sklearn.model_selection import RandomizedSearchCV, cross_val_predict, train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the labelled training set and the test set from CSV.
# NOTE(review): these are absolute "/content/..." paths; the notebook only runs
# where the files exist at exactly these locations.
cd_train = pd.read_csv(filepath_or_buffer="/content/carvan_train.csv")
cd_test = pd.read_csv(filepath_or_buffer="/content/carvan_test.csv")

In [None]:
# Name of the label column in the training frame.
target = "V86"


In [None]:
# Separate the label column from the predictors; cd_train itself is left intact.
y_train = cd_train[target]
x_train = cd_train.drop(columns=[target])

In [None]:
# Identify the categorical features from the data dictionary: every column
# whose name contains the substring "L0" or "L2".
# NOTE(review): the label column is named "V86"; if the predictors follow the
# same V-prefixed naming, this filter selects nothing — confirm against the
# CSV header.
categorical_features = [
    col
    for col in x_train.columns
    if any(tag in col for tag in ("L0", "L2"))
]

In [None]:
# One-hot encode the selected categorical columns in both frames.
# drop_first=True drops the first level of each encoded column.
x_train = pd.get_dummies(
    data=x_train,
    columns=categorical_features,
    drop_first=True,
)
x_test = pd.get_dummies(
    data=cd_test,
    columns=categorical_features,
    drop_first=True,
)

In [None]:
# Keep only the columns present in both frames (inner join on the column axis)
# so train and test expose the same feature set.
x_train, x_test = x_train.align(other=x_test, join="inner", axis=1)

In [None]:
# Oversample the training data with SMOTE (seeded for reproducibility).
# NOTE(review): the resampled data is later fed to a cross-validated search;
# resampling before the CV split can let synthetic points derived from
# validation-fold rows into the training folds — consider resampling inside
# each fold (e.g. an imblearn Pipeline) instead.
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(X=x_train, y=y_train)

In [None]:
# Fit the scaler on the resampled training data only, then apply the same
# transformation to both the training data and the test features.
scaler = StandardScaler()
scaler.fit(x_train_smote)
x_train_smote = scaler.transform(x_train_smote)
x_test_scaled = scaler.transform(x_test)


In [None]:
# Base estimator for the hyper-parameter search; seeded so runs are repeatable.
rf = RandomForestClassifier(random_state=42)


In [None]:
# Search space sampled by the randomized hyper-parameter search.
# class_weight must be "balanced" (or None): the previous value "balances" is
# not accepted by RandomForestClassifier, so every sampled candidate using it
# failed to fit and was scored as NaN.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ["balanced", None],
}

In [None]:
# Randomized search: 50 sampled parameter settings, 5-fold CV, ranked by ROC AUC.
# n_jobs=-1 runs the fits in parallel on all available cores.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=50,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search.fit(x_train_smote, y_train_smote)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [None]:
# Estimator refitted on the full training data with the best parameters found.
best_model = random_search.best_estimator_

In [None]:
# Choose the probability cutoff that maximises F-beta (beta=2, recall-weighted).
# The probabilities are out-of-fold: each row is scored by a clone of the best
# model that never saw that row during fitting. Scoring the fitted forest on
# its own training rows yields near-perfect probabilities, which makes both the
# chosen cutoff and the reported F-beta overly optimistic.
train_probs = cross_val_predict(
    best_model,
    x_train_smote,
    y_train_smote,
    cv=5,
    method="predict_proba",
    n_jobs=-1,
)[:, -1]  # last column = probability of the highest class label

# Candidate cutoffs: 0.001, 0.002, ..., 0.999.
cutoffs = np.linspace(0.001, 0.999, 999)
fbetas = [
    fbeta_score(y_train_smote, (train_probs > cutoff).astype(int), beta=2)
    for cutoff in cutoffs
]
# np.argmax returns the first maximum, so ties resolve to the lowest cutoff.
optimal_cutoff = cutoffs[np.argmax(fbetas)]

In [None]:
# Score the scaled test features and threshold the last-column probability at
# the cutoff selected above to obtain 0/1 predictions.
test_probs = best_model.predict_proba(x_test_scaled)[:, -1]
predictions = np.greater(test_probs, optimal_cutoff).astype(int)


In [None]:
# Write the predictions as a single-column CSV (no index column).
submission = pd.DataFrame(data={'V86': predictions})
submission.to_csv(path_or_buf='sample_submission.csv', index=False)


In [None]:
# Report the selected cutoff and the best F-beta found over the cutoff sweep.
print("optimal Cutoff:", optimal_cutoff)
print("Best f-beta Score on Train:", max(fbetas))

optimal Cutoff: 0.355
Besr f-beta Score on Train: 0.9892484835276597
