In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

import optuna
import pickle

import warnings

# Suppress every Python warning for the rest of the session (this also hides
# deprecation warnings raised by the imported libraries).
warnings.filterwarnings('ignore')

In [None]:
# Load the train/test CSVs that the rest of the notebook works on.
train = pd.read_csv('../input/training-tabular-apr-2021/train2.csv')
test = pd.read_csv('../input/training-tabular-apr-2021/test2.csv')

In [None]:
# Preview the first rows of the training frame as loaded.
train.head()

In [None]:
# Preview the first rows of the test frame as loaded.
test.head()

In [None]:
# The same columns are removed from both frames so their remaining columns
# stay consistent with each other.
dropped_columns = [
    'PassengerId', 'Name', 'Cabin', 'Ticket',
    'SibSp', 'Parch', 'Embarked', 'Sex',
]
train = train.drop(columns=dropped_columns)
test = test.drop(columns=dropped_columns)

In [None]:
# Cast every remaining column to float; this raises if a column still holds
# values that cannot be converted to a number.
train, test = train.astype(float), test.astype(float)

In [None]:
# Re-inspect the training frame after the column drop and float cast.
train.head()

In [None]:
# Re-inspect the test frame after the column drop and float cast.
test.head()

In [None]:
# Select the feature columns by name rather than by position: slicing
# `train.columns[1:]` only excludes the target when 'Survived' happens to be the
# first column, and would otherwise leak it into X.
features = train.columns.drop('Survived')
X = train[features]
y = train['Survived']

# Keep 20% of the labelled rows as a hold-out set that the cross-validation
# below never trains on.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=39)

In [None]:
# Sanity check: row counts of the training and hold-out parts of the split.
len(x_train), len(y_train), len(x_test), len(y_test)

In [None]:
SEED = 5661
folds = 3
# Stratified, shuffled splitter read as a global by the tuning objective.
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)
# Empty frame that later receives the test-set predictions (column 'rf').
score = pd.DataFrame()

In [None]:
def objective(trial, x=x_train, y=y_train):
    """Optuna objective: out-of-fold ROC AUC of a random forest.

    Draws one RandomForestClassifier configuration from ``trial``, fits it on
    every split produced by the global ``kf`` and scores the out-of-fold
    probabilities stitched together over all of ``y``.

    Parameters
    ----------
    trial
        Optuna trial used to sample the hyperparameters.
    x
        Feature frame, indexed positionally with ``.iloc``. The default is
        whatever ``x_train`` was when this cell was executed, so re-run this
        cell after changing the split.
    y
        Target values aligned with ``x``; same default-binding caveat as ``x``.

    Returns
    -------
    float
        ROC AUC of the out-of-fold predictions.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'criterion': 'entropy',
        'max_depth': trial.suggest_int('max_depth', 3, 200),
        # Float values are interpreted by scikit-learn as fractions of the samples.
        'min_samples_split': trial.suggest_float('min_samples_split', 1e-4, 1e-1),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 1e-4, 1e-1),
        'max_features': trial.suggest_categorical("max_features", ['sqrt', 'log2']),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 8, 10000),
        'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 0, 0.1),
        'bootstrap': True,
        'n_jobs': -1,
        'verbose': True,
        'class_weight': trial.suggest_categorical("class_weight", ['balanced', 'balanced_subsample']),
        'max_samples': trial.suggest_float('max_samples', 0.5, 1),
        'random_state': SEED
    }

    # Each row is predicted exactly once, by the model that did not train on it.
    oof_preds = np.zeros(len(y))
    for fold_no, (fit_idx, holdout_idx) in enumerate(kf.split(x, y), start=1):
        print("--> Fold {}".format(fold_no))
        x_fit, y_fit = x.iloc[fit_idx], y.iloc[fit_idx]
        x_hold, y_hold = x.iloc[holdout_idx], y.iloc[holdout_idx]

        forest = RandomForestClassifier(**params).fit(x_fit, y_fit)
        fit_proba = forest.predict_proba(x_fit)[:, 1]
        hold_proba = forest.predict_proba(x_hold)[:, 1]
        oof_preds[holdout_idx] = hold_proba

        fit_auc = roc_auc_score(y_fit, fit_proba)
        hold_auc = roc_auc_score(y_hold, hold_proba)
        print('Fold {} AUC Train: {} Validation: {}'.format(fold_no, fit_auc, hold_auc))

    oof_auc = roc_auc_score(y, oof_preds)
    print('OOF AUC: {}'.format(oof_auc))
    return oof_auc

In [None]:
# Seed the TPE sampler (Optuna's default sampler) so the 20-trial search gives
# the same sequence of suggestions after a kernel restart.
study = optuna.create_study(
    study_name="optimization",
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=20)

In [None]:
trial = study.best_trial
# The objective returns the out-of-fold ROC AUC, so label the value as AUC
# rather than accuracy.
print('Best OOF AUC: {}'.format(trial.value))

In [None]:
# Show the sampled hyperparameter values of the best trial (fixed settings such
# as 'criterion' or 'bootstrap' are not part of trial.params).
print("Best hyperparameters: {}".format(trial.params))

In [None]:
# Objective value per trial, to see how the search progressed.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Objective value against each sampled hyperparameter.
optuna.visualization.plot_slice(study)

In [None]:
# Materialise the trial table once, then both print it and persist it.
trials_df = study.trials_dataframe()
print(trials_df)
trials_df.to_csv("trial_parameters.csv", index=False)
# Optional: persist the whole study object as well.
# with open('./study.pickle', 'wb') as f:
#     pickle.dump(study, f)

In [None]:
# Hard-coded random-forest settings used for the final model.
# NOTE(review): presumably copied from an earlier tuning run — confirm against
# the saved trial_parameters.csv. The same dict is defined again a few cells below.
para = dict(
    n_estimators=898,
    criterion='entropy',
    max_depth=8,
    max_leaf_nodes=9058,
    min_samples_split=0.0671810090247945,
    min_samples_leaf=0.04742472303688006,
    max_features='sqrt',
    min_impurity_decrease=0.00010583321874846287,
    bootstrap=True,
    n_jobs=-1,
    verbose=True,
    class_weight='balanced_subsample',
    max_samples=0.8634669615516827,
    random_state=SEED,
)

In [None]:
SEED = 5661
folds = 5
# Rebind the global splitter to 5 stratified, shuffled folds for the final fit.
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)
# Start from an empty frame again; the test predictions are stored under 'rf'.
score = pd.DataFrame()

In [None]:
# Final RandomForestClassifier configuration, passed as **para in the CV loop.
para = {
    # Forest size and tree shape.
    'n_estimators': 898,
    'criterion': 'entropy',
    'max_depth': 8,
    'max_leaf_nodes': 9058,
    # Split constraints (floats are fractions of the training samples).
    'min_samples_split': 0.0671810090247945,
    'min_samples_leaf': 0.04742472303688006,
    'max_features': 'sqrt',
    'min_impurity_decrease': 0.00010583321874846287,
    # Sampling, parallelism and logging.
    'bootstrap': True,
    'n_jobs': -1,
    'verbose': True,
    'class_weight': 'balanced_subsample',
    'max_samples': 0.8634669615516827,
    'random_state': SEED,
}

In [None]:
# rf_train_preds: out-of-fold probabilities for the training split.
# rf_test_preds:  fold-averaged probabilities for the hold-out split (x_test).
# rf_TEST_preds:  fold-averaged probabilities for the unlabelled `test` frame.
rf_train_preds = np.zeros(len(y_train))
rf_test_preds = np.zeros(len(y_test))
rf_TEST_preds = np.zeros(len(test))
for fold, (train_ind, val_ind) in enumerate(kf.split(x_train, y_train)):
    print("--> Fold {}".format(fold + 1))
    xtrain, xval = x_train.iloc[train_ind], x_train.iloc[val_ind]
    ytrain, yval = y_train.iloc[train_ind], y_train.iloc[val_ind]
    rf = RandomForestClassifier(**para)

    model = rf.fit(xtrain, ytrain)
    pred_train = model.predict_proba(xtrain)[:, 1]
    pred_val = model.predict_proba(xval)[:, 1]
    pred_test = model.predict_proba(x_test)[:, 1]
    pred_TEST = model.predict_proba(test)[:, 1]
    rf_train_preds[val_ind] = pred_val
    # Each fold contributes 1/folds, so the sums end up as the mean over folds.
    rf_test_preds += pred_test / folds
    rf_TEST_preds += pred_TEST / folds
    score1 = roc_auc_score(ytrain, pred_train)
    score2 = roc_auc_score(yval, pred_val)
    score3 = roc_auc_score(y_test, pred_test)
    # Report the hold-out AUC alongside train/validation so the value computed
    # in score3 is actually visible per fold.
    print('Fold {} AUC Train: {:.2f} Validation: {:.2f} Test: {:.2f}'.format(fold + 1, score1, score2, score3))


In [None]:
# AUC of the out-of-fold training predictions and of the fold-averaged
# hold-out predictions.
oof_auc = roc_auc_score(y_train, rf_train_preds)
holdout_auc = roc_auc_score(y_test, rf_test_preds)
print('OOF AUC Train: {:.2f} Test: {:.2f}'.format(oof_auc, holdout_auc))

# Keep the averaged probabilities for the unlabelled test frame.
score['rf'] = rf_TEST_preds

In [None]:
# `test` had its PassengerId column dropped earlier, so read another test CSV;
# only its PassengerId column is used when building the submission.
# NOTE(review): assumes this file has the same rows, in the same order, as
# test2.csv — confirm.
test_=pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')

In [None]:
# Build the submission file: one row per PassengerId with a hard 0/1 label.
# Column assignment aligns on the index, so a row-count mismatch between the ID
# file and the predictions would silently yield wrong or missing labels; fail
# loudly instead.
if len(test_) != len(score):
    raise ValueError(
        'Row count mismatch: {} passenger ids vs {} predictions'.format(len(test_), len(score))
    )

df = pd.DataFrame()
df['PassengerId'] = test_['PassengerId'].values
# Vectorised threshold: probability >= 0.5 -> 1, otherwise 0.
df['Survived'] = (score['rf'].values >= 0.5).astype(int)
df.to_csv('./rf_tuned.csv', index=False)