In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc,os,re
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
def load_data(data_dir='../input/tabular-playground-series-mar-2021'):
    """Read the three competition CSV files into DataFrames.

    Parameters
    ----------
    data_dir : str or os.PathLike, optional
        Directory containing ``train.csv``, ``test.csv`` and
        ``sample_submission.csv``. Defaults to the path the notebook
        originally hard-coded, so existing ``load_data()`` calls are unchanged.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, submission)`` in that order.
    """
    train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
    submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))
    return train, test, submission

In [None]:
# Only the training frame is kept here; the test and sample-submission
# frames returned by load_data() are discarded.
train,_,_ = load_data()
# Show the first row to check the column layout.
train.head(1)

In [None]:
# Number of rows per distinct value of the label column.
train['target'].value_counts()

In [None]:
# Treat every column that pandas loaded with the generic `object` dtype
# as a categorical feature.
cat_features = [name for name in train.columns if train[name].dtype == 'object']
print(cat_features)

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column from the output.
train_encoded = pd.get_dummies(
    data=train,
    columns=cat_features,
    drop_first=True,
)
train_encoded

In [None]:
from sklearn.feature_selection import mutual_info_classif

def make_mi_scores(X, y):
    """Compute mutual-information scores between each feature and the target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. It is copied, so the caller's frame is not modified.
    y : array-like
        Target labels, passed straight to ``mutual_info_classif``.

    Returns
    -------
    pandas.Series
        Series named ``"MI Scores"``, indexed by feature name and sorted in
        descending order of score.
    """
    X = X.copy()
    # Label-encode text / categorical columns so they become integer codes.
    for colname in X.select_dtypes(["object", "category"]):
        X[colname] = X[colname].factorize()[0]
    # One-hot columns may arrive as `bool` (pd.get_dummies defaults to bool in
    # recent pandas). is_integer_dtype() is False for bool, which would make
    # every dummy column be treated as continuous, so cast them to integers.
    for colname in X.select_dtypes("bool"):
        X[colname] = X[colname].astype("uint8")
    # All discrete features should now have integer dtypes
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_classif(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

# Work on a separate frame without the row identifier so that train_encoded
# itself stays intact.
train_data = train_encoded.drop(columns=["id"])
# pop() removes the label column from train_data and returns it.
y = train_data.pop('target')
X = train_data
scores = make_mi_scores(X, y)
scores

In [None]:
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of mutual-information scores.

    Bars are ordered from lowest score (bottom) to highest. Features whose
    name starts with ``"PROBE"`` are coloured ``"C3"``; all others ``"C0"``.

    Parameters
    ----------
    scores : pandas.Series
        Scores indexed by feature name (string labels).
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    labels = list(ordered.index)
    # Boolean mask of probe features, turned into a per-bar colour array.
    is_probe = np.array([name.startswith("PROBE") for name in labels], dtype=bool)
    bar_colors = np.where(is_probe, "C3", "C0")
    # Create plot
    plt.figure(figsize=(50,50))
    plt.barh(positions, ordered, color=bar_colors)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores\n")
    
# Chart the scores computed by make_mi_scores() in the previous cell.
plot_mi_scores(scores)

In [None]:
# How many features have an MI score above the 0.01 cut-off.
print("selected features:", (scores > 0.01).sum())

In [None]:
# Names of the features whose MI score exceeds the 0.01 cut-off,
# in descending-score order (scores is already sorted).
select_features = scores.loc[scores > 0.01].index.tolist()
print("selected important festures:\n",select_features)

In [None]:
# Restrict the encoded training frame to the selected features.
train2 = train_encoded[select_features]
X = train2
# Read the label from train_encoded instead of train.pop('target'): pop()
# removes the column from `train`, so running this cell a second time would
# raise KeyError and any later use of `train` would silently lack the label.
y = train_encoded['target']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. A fixed random_state keeps the
# split (and every metric computed from it) identical across kernel restarts.
train_x, test_x, train_y, test_y = train_test_split(
    X, y, train_size=0.8, random_state=42
)
print(train_x.shape,train_y.shape)

In [None]:
# Shapes of the hold-out features and labels.
print(*(part.shape for part in (test_x, test_y)))

In [None]:
# Force a garbage-collection pass before fitting the first model; the
# number of unreachable objects found is shown as the cell output.
gc.collect()

### Baseline model - Random Forest Classifier

In [None]:
%%time

from sklearn.ensemble import RandomForestClassifier

# Baseline with default hyperparameters. random_state makes the forest
# reproducible between runs; n_jobs=-1 builds the trees on all available
# cores without changing the fitted model.
model1 = RandomForestClassifier(n_jobs=-1, random_state=42)
model1.fit(train_x,train_y)

In [None]:
from sklearn.metrics import accuracy_score

# Hold-out accuracy of the random-forest baseline, rounded to 5 decimals.
print(
    "Random Forest Model Accuracy",
    round(accuracy_score(y_true=test_y, y_pred=model1.predict(test_x)), 5),
)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the random-forest baseline on the hold-out rows.
print(
    confusion_matrix(y_true=test_y, y_pred=model1.predict(test_x))
)

In [None]:
# Force a garbage-collection pass before moving on to CatBoost.
gc.collect()

### CatBoost Classifier

In [None]:
!pip install catboost -q
from catboost import CatBoostClassifier, Pool, cv

In [None]:
%%time

# CatBoost with default hyperparameters, additionally tracking Accuracy.
# NOTE(review): the constructor asks for 'Silent' logging while fit() asks
# for 'Verbose' — presumably the fit-time value wins; confirm.
model2 = CatBoostClassifier(
    logging_level='Silent',
    custom_loss=['Accuracy'],
)
# The hold-out split is passed as the evaluation set during training.
model2.fit(
    train_x,
    train_y,
    eval_set=(test_x, test_y),
    logging_level='Verbose',
    plot=True,
)

In [None]:
# Reuse model2's parameters for cross-validation, with the loss function
# set explicitly to Logloss.
cv_params = model2.get_params()
cv_params['loss_function'] = 'Logloss'

# Cross-validate on the full selected-feature matrix.
cv_data = cv(
    Pool(X, y),
    cv_params,
    plot=True,
)

In [None]:
# Locate the iteration with the highest mean CV accuracy once, then report
# that accuracy, its standard deviation, and the iteration number.
acc_mean = cv_data['test-Accuracy-mean']
acc_std = cv_data['test-Accuracy-std']
best_step = np.argmax(acc_mean)
print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(acc_mean), acc_std[best_step], best_step
))

In [None]:
# Same best mean CV accuracy as above, printed without rounding.
print(
    'Precise validation accuracy score: {}'.format(
        np.max(cv_data['test-Accuracy-mean'])
    )
)

In [None]:
# Confusion matrix of the default CatBoost model on the hold-out rows.
print(
    confusion_matrix(y_true=test_y, y_pred=model2.predict(test_x))
)

In [None]:
# Hold-out accuracy of the default CatBoost model, rounded to 5 decimals.
print(
    "Catboost Model Accuracy",
    round(accuracy_score(y_true=test_y, y_pred=model2.predict(test_x)), 5),
)

In [None]:
# Force a garbage-collection pass before the hyperparameter search.
gc.collect()

### Hyperparameter Tuning

In [None]:
%%time

!pip install hyperopt -q

In [None]:
import hyperopt

def hyperopt_objective(params):
    """Objective for hyperopt: ``1 - best mean CV accuracy``.

    Parameters
    ----------
    params : dict
        Must contain ``'l2_leaf_reg'`` (truncated to ``int``) and
        ``'learning_rate'``.

    Returns
    -------
    float
        One minus the highest ``'test-Accuracy-mean'`` value reported by
        CatBoost's ``cv`` on the notebook-level ``X`` / ``y``.
    """
    # The classifier is only instantiated to obtain its parameter dict for cv().
    candidate = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        loss_function='Logloss',
        eval_metric='Accuracy',
        iterations=1000,
        random_seed=41,
        verbose=False,
    )
    fold_metrics = cv(Pool(X, y), candidate.get_params())
    # hyperopt minimises, so turn the accuracy into a loss.
    return 1 - np.max(fold_metrics['test-Accuracy-mean'])

In [None]:
from numpy.random import RandomState

# Search space: l2_leaf_reg is drawn log-uniformly and quantised to steps of
# 1; learning_rate is drawn uniformly from [1e-3, 5e-1].
params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
}

trials = hyperopt.Trials()

best = hyperopt.fmin(
    hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=3,
    trials=trials,
    # Recent hyperopt releases call rstate.integers(), which only exists on
    # numpy's Generator API; the legacy RandomState raises AttributeError.
    rstate=np.random.default_rng(123)
)

# `best` now holds the parameters found by the search. The following cell
# pins its own hard-coded values before training the final model.
print(best)

In [None]:
# Hard-coded hyperparameters; they replace whatever the hyperopt search in
# the previous cell stored in `best`.
best = dict(l2_leaf_reg=3.0, learning_rate=0.16129990013229004)

model = CatBoostClassifier(
    l2_leaf_reg=int(best['l2_leaf_reg']),
    learning_rate=best['learning_rate'],
    loss_function='Logloss',
    eval_metric='Accuracy',
    iterations=500,
    random_seed=42,
    verbose=True,
)

# Cross-validate the pinned configuration on the full selected-feature matrix.
cv_data = cv(
    Pool(X, y),
    model.get_params(),
)

In [None]:
# Best mean CV accuracy of the tuned configuration, printed without rounding.
print(
    'Precise validation accuracy score: {}'.format(
        np.max(cv_data['test-Accuracy-mean'])
    )
)

In [None]:
# Train the tuned model on all rows of the selected-feature matrix
# (no hold-out is kept back here).
model.fit(X, y)

In [None]:
# Re-read the CSV files and keep only the test frame.
_,test,_ = load_data()
# Show the first two rows to check the column layout.
test.head(2)

In [None]:
# Encode the test set with exactly the category levels seen in training.
# Encoding test on its own lets get_dummies derive the levels (and the level
# removed by drop_first=True) from the test data, so a level missing from or
# new to the test set would yield missing or mis-aligned dummy columns and
# break test_encoded[select_features].
test_categorical = test.assign(**{
    col: pd.Categorical(test[col], categories=pd.Categorical(train[col]).categories)
    for col in cat_features
})
test_encoded = pd.get_dummies(test_categorical, columns=cat_features, drop_first=True)
test_encoded.head(2)

### CatBoost without hyperparameter tuning

In [None]:
# Re-read the CSV files and keep only the sample-submission frame.
_,_,submission = load_data()
# Predictions of the default CatBoost model (model2) on the selected features.
# NOTE(review): predict() presumably returns class labels; if the competition
# is scored on probabilities, predict_proba may be intended — confirm.
submission['target'] = model2.predict(test_encoded[select_features])
# Written to 'submission.csv' — the tuned-model cell further down writes to
# the same file name and will overwrite this output.
submission.to_csv('submission.csv', index=False)

### CatBoost with hyperparameter tuning

In [None]:
# Re-read the CSV files and keep only the sample-submission frame.
_,_,submission = load_data()
# Predictions of the tuned CatBoost model on the selected features.
submission['target'] = model.predict(test_encoded[select_features])
# Overwrites the 'submission.csv' produced by the untuned-model cell above.
submission.to_csv('submission.csv', index=False)