In [None]:
! pip install catboost


In [None]:
import pandas as pd 
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt 
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.metrics import accuracy_score

In [None]:
# Read the four CSV files from the working directory, in the order listed,
# and bind each resulting DataFrame to its own name.
train, test, samplesubmission, variable_definations = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train_data.csv',
        'test_data.csv',
        'sample_submission.csv',
        'VariableDescription.csv',
    )
)

In [None]:
# Report the number of missing values per column: train first, then test,
# with a row of asterisks separating the two reports.
print(train.isnull().sum())

v = "***" * 4
print("\n")
print(v, end="\n\n")

print(test.isnull().sum())

In [None]:
def impute_missing(frame):
    """Return a copy of ``frame`` with its missing values filled in.

    The fill value for each column is computed from ``frame`` itself:

    * ``Garden`` and ``Geo_Code``  -> most frequent value (mode)
    * ``Building Dimension``       -> median
    * ``Date_of_Occupancy``        -> mean

    In ``NumberOfWindows`` the placeholder string ``'   .'`` is replaced by ``'0'``.

    The result is built by assignment rather than by calling
    ``frame[col].fillna(..., inplace=True)``: an in-place call on a selected
    column is a chained assignment, which pandas warns about and which does not
    update the parent frame when Copy-on-Write is enabled.
    """
    fill_values = {
        'Garden': frame['Garden'].mode()[0],
        'Building Dimension': frame['Building Dimension'].median(),
        'Date_of_Occupancy': frame['Date_of_Occupancy'].mean(),
        'Geo_Code': frame['Geo_Code'].mode()[0],
    }
    # fillna with a dict only touches the listed columns and returns a new frame.
    filled = frame.fillna(value=fill_values)
    filled['NumberOfWindows'] = filled['NumberOfWindows'].replace('   .', '0')
    return filled


# Each frame is imputed with statistics computed from that same frame.
train = impute_missing(train)
test = impute_missing(test)

In [None]:
# Target is the 'Claim' column; the features are every other column of train.
y = train['Claim']
X = train.drop(columns='Claim')

In [None]:
# Show every feature's dtype, then record the positional indices of the
# object-dtype columns; these are passed to CatBoost as the categorical features.
print(X.dtypes)

is_object_column = (X.dtypes == 'object').to_numpy()
categorical_features_indices = np.flatnonzero(is_object_column)

In [None]:
# Show the positional indices of the columns that will be treated as categorical.
print(categorical_features_indices)

In [None]:
# Hold out part of the labelled data for validation
from sklearn.model_selection import train_test_split

# The whole test frame is used as model input; its 'Customer Id' column is
# kept separately for building the submission file.
X_test = test
test_id = X_test['Customer Id']

# 80% of the labelled rows go to training, the remainder to validation (fixed seed).
split_frames = train_test_split(X, y, train_size=0.80, random_state=42)
X_train, X_validation, y_train, y_validation = split_frames

In [None]:
# Baseline CatBoost classifier: extra metrics tracked during training,
# a fixed seed, and logging silenced.
baseline_settings = {
    'custom_loss': ['Logloss', 'AUC', 'Accuracy'],
    'random_seed': 42,
    'logging_level': 'Silent',
}
model = CatBoostClassifier(**baseline_settings)

# Fit on the training split and evaluate on the validation split.
# Add logging_level='Verbose' to fit_settings for text output.
fit_settings = dict(
    cat_features=categorical_features_indices,
    eval_set=(X_validation, y_validation),
    plot=True,
)
model.fit(X_train, y_train, **fit_settings);

In [None]:
# Take the fitted model's parameter dict; it is reused below as the
# parameter set for cross-validation.
cv_params = model.get_params()
print(cv_params)

In [None]:
## attempting cross-validation
# 5-fold cross-validation over the full labelled set, reusing the baseline
# model's parameters with Logloss set as the loss function.
cv_params['loss_function'] = metrics.Logloss()

full_pool = Pool(X, y, cat_features=categorical_features_indices)
cv_data = cv(
    full_pool,
    cv_params,
    nfold=5,
    iterations=1000,
    plot=True,
)

In [None]:
# Preview the first rows of the cross-validation results.
cv_data.head()

In [None]:
# Report the highest mean test AUC over the CV iterations, the matching
# standard deviation, and the iteration at which that maximum occurs.
auc_mean = cv_data['test-AUC-mean']
best_step = np.argmax(auc_mean)
summary_template = 'Best validation AUC score is: {:.2f}±{:.2f} on step {}'
print(summary_template.format(
    np.max(auc_mean),
    cv_data['test-AUC-std'][best_step],
    best_step,
))

In [None]:
# Same maximum of the mean test AUC as above, printed without rounding.
print('Precise  AUC score: {}'.format(np.max(cv_data['test-AUC-mean'])))

In [None]:
# Apply the model to the test set: keep column 1 of predict_proba as the
# probability output, and the hard class labels from predict.
class_probabilities = model.predict_proba(X_test)
predictions_probs = class_probabilities[:, 1]
predictions = model.predict(X_test)

# Preview the first ten labels, then the first ten probabilities.
for preview in (predictions, predictions_probs):
    print(preview[:10])

In [None]:
# Pair each customer id with its predicted probability and write the result
# to Submission.csv without the index column.
submission_columns = {'Customer Id': test_id, 'Claim': predictions_probs}
submission = pd.DataFrame(submission_columns)
submission.to_csv('Submission.csv', index=False)

submission.head()

In [None]:
import hyperopt

def hyperopt_objective(params):
    """Objective for hyperopt: cross-validate one hyperparameter sample.

    Parameters
    ----------
    params : dict
        Must contain ``'l2_leaf_reg'`` (truncated to ``int``) and
        ``'learning_rate'``.

    Returns
    -------
    float
        ``1 - (highest mean test AUC across CV iterations)``, so that a
        better AUC yields a smaller value for hyperopt to minimise.

    Reads the notebook-level ``X``, ``y`` and ``categorical_features_indices``.
    """
    candidate_settings = dict(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=500,
        eval_metric=metrics.AUC(),
        random_seed=42,
        verbose=False,
        loss_function=metrics.Logloss(),
    )
    # The classifier is only built to obtain its parameter dict for cv().
    candidate = CatBoostClassifier(**candidate_settings)

    full_pool = Pool(X, y, cat_features=categorical_features_indices)
    fold_scores = cv(full_pool, candidate.get_params())
    best_auc = np.max(fold_scores['test-AUC-mean'])

    # hyperopt minimises the objective, so return the complement of the AUC.
    return 1 - best_auc

In [None]:
from numpy.random import RandomState

# Search space: l2_leaf_reg is drawn log-uniformly and quantised to whole
# numbers; learning_rate is drawn uniformly between 0.001 and 0.5.
params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
}

trials = hyperopt.Trials()

# Current hyperopt releases call `rstate.integers(...)`, which exists on
# numpy's Generator but not on the legacy RandomState, so a seeded Generator
# is passed to keep the search reproducible.
search_rng = np.random.default_rng(123)

# Run 50 TPE-guided evaluations of the objective; `best` holds the sampled
# values of the best trial.
best = hyperopt.fmin(
    hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=search_rng,
)

print(best)

In [None]:
# Rebuild the classifier with the best hyperparameters found by the search
# and cross-validate that configuration on the full labelled set.
tuned_settings = dict(
    l2_leaf_reg=int(best['l2_leaf_reg']),
    learning_rate=best['learning_rate'],
    iterations=1500,
    eval_metric=metrics.AUC(),
    random_seed=42,
    verbose=False,
    loss_function=metrics.Logloss(),
)
model = CatBoostClassifier(**tuned_settings)

full_pool = Pool(X, y, cat_features=categorical_features_indices)
cv_data = cv(full_pool, model.get_params())

In [None]:
# Highest mean test AUC across the CV iterations of the tuned configuration.
print('Precise validation AUC score: {}'.format(np.max(cv_data['test-AUC-mean'])))

In [None]:
# Fit the current `model` on the training split, evaluating on the validation split.
# Add logging_level='Verbose' to fit_settings for text output.
fit_settings = dict(
    cat_features=categorical_features_indices,
    eval_set=(X_validation, y_validation),
    plot=True,
)
model.fit(X_train, y_train, **fit_settings);

In [None]:
# Apply the refitted model to the test set: keep column 1 of predict_proba
# as the probability output, and the hard class labels from predict.
class_probabilities = model.predict_proba(X_test)
predictions_probs = class_probabilities[:, 1]
predictions = model.predict(X_test)

# Preview the first ten labels, then the first ten probabilities.
for preview in (predictions, predictions_probs):
    print(preview[:10])

In [None]:
# Pair each customer id with its predicted probability and write the result
# to Submission.csv without the index column.
submission_columns = {'Customer Id': test_id, 'Claim': predictions_probs}
submission = pd.DataFrame(submission_columns)
submission.to_csv('Submission.csv', index=False)

submission.head()