In [1]:
# Mount Google Drive into the Colab runtime at /drive so the project files are reachable.
from google.colab import drive
drive.mount('/drive')

# Directory inside the mounted Drive; every read_csv / to_csv / save_model call
# in this notebook builds its path by appending a file name to PATH.
PATH = '/drive/My Drive/defaults/'

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# Fraction of the training rows held out for validation (test_size of train_test_split).
VAL_SIZE   = 0.20
# Seed passed to both train_test_split (random_state) and CatBoostRegressor (random_seed).
RANDOM_SEED = 42


def getFeaturesInfo(df):
    """Build a per-column summary of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with these columns:

        * ``'nunique'``  - number of distinct non-null values,
        * ``'<lambda>'`` - array holding the first 10 unique values,
        * ``'null'``     - number of missing values,
        * ``'dtype'``    - dtype of the column.
    """
    # The summary is assembled directly instead of chaining ``DataFrame.append``
    # (removed in pandas 2.0) onto ``agg`` over a set, whose iteration order made
    # the order of the first two result columns non-deterministic.
    # '<lambda>' is the label the agg-based version produced for the
    # unique-values column; it is kept so existing lookups keep working.
    unique_head = pd.Series(
        [series.unique()[:10] for _, series in df.items()],
        index=df.columns,
        dtype=object,
    )

    return pd.DataFrame({
        'nunique': df.nunique(),
        '<lambda>': unique_head,
        'null': df.isnull().sum(),
        'dtype': df.dtypes,
    })


def SortFeatures(cols):
    """Split column names into binary, categorical and numeric groups.

    The grouping is purely name-based:

    * names containing ``'empty'`` are binary (missing-value indicator) columns;
    * names containing ``'Category'`` or ``'MONTH'`` are categorical columns;
    * every other name is numeric, except ``'incident'``, ``'flg_90_12_add'``
      and ``'default'``, which are left out of all three groups.

    Parameters
    ----------
    cols : iterable of str
        Column names to classify.

    Returns
    -------
    tuple of (list, list, list)
        ``(bin_cols, cat_cols, num_cols)``, each in the order the names appear in ``cols``.
    """
    # Filtering while iterating (instead of ``list.remove`` afterwards) means a
    # frame that lacks one of these columns no longer raises ValueError.
    excluded = {'incident', 'flg_90_12_add', 'default'}

    bin_cols = []
    cat_cols = []
    num_cols = []

    for col in cols:
        if 'empty' in col:
            bin_cols.append(col)
        elif 'Category' in col or 'MONTH' in col:
            cat_cols.append(col)
        elif col not in excluded:
            num_cols.append(col)

    return bin_cols, cat_cols, num_cols


def mape(y_true, y_pred):
    """Mean absolute percentage error, returned as a fraction (not multiplied by 100).

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    numpy.float64
        ``mean(|y_pred - y_true| / |y_true|)``. Entries where ``y_true`` is 0
        produce ``inf``/``nan`` under the usual NumPy division rules.
    """
    # Compare element by element: converting to arrays stops pandas from
    # aligning two Series on their index labels (which yields NaN for
    # mismatched indices) and also lets plain lists be passed.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_pred - y_true) / y_true))


# Read the three CSV inputs stored under PATH.
data, model = (pd.read_csv(PATH + file_name) for file_name in ('data.csv', 'model.csv'))
sample = pd.read_csv(PATH + 'sample_submission.csv', header=None)

# Copy the 'PD' column of model.csv into the feature table as 'default'
# (the assignment aligns the two frames on their row index).
data['default'] = model['PD']

# Keep only the columns that have at least len(data)//2 non-null values.
data = data.dropna(thresh=len(data)//2, axis=1)

# Columns that still contain missing values.
na_counts = data.isna().sum()
isna_cols = na_counts[na_counts > 0].index.tolist()

for col in isna_cols:
    # 'flg_90_12_add' and 'default' feed the target further down; their NaNs are left untouched.
    if col in ('flg_90_12_add', 'default'):
        continue

    # Remember which rows were missing, then fill the gaps with the column median.
    new_col = col + '_empty'
    data[new_col] = data[col].isna().astype('uint8')
    # The filled Series is assigned back explicitly: a chained
    # ``data[col].fillna(..., inplace=True)`` works on a temporary object and,
    # with pandas Copy-on-Write, leaves ``data`` unchanged.
    data[col] = data[col].fillna(data[col].median())

        
# Column overview (for inspection) and the name-based column groups used by later steps.
columns = list(data.columns)
features = getFeaturesInfo(data)

bin_cols, cat_cols, num_cols = SortFeatures(columns)

# dealing with outliers
for col in num_cols:
    # Both quartiles come from a single quantile call; the unused median is no longer computed.
    perc25, perc75 = data[col].quantile([0.25, 0.75])
    IQR = perc75 - perc25

    # replace outliers with border-values: anything further than 1.5*IQR
    # from the quartiles is pulled back to that border (NaNs stay NaN).
    data[col] = data[col].clip(lower=perc25 - 1.5*IQR, upper=perc75 + 1.5*IQR)
    
    
# Replace APPLICATION_MONTH values with their integer category codes.
data['APPLICATION_MONTH'] = pd.Categorical(data['APPLICATION_MONTH']).codes

# Drop features whose pairwise absolute correlation exceeds 0.8 (left disabled: no such columns were found)
# data_temp = data[num_cols].head(10000).copy()
# corr = data_temp.corr().abs()
# upper_tri = corr.where(np.triu(np.ones(corr.shape), k=1).astype(np.bool))
# high_corr_cols = [column for column in upper_tri.columns if any(upper_tri[column] > 0.8)]
# num_cols = [x for x in num_cols if x not in high_corr_cols]
# data.drop(high_corr_cols, axis=1, inplace=True)

# Target is the absolute difference between 'flg_90_12_add' and 'default';
# both source columns are dropped once the target has been derived.
data['target'] = (data['flg_90_12_add'] - data['default']).abs()
data = data.drop(columns=['flg_90_12_add', 'default'])

# Standardise the numeric columns.
# NOTE(review): the scaler is fitted on every row of `data`, i.e. before the
# train/test slicing that follows - confirm this is intended.
scaler = StandardScaler()
numeric_values = data[num_cols].to_numpy()
data[num_cols] = scaler.fit_transform(numeric_values)

# One-hot encode the categorical columns, using each column name as the prefix.
data = pd.get_dummies(data, columns=cat_cols, prefix=cat_cols)

# Number of leading rows of `data` that make up the training set; every row
# after them is treated as the test set. Named once instead of repeating the
# literal in both slices.
N_TRAIN_ROWS = 81617

# Positional split of the combined frame.
train = data.iloc[:N_TRAIN_ROWS]
test = data.iloc[N_TRAIN_ROWS:]

# Features / target for training, features only for the test rows.
X = train.drop(columns=['target'])
y = train['target']
X_test = test.drop(columns=['target'])

# Shuffled hold-out split of the training rows; VAL_SIZE of them go to validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=VAL_SIZE,
    shuffle=True,
    random_state=RANDOM_SEED,
)


In [3]:
from catboost import CatBoostRegressor

# Wall-clock timer: covers model construction, fitting, saving and the validation prediction.
start = datetime.now()

# CatBoost regressor configured with RMSE as loss_function and MAPE as eval_metric;
# R2 and MAE are requested as additional metrics. Seeded with RANDOM_SEED.
cb = CatBoostRegressor(iterations = 1000,
                       learning_rate=0.1,
                       random_seed = RANDOM_SEED,
                       loss_function='RMSE',
                       eval_metric='MAPE',
                       custom_metric=['R2', 'MAE'],
                       silent=True)

# Fit on the training split with the validation split passed as eval_set.
# NOTE(review): use_best_model=True presumably keeps the iteration that scored
# best on eval_set - confirm against the CatBoost documentation.
cb.fit(X_train,
      y_train,
      eval_set=(X_valid, y_valid),
      verbose_eval=0,
      use_best_model=True)

# Persist the fitted model next to the data files.
cb.save_model(PATH+'catboost_single_model_baseline.model')

# Evaluate accuracy on the validation split
predict_cb = cb.predict(X_valid)
# Report the elapsed time and the validation MAPE as a percentage (messages are printed in Russian).
print('\nВремя выполнения - ', datetime.now() - start)
print(f"Точность модели по метрике MAPE: {(mape(y_valid, predict_cb))*100:0.2f}%")

# MAPE = 63.54% / 1.5m
# 156.15 minmax 152.43 standard
# 142.15 on learning rate 0.1


Время выполнения -  0:03:33.182416
Точность модели по метрике MAPE: 151.34%


In [4]:
# CB Regressor
# Predict the test rows with the fitted CatBoost model.
y_test = cb.predict(X_test)
# Absolute values are written out, so no negative prediction reaches the file
# (the training target was itself built with np.abs).
submission = pd.DataFrame(np.abs(y_test))
# Saved without the row index; the single column keeps its default header.
submission.to_csv(PATH+'submission_cb.csv', index=False)
# Open MAE 0.10216
# Open MAE on all dataset 0.07792
# with setup for CatBoost 0.07328

# Preview the first rows of the submission.
submission.head()

Unnamed: 0,0
0,0.007934
1,0.07516
2,0.0288
3,0.146813
4,0.086536
