In [1]:
!pip install catboost

[0m

In [2]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/amexfeather/test_data_f32.ftr
/kaggle/input/amexfeather/train_data.ftr
/kaggle/input/amexfeather/train_data_f32.ftr
/kaggle/input/amexfeather/test_data.ftr


In [3]:
# Load the training rows, report the shape, then move customer_ID from the
# columns into the index (which is why the column count drops by one).
train_df = pd.read_feather('../input/amexfeather/train_data.ftr')
print("Train Dataset : Rows =", len(train_df), ", Columns = ", len(train_df.columns))

train_df = train_df.set_index('customer_ID')
print("Train Dataset : Rows =", len(train_df), ", Columns = ", len(train_df.columns))

Train Dataset : Rows = 5531451 , Columns =  191
Train Dataset : Rows = 5531451 , Columns =  190


In [4]:
# Columns handled as categorical features.
categorical_feature_cols = [
    'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
    'D_126', 'D_63', 'D_64', 'D_66', 'D_68',
]

# Numerical features: every train_df column that is neither categorical nor
# the "target" column, with S_2 removed afterwards. Column order is preserved.
is_excluded = train_df.columns.isin(categorical_feature_cols + ["target"])
numerical_feature_cols = train_df.columns[~is_excluded].tolist()
numerical_feature_cols.remove('S_2')

In [5]:
# Columns to discard from the training frame. Per the original author these
# are the columns with more than 80% null values; the list also contains S_2.
removable_feature_cols = np.array([
    'S_2', 'D_66', 'D_42', 'D_49', 'D_73', 'D_76', 'R_9', 'B_29',
    'D_87', 'D_88', 'D_106', 'R_26', 'D_108', 'D_110', 'D_111', 'B_39',
    'B_42', 'D_132', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142',
])

train_df = train_df.drop(columns=removable_feature_cols)
print("Train Dataset : Rows =", len(train_df), ", Columns = ", len(train_df.columns))

Train Dataset : Rows = 5531451 , Columns =  166


In [6]:
# Numerical columns whose missing values are replaced by the column median.
numerical_feature_cols_w_NaN = np.array([
    'P_2', 'S_3', 'B_2', 'D_41', 'D_43', 'B_3', 'D_44', 'D_45', 'D_46', 'D_48',
    'D_50', 'D_53', 'S_7', 'D_56', 'S_9', 'B_6', 'B_8', 'D_52', 'P_3', 'D_54',
    'D_55', 'B_13', 'D_59', 'D_61', 'B_15', 'D_62', 'B_16', 'B_17', 'D_77', 'B_19',
    'B_20', 'D_69', 'B_22', 'D_70', 'D_72', 'D_74', 'R_7', 'B_25', 'B_26', 'D_78',
    'D_79', 'D_80', 'B_27', 'D_81', 'R_12', 'D_82', 'D_105', 'S_27', 'D_83', 'R_14',
    'D_84', 'D_86', 'R_20', 'B_33', 'D_89', 'D_91', 'S_22', 'S_23', 'S_24', 'S_25',
    'S_26', 'D_102', 'D_103', 'D_104', 'D_107', 'B_37', 'R_27', 'D_109', 'D_112', 'B_40',
    'D_113', 'D_115', 'D_118', 'D_119', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_128',
    'D_129', 'B_41', 'D_130', 'D_131', 'D_133', 'D_139', 'D_140', 'D_141', 'D_143', 'D_144',
    'D_145',
])

for name in numerical_feature_cols_w_NaN:
    column = train_df[name]
    train_df[name] = column.fillna(column.median())

# Categorical columns whose missing values are replaced by the most frequent
# value (the first entry returned by mode()).
categorical_feature_cols_w_NaN = np.array([
    'D_68', 'B_30', 'B_38', 'D_64', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
])

for name in categorical_feature_cols_w_NaN:
    column = train_df[name]
    train_df[name] = column.fillna(column.mode()[0])

In [7]:
# Optional sanity check for remaining NaN values:
#   print(train_df.isnull().sum().to_string())

# Positional split: the final column becomes y_train (kept as a one-column
# frame) and every column before it becomes X_train.
# NOTE(review): presumably the final column is "target" - confirm.
X_train, y_train = train_df.iloc[:, :-1], train_df.iloc[:, -1:]
print("X : Rows =", len(X_train), ", Columns = ", len(X_train.columns))
print("y : Rows =", len(y_train), ", Columns = ", len(y_train.columns))

X : Rows = 5531451 , Columns =  165
y : Rows = 5531451 , Columns =  1


In [8]:
# Integer-encode the categorical feature columns of X_train.
label_encoder = LabelEncoder()

# D_66 is one of the removable columns dropped from train_df, so it must not
# be encoded. Rebuild the list with a filter instead of calling
# list.remove(): remove() mutates the list defined in an earlier cell and
# raises ValueError when this cell is executed a second time.
categorical_feature_cols = [col for col in categorical_feature_cols if col != "D_66"]

for col in categorical_feature_cols:
    # fit_transform re-fits the shared encoder on each column, so after the
    # loop label_encoder only holds the classes of the last column processed.
    X_train[col] = label_encoder.fit_transform(X_train[col])

In [9]:
# Collapse the rows to one per customer_ID by averaging every column, then
# round the averaged categorical codes to the nearest integer and cast them
# to int. All other columns keep their mean values.
X_train = (
    X_train.groupby('customer_ID')
    .mean()
    .round(dict.fromkeys(categorical_feature_cols, 0))
    .astype(dict.fromkeys(categorical_feature_cols, int))
)

# Same aggregation for the label frame: mean per customer, rounded, cast to int.
y_train = y_train.groupby('customer_ID').mean().round(0).astype(int)

In [10]:
# Load the test rows and report the frame's dimensions.
test_df = pd.read_feather('../input/amexfeather/test_data.ftr')
print("Test Dataset : Rows =", len(test_df), ", Columns = ", len(test_df.columns))

Test Dataset : Rows = 11363762 , Columns =  190


In [11]:
# Index the test rows by customer_ID.
test_df = test_df.set_index('customer_ID')

# Columns removed from the test frame.
removable_feature_cols = [
    'S_2', 'D_42', 'D_49', 'D_66', 'D_73', 'D_76', 'R_9', 'B_29',
    'D_87', 'D_88', 'D_106', 'R_26', 'D_108', 'D_110', 'D_111', 'B_39',
    'B_42', 'D_132', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142',
]
# Numerical columns imputed with the median.
numerical_feature_cols_w_NaN = [
    'P_2', 'S_3', 'B_2', 'D_41', 'D_43', 'B_3', 'D_44', 'D_45', 'D_46', 'D_48',
    'D_50', 'D_53', 'S_7', 'D_56', 'S_9', 'S_12', 'S_17', 'B_6', 'B_8', 'D_52',
    'P_3', 'D_54', 'D_55', 'B_13', 'D_59', 'D_61', 'B_15', 'D_62', 'B_16', 'B_17',
    'D_77', 'B_19', 'B_20', 'D_69', 'B_22', 'D_70', 'D_72', 'D_74', 'R_7', 'B_25',
    'B_26', 'D_78', 'D_79', 'D_80', 'B_27', 'D_81', 'R_12', 'D_82', 'D_105', 'S_27',
    'D_83', 'R_14', 'D_84', 'D_86', 'R_20', 'B_33', 'D_89', 'D_91', 'S_22', 'S_23',
    'S_24', 'S_25', 'S_26', 'D_102', 'D_103', 'D_104', 'D_107', 'B_37', 'R_27', 'D_109',
    'D_112', 'B_40', 'D_113', 'D_115', 'D_118', 'D_119', 'D_121', 'D_122', 'D_123', 'D_124',
    'D_125', 'D_128', 'D_129', 'B_41', 'D_130', 'D_131', 'D_133', 'D_139', 'D_140', 'D_141',
    'D_143', 'D_144', 'D_145',
]
# Categorical columns imputed with the most frequent value.
categorical_feature_cols_w_NaN = [
    'D_68', 'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
]

test_df = test_df.drop(columns=removable_feature_cols)

# NOTE(review): the fill values below are computed from the test rows
# themselves, not from the training data - confirm this is intended.
for name in numerical_feature_cols_w_NaN:
    column = test_df[name]
    test_df[name] = column.fillna(column.median())
for name in categorical_feature_cols_w_NaN:
    column = test_df[name]
    test_df[name] = column.fillna(column.mode()[0])

print("Test Dataset : Rows =", len(test_df), ", Columns = ", len(test_df.columns))

Test Dataset : Rows = 11363762 , Columns =  165


In [12]:
# Optional sanity check for remaining NaN values:
#   print(test_df.isnull().sum().to_string())

# Encode the test categoricals with a mapping learned from the training rows.
# Calling fit_transform on the test column re-fits the encoder, so the integer
# codes come from the test set's own sorted labels and can differ from the
# codes used for X_train whenever the two label sets are not identical.
# Fitting on the training column and only transforming the test column keeps
# the codes aligned; transform() raises ValueError for a label that never
# occurred in training instead of silently shifting the codes.
# NOTE(review): assumes train_df still holds the imputed, un-encoded
# categorical values that X_train was sliced from - confirm.
for col in categorical_feature_cols:
    train_fitted_encoder = LabelEncoder().fit(train_df[col])
    test_df[col] = train_fitted_encoder.transform(test_df[col])

# Collapse to one row per customer by averaging every column.
test_df = test_df.groupby('customer_ID').mean()

# Averaged category codes are rounded to the nearest integer and cast to int.
for col in categorical_feature_cols:
    test_df[col] = test_df[col].round(0).astype(int)

In [13]:
# Collect the columns to remove for high correlation: a column is flagged when
# its correlation with any column positioned before it in X_train exceeds 0.9.
# NOTE(review): the comparison is on the signed coefficient, so strongly
# negative correlations are not flagged - confirm this is intended.
correlation_matrix = X_train.corr()

exceeds_threshold = correlation_matrix.gt(0.9).to_numpy()
# Strictly-lower-triangle mask: only pairs (i, j) with j < i are considered.
below_diagonal = np.tril(np.ones(exceeds_threshold.shape, dtype=bool), k=-1)
flagged_rows = (exceeds_threshold & below_diagonal).any(axis=1)

col_core = set(correlation_matrix.columns[flagged_rows])

In [14]:
# Remove the flagged columns from both frames so the training and test
# feature sets stay identical.
X_train = X_train.drop(columns=list(col_core))
test_df = test_df.drop(columns=list(col_core))

**Model Training, Inference, and Bagging**

In [16]:
# Columns passed to CatBoost as categorical features.
cat_features = [
    "B_30", "B_38", "D_114", "D_116", "D_117",
    "D_120", "D_126", "D_63", "D_64", "D_68",
]
predictions = []

# Bagging: fit the same configuration ten times, changing only the random
# seed, and keep each model's class-1 probabilities for the test customers.
for seed in range(10):
    classifier = CatBoostClassifier(
        learning_rate=0.03,
        iterations=1500,
        random_seed=seed,
        logging_level='Silent',
    )
    classifier.fit(X_train, y_train, cat_features=cat_features)

    class_one_proba = classifier.predict_proba(test_df)[:, 1]
    predictions.append(class_one_proba)

    print("Iterations Completed :", seed)

Iterations Completed : 0
Iterations Completed : 1
Iterations Completed : 2
Iterations Completed : 3
Iterations Completed : 4
Iterations Completed : 5
Iterations Completed : 6
Iterations Completed : 7
Iterations Completed : 8
Iterations Completed : 9


In [17]:
# Average the per-seed probability vectors element-wise, giving one score per
# row of test_df.
mean_predictions = np.stack(predictions).mean(axis=0)
df_predictions = pd.DataFrame({"prediction": mean_predictions})

# Turn the test index into an ordinary column on a fresh 0..n-1 index so it
# lines up row-for-row with the prediction frame.
df_ids = test_df.index.to_frame(index=False)

df = pd.concat([df_ids, df_predictions], axis=1)
df.to_csv("CatBoost_predictions.csv", index=False)