In [1]:
import os
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Print the full path of each file found anywhere below the input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/amexfeather/test_data_f32.ftr
/kaggle/input/amexfeather/train_data.ftr
/kaggle/input/amexfeather/train_data_f32.ftr
/kaggle/input/amexfeather/test_data.ftr


**Train Data Pre-Processing**

In [2]:
# Read the training table from the feather file.
train_df = pd.read_feather('../input/amexfeather/train_data.ftr')
print("Train Dataset : Rows =", train_df.shape[0], ", Columns = ", train_df.shape[1])

# Move customer_ID from the columns into the index (set_index drops the
# column by default), which leaves one column fewer.
train_df = train_df.set_index('customer_ID')
print("Train Dataset + Index : Rows =", train_df.shape[0], ", Columns = ", train_df.shape[1])

# Snapshot of the column names before any column is removed.
train_df_columns = train_df.columns.tolist()

Train Dataset : Rows = 5531451 , Columns =  191
Train Dataset + Index : Rows = 5531451 , Columns =  190


Dropping columns in which 25% or more of the values are NaN (a column is kept only if more than 75% of its values are present)

In [3]:
# dropna's `thresh` is the number of NON-null values a column needs in order
# to survive, so despite its name `minimum_null_count` is a non-null
# threshold: one more than 75% of the row count (truncated to an integer).
kept_fraction = (100 - 25) / 100
minimum_null_count = int(kept_fraction * train_df.shape[0] + 1)
train_df = train_df.dropna(axis='columns', thresh=minimum_null_count)

# Snapshot of the column names that survived the filter.
train_df_without_null_columns = train_df.columns.tolist()

In [4]:
# Columns present in the original snapshot but absent after the NaN filter,
# kept in their original order.
surviving_columns = set(train_df_without_null_columns)
train_df_removed_columns = [name for name in train_df_columns if name not in surviving_columns]
print("Train Dataset w/o Null : Rows =", train_df.shape[0], ", Columns = ", train_df.shape[1])

# Remove the S_2 column from the table.
train_df = train_df.drop(columns=["S_2"])
print("Train Dataset : Rows =", train_df.shape[0], ", Columns = ", train_df.shape[1])

Train Dataset w/o Null : Rows = 5531451 , Columns =  157
Train Dataset : Rows = 5531451 , Columns =  156


Encoding Categorical Data

In [5]:
# Every column stored with a category or object dtype is treated as categorical.
categorical_cols = train_df.select_dtypes(include=['category', 'object']).columns.tolist()

# Replace each categorical column with integer codes. fit_transform refits the
# encoder on every call, so a single instance is reused for all columns.
label_encoder = LabelEncoder()
for name in categorical_cols:
    train_df[name] = label_encoder.fit_transform(train_df[name])

Replacing NaN Values

In [6]:
# Categorical columns: fill gaps with the column's most frequent value.
# Series.mode() returns a Series (several values can tie); passing that Series
# to fillna aligns it on the index instead of broadcasting a single value, so
# no gap would actually be filled. Use the first mode as a scalar instead.
for col in categorical_cols:
    modes = train_df[col].mode()
    if not modes.empty:  # a column with no non-null values has no mode
        train_df[col] = train_df[col].fillna(modes.iloc[0])

# Every other column: fill gaps with the column median.
non_categorical_cols = [col for col in train_df.columns if col not in categorical_cols]
for col in non_categorical_cols:
    train_df[col] = train_df[col].fillna(train_df[col].median())

In [7]:
# All columns except the last form X_train; the last column alone forms
# y_train (sliced with -1: so it stays a one-column DataFrame).
X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1:]
print("X : Rows =", len(X_train), ", Columns = ", X_train.shape[1])
print("y : Rows =", len(y_train), ", Columns = ", y_train.shape[1])

# Release the name of the combined frame now that it has been split.
del train_df

X : Rows = 5531451 , Columns =  155
y : Rows = 5531451 , Columns =  1


Creating Features

In [8]:
# Group the rows by customer_ID once and derive four per-customer views.
per_customer = X_train.groupby('customer_ID')
X_1 = per_customer.mean()
X_2 = per_customer.max()
X_3 = per_customer.min()
# tail(1) keeps the last row of each group in the frame's current row order.
# NOTE(review): presumably rows are ordered by statement date per customer - confirm.
X_4 = per_customer.tail(1)

In [9]:
# Tag each aggregate's columns with its statistic so the names stay unique
# once the four frames are merged.
for frame, suffix in ((X_1, '_mean'), (X_2, '_max'), (X_3, '_min'), (X_4, '_latest')):
    frame.columns = frame.columns + suffix

In [10]:
# Join the four per-customer frames on customer_ID, one after another.
X = X_1
for part in (X_2, X_3, X_4):
    X = pd.merge(X, part, on='customer_ID')

Removing columns whose correlation with an earlier column is greater than 0.9

In [11]:
# Collect columns whose correlation with any earlier column is above 0.9
correlation_matrix = X.corr()
col_core = set()

# Only entries left of the diagonal are inspected, so for each correlated pair
# the later column is the one collected. The comparison is signed: strongly
# negative correlations are not collected.
corr_values = correlation_matrix.to_numpy()
for position, column_name in enumerate(correlation_matrix.columns):
    if (corr_values[position, :position] > 0.9).any():
        col_core.add(column_name)

In [12]:
# Remove the columns collected in col_core.
X = X.drop(columns=list(col_core))

# The per-customer mean of integer category codes can be fractional; round it
# back to a whole number. A "<col>_mean" column may itself have been dropped
# as correlated, and indexing it would then raise KeyError, so skip those.
for col in categorical_cols:
    mean_col = col + "_mean"
    if mean_col in X.columns:
        X[mean_col] = X[mean_col].round(0).astype(int)

In [13]:
# Reduce the labels to one row per customer_ID: average within each group,
# then round to the nearest whole number and cast to int.
y_train = (
    y_train
    .groupby('customer_ID')
    .mean()
    .round(0)
    .astype(int)
)

**Test Data Pre-Processing**

In [14]:
# Read the test table and move customer_ID into the index.
test_df = pd.read_feather('../input/amexfeather/test_data.ftr').set_index('customer_ID')
print("Test Dataset : Rows =", test_df.shape[0], ", Columns = ", test_df.shape[1])

# Apply the same column removals as on the training table: first the columns
# removed by the NaN filter, then S_2.
test_df = (
    test_df
    .drop(columns=train_df_removed_columns)
    .drop(columns=["S_2"])
)

Test Dataset : Rows = 11363762 , Columns =  189


In [15]:
# Replace each categorical test column with integer codes.
# NOTE(review): the encoder is refit on the test values here, so a code means
# the same thing as in the training table only if the column holds the same
# set of distinct values in both tables - confirm.
label_encoder = LabelEncoder()
for name in categorical_cols:
    test_df[name] = label_encoder.fit_transform(test_df[name])

In [16]:
# Categorical columns: fill gaps with the column's most frequent value.
# Series.mode() returns a Series (several values can tie); passing that Series
# to fillna aligns it on the index instead of broadcasting a single value, so
# no gap would actually be filled. Use the first mode as a scalar instead.
for col in categorical_cols:
    modes = test_df[col].mode()
    if not modes.empty:  # a column with no non-null values has no mode
        test_df[col] = test_df[col].fillna(modes.iloc[0])

# Every other column: fill gaps with the column median.
non_categorical_cols = [col for col in test_df.columns if col not in categorical_cols]
for col in non_categorical_cols:
    test_df[col] = test_df[col].fillna(test_df[col].median())

In [17]:
# Group the test rows by customer_ID once and derive four per-customer views.
test_groups = test_df.groupby('customer_ID')
test_X_1 = test_groups.mean()
test_X_2 = test_groups.max()
test_X_3 = test_groups.min()
test_X_4 = test_groups.tail(1)  # last row of each group in current row order

# Name each aggregate's columns after the source columns plus its statistic.
for frame, suffix in (
    (test_X_1, '_mean'),
    (test_X_2, '_max'),
    (test_X_3, '_min'),
    (test_X_4, '_latest'),
):
    frame.columns = test_df.columns + suffix

# Join the four frames on customer_ID, one after another.
X_test = test_X_1
for part in (test_X_2, test_X_3, test_X_4):
    X_test = pd.merge(X_test, part, on='customer_ID')

In [18]:
# Remove the same columns that were collected in col_core from the test features.
X_test = X_test.drop(columns=list(col_core))

# The per-customer mean of integer category codes can be fractional; round it
# back to a whole number. A "<col>_mean" column may itself have been dropped
# as correlated, and indexing it would then raise KeyError, so skip those.
for col in categorical_cols:
    mean_col = col + "_mean"
    if mean_col in X_test.columns:
        X_test[mean_col] = X_test[mean_col].round(0).astype(int)

**Model Training and Inference**

In [41]:
# Names of the derived categorical columns (one per statistic) that were not
# collected in col_core.
categorical_features = [
    col + "_" + stat
    for col in categorical_cols
    for stat in ("mean", "max", "min", "latest")
    if (col + "_" + stat) not in col_core
]

# Remove duplicates while keeping first-seen order.
categorical_features = list(dict.fromkeys(categorical_features))

In [None]:
params = {
    'objective': 'binary',
    'n_estimators': 1200,
    'metric': 'binary_logloss',
    'boosting': 'gbdt',
    'num_leaves': 90,
    'reg_lambda': 50,
    'colsample_bytree': 0.19,
    'learning_rate': 0.03,
    'min_child_samples': 2400,
    'max_bins': 511,
    'seed': 42,
    'verbose': -1,
}
lgb_dataset = lgb.Dataset(X, label=y_train, categorical_feature=categorical_features)

# `n_estimators` is an alias of LightGBM's `num_iterations`; when it is present
# in `params` it overrides the `num_boost_round` argument of lgb.train. The
# previous positional 100 was therefore ignored (only a suppressed warning was
# emitted). Pass the effective round count explicitly so the call says what it does.
lgb_model = lgb.train(params, lgb_dataset, num_boost_round=params['n_estimators'])

In [44]:
# Score the test features with the trained model.
predictions = lgb_model.predict(X_test)

# Two-column table (index values of X_test, model output), written without
# the DataFrame's own index.
output_df = pd.DataFrame({'customer_ID': X_test.index})
output_df['prediction'] = predictions
output_df.to_csv("LightGBM_predictions.csv", index=False)

**Validation**

In [None]:
# Hold Out Method

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Keep 30% of the customers aside for evaluation; the fixed random_state makes
# the split repeatable.
X_train_val, X_test_val, y_train_val, y_test_val = train_test_split(
    X, y_train, test_size=0.3, random_state=42
)

lgb_dataset_val = lgb.Dataset(X_train_val, label=y_train_val, categorical_feature=categorical_features)
# `n_estimators` in `params` is an alias of `num_iterations` and overrides the
# `num_boost_round` argument, so the previous positional 100 was ignored.
# Pass the effective round count explicitly.
lgb_model_val = lgb.train(params, lgb_dataset_val, num_boost_round=params['n_estimators'])

In [None]:
# With the binary objective, predict() returns a probability per row.
predictions_val = lgb_model_val.predict(X_test_val)

# accuracy_score compares class labels and raises ValueError when given
# continuous scores, so turn the probabilities into 0/1 labels at 0.5 first.
predicted_labels_val = (predictions_val >= 0.5).astype(int)
accuracy_score(y_test_val, predicted_labels_val)

In [None]:
# K-fold Cross-Validation

from sklearn.metrics import accuracy_score
# cross_val_score is no longer called below; the import is kept in case other
# cells rely on the name.
from sklearn.model_selection import StratifiedKFold, cross_val_score

# `lgb_model` is a native LightGBM Booster, not a scikit-learn estimator, so it
# cannot be passed to cross_val_score. Run the five folds by hand with the same
# native training call used elsewhere in this notebook. StratifiedKFold is the
# splitter cross_val_score itself uses for an integer `cv` on a classifier.
cv_labels = y_train.to_numpy().ravel()
fold_accuracies = []
for fit_idx, eval_idx in StratifiedKFold(n_splits=5).split(X, cv_labels):
    fold_dataset = lgb.Dataset(
        X.iloc[fit_idx],
        label=cv_labels[fit_idx],
        categorical_feature=categorical_features,
    )
    # `n_estimators` in `params` decides the number of boosting rounds.
    fold_model = lgb.train(params, fold_dataset, num_boost_round=params['n_estimators'])
    # Threshold the predicted probabilities at 0.5 to obtain class labels.
    fold_labels = (fold_model.predict(X.iloc[eval_idx]) >= 0.5).astype(int)
    fold_accuracies.append(accuracy_score(cv_labels[eval_idx], fold_labels))

np.mean(fold_accuracies)