In [None]:
import os
import pandas as pd
import warnings

from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

# Install a global filter that ignores every warning category.
warnings.filterwarnings('ignore')

# Print the full path of each file found anywhere under the Kaggle input folder.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

**Train Data Pre-Processing**

In [None]:
# Read the training table from the Feather file and report its size.
train_df = pd.read_feather('../input/amexfeather/train_data.ftr')
train_rows, train_cols = train_df.shape
print("Train Dataset : Rows =", train_rows, ", Columns = ", train_cols)

# Move customer_ID out of the columns and into the index, then report again.
train_df = train_df.set_index('customer_ID', drop=True)
train_rows, train_cols = train_df.shape
print("Train Dataset + Index : Rows =", train_rows, ", Columns = ", train_cols)

# Snapshot of the column names before any column is removed.
train_df_columns = list(train_df.columns)

In [None]:
# Columns with too many missing values are removed.
# Despite its name, minimum_null_count is passed to dropna as `thresh`, i.e. the
# minimum number of NON-null values a column needs in order to be kept
# (strictly more than 75% of the rows).
MAX_NULL_PERCENT = 25
minimum_null_count = int(((100 - MAX_NULL_PERCENT) / 100) * train_df.shape[0] + 1)
train_df = train_df.dropna(axis=1, thresh=minimum_null_count)

# Column names that survived the filter.
train_df_without_null_columns = train_df.columns.tolist()

In [None]:
# Names of the columns removed by the null-threshold filter, in their original order.
kept_columns = set(train_df_without_null_columns)
train_df_removed_columns = [name for name in train_df_columns if name not in kept_columns]
print("Train Dataset w/o Null : Rows =", train_df.shape[0], ", Columns = ", train_df.shape[1])

# The S_2 column is not used as a feature, so it is dropped as well.
train_df = train_df.drop(columns=["S_2"])
print("Train Dataset : Rows =", train_df.shape[0], ", Columns = ", train_df.shape[1])

In [None]:
# Every column stored with a 'category' or 'object' dtype is treated as categorical.
categorical_cols = train_df.select_dtypes(include=['category', 'object']).columns.tolist()

# Replace each categorical column with integer codes.
# NOTE(review): the same LabelEncoder instance is re-fit for every column, so after
# the loop it only remembers the classes of the last column; the per-column
# mappings are not kept anywhere.
label_encoder = LabelEncoder()
for categorical_col in categorical_cols:
    encoded_values = label_encoder.fit_transform(train_df[categorical_col])
    train_df[categorical_col] = encoded_values

In [None]:
# Categorical columns: fill missing values with the most frequent value.
# Series.mode() returns a Series indexed 0..k-1; handing that Series to fillna
# makes pandas align it on the index (customer_ID here), so nothing would be
# filled. The scalar first mode must be passed instead.
for col in categorical_cols:
    col_modes = train_df[col].mode()
    if not col_modes.empty:  # mode() is empty when the column has no non-null value
        train_df[col] = train_df[col].fillna(col_modes.iloc[0])

# All remaining columns: fill missing values with the column median.
categorical_lookup = set(categorical_cols)
for col in train_df.columns:
    if col not in categorical_lookup:
        train_df[col] = train_df[col].fillna(train_df[col].median())

In [None]:
# The last column is taken as the label; every column before it is a feature.
y_train = train_df.iloc[:, -1:]
X_train = train_df.iloc[:, :-1]

feature_rows, feature_cols = X_train.shape
print("X : Rows =", feature_rows, ", Columns = ", feature_cols)
label_rows, label_cols = y_train.shape
print("y : Rows =", label_rows, ", Columns = ", label_cols)

# Remove the name of the full training frame now that it has been split.
del train_df

In [None]:
# Collapse the statement-level rows into per-customer features:
# mean, max and min of every column, plus the last row of each customer.
per_customer = X_train.groupby('customer_ID')
X_1 = per_customer.mean()
X_2 = per_customer.max()
X_3 = per_customer.min()
X_4 = per_customer.tail(1)

In [None]:
# Tag every column with the aggregation it came from so names stay unique after merging.
# NOTE: this renames in place, so running the cell twice appends the suffix twice.
for frame, suffix in ((X_1, '_mean'), (X_2, '_max'), (X_3, '_min'), (X_4, '_latest')):
    frame.columns = frame.columns + suffix

In [None]:
# Join the four aggregate tables side by side, matching rows on customer_ID.
X = X_1
for part in (X_2, X_3, X_4):
    X = pd.merge(X, part, on='customer_ID')

In [None]:
# Collect columns whose correlation with any EARLIER column exceeds 0.9.
# Only the strictly-lower triangle is inspected, so of each highly correlated
# pair the later column is the one marked for removal. The test is one-sided:
# strongly negative correlations (below -0.9) are not flagged.
correlation_matrix = X.corr()
corr_values = correlation_matrix.to_numpy()

col_core = {
    name
    for position, name in enumerate(correlation_matrix.columns)
    if (corr_values[position, :position] > 0.9).any()
}

In [None]:
# Remove the columns flagged as highly correlated.
X = X.drop(columns=list(col_core))

# The mean of an integer-coded categorical column is fractional; round it back
# to a whole-number code. A categorical "_mean" column may itself have been
# removed by the correlation filter, so skip it when absent instead of raising
# KeyError.
for col in categorical_cols:
    mean_col = col + "_mean"
    if mean_col in X.columns:
        X[mean_col] = X[mean_col].round(0).astype(int)

In [None]:
# Reduce the label to one row per customer: average it, then round to an integer.
y_train = (
    y_train
    .groupby('customer_ID')
    .mean()
    .round(0)
    .astype(int)
)

**Test Data Pre-Processing**

In [None]:
# Read the test table and index it by customer_ID, as was done for the training data.
test_df = pd.read_feather('../input/amexfeather/test_data.ftr').set_index('customer_ID', drop=True)
test_rows, test_cols = test_df.shape
print("Test Dataset : Rows =", test_rows, ", Columns = ", test_cols)

# Drop the columns that were removed from the training data, plus S_2.
test_df = test_df.drop(columns=[*train_df_removed_columns, "S_2"])

In [None]:
# Integer-encode the categorical columns of the test data.
# NOTE(review): the encoder is fit again on the test column itself, so the codes
# line up with the training codes only when both columns contain exactly the
# same set of values -- confirm, or reuse encoders fitted on the training data.
label_encoder = LabelEncoder()
for categorical_col in categorical_cols:
    encoded_values = label_encoder.fit_transform(test_df[categorical_col])
    test_df[categorical_col] = encoded_values

In [None]:
# Categorical columns: fill missing values with the most frequent value.
# Series.mode() returns a Series indexed 0..k-1; handing that Series to fillna
# makes pandas align it on the index (customer_ID here), so nothing would be
# filled. The scalar first mode must be passed instead.
for col in categorical_cols:
    col_modes = test_df[col].mode()
    if not col_modes.empty:  # mode() is empty when the column has no non-null value
        test_df[col] = test_df[col].fillna(col_modes.iloc[0])

# All remaining columns: fill missing values with the column median.
categorical_lookup = set(categorical_cols)
for col in test_df.columns:
    if col not in categorical_lookup:
        test_df[col] = test_df[col].fillna(test_df[col].median())

In [None]:
# Per-customer aggregates of the test data: mean, max, min and last row.
test_groups = test_df.groupby('customer_ID')
test_X_1 = test_groups.mean()
test_X_2 = test_groups.max()
test_X_3 = test_groups.min()
test_X_4 = test_groups.tail(1)

# Label each aggregate's columns with the source column name plus a suffix.
suffixed_frames = (
    (test_X_1, '_mean'),
    (test_X_2, '_max'),
    (test_X_3, '_min'),
    (test_X_4, '_latest'),
)
for frame, suffix in suffixed_frames:
    frame.columns = test_df.columns + suffix

# Join the four aggregate tables side by side, matching rows on customer_ID.
X_test = test_X_1
for part in (test_X_2, test_X_3, test_X_4):
    X_test = pd.merge(X_test, part, on='customer_ID')

In [None]:
# Remove the same highly correlated columns that were flagged on the training features.
X_test = X_test.drop(columns=list(col_core))

# The mean of an integer-coded categorical column is fractional; round it back
# to a whole-number code. A categorical "_mean" column may itself be among the
# dropped columns, so skip it when absent instead of raising KeyError.
for col in categorical_cols:
    mean_col = col + "_mean"
    if mean_col in X_test.columns:
        X_test[mean_col] = X_test[mean_col].round(0).astype(int)

**Model Training and Inference**

In [None]:
# Fit a 7-nearest-neighbours classifier on the per-customer features and labels.
# fit() returns the estimator itself, so construction and fitting can be chained.
knn_model = KNeighborsClassifier(n_neighbors=7).fit(X, y_train)

In [None]:
# Predict a class label for every customer in the test features.
predictions = knn_model.predict(X_test)

# Build the two-column output table and write it to CSV without the row index.
output_df = pd.DataFrame({'customer_ID': X_test.index})
output_df['prediction'] = predictions
output_df.to_csv("KNN_predictions.csv", index=False)