In [None]:
# Import libraries and set plot style
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style to the figures created below
sns.set_style("whitegrid")

# Exploratory Data Analysis

In [None]:
# Read the train set, the test set and the sample submission.
# The competition directory is kept in a single constant so the notebook can
# be pointed at another data location with one edit.
DATA_DIR = '/kaggle/input/tabular-playground-series-feb-2021'

df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Report the size of the training frame
n_rows, n_cols = df_train.shape
print("There are", n_rows, "rows and", n_cols, "columns.")

# List every column name
print(df_train.columns)

# Separate the target from the features; 'id' is dropped together with it
y_train = df_train['target']
x_train = df_train.drop(['target', 'id'], axis=1)

Besides the target, there are 25 columns: 10 categorical features, 14 continuous features and the 'id' column. We drop 'id' because it is only a unique identifier of each row.



## Target
Let's begin with plotting the distribution of our target variable. As it is one-dimensional, it is an easy way to have a first grasp of our data. 

We see that it looks like a normal distribution centered around 7.5 with a standard deviation of ~ 2.

In [None]:
# Histogram of the target values, grouped into 20 bins
y_train.hist(bins=20)

## Features
We have two types of features (categorical and continuous). Let's first focus on the continuous ones.

### Continuous features

Insights:
 - All values are roughly in the interval [0,1] but some of them lie beyond the boundaries.
 - cont4 seems to have a very sharp peak around 0.275
 - All other features behave smoothly, except for cont1, which has some sharp peaks.
 - There seem to be some correlated features (cont0-cont5, cont5-cont8, ...)
 

In [None]:
# Continuous features are the columns whose name contains 'cont'
cont_cols = list(filter(lambda name: 'cont' in name, x_train.columns))

# Keep only those columns
x_cont = x_train[cont_cols]

# Draw the density histograms of all continuous features on one Axes
fig, ax = plt.subplots(figsize=(15, 15))
sns.histplot(data=x_cont, stat='density', ax=ax)

In [None]:
# Pairwise correlation of the continuous features, shown as a heatmap
correlation_mat = x_cont.corr()
colormap = sns.color_palette("Greens")
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(correlation_mat, cmap=colormap, annot=False, ax=ax)

### Categorical features

- The number of categories varies a lot between features: there are three binary features and one feature with 15 possible categories.
- Each category occurs with a certain frequency, but this does not provide us with any useful information, so we omit the plots.
- The Cramér's V correlation matrix shows that there are no strong correlations between the categorical features.

In [None]:
# Categorical features are the columns whose name contains 'cat'
cat_cols = list(filter(lambda name: 'cat' in name, x_train.columns))

# Keep only those columns
x_cat = x_train[cat_cols]

# Show the distinct categories of each feature.
# There are too many to plot, so they are printed instead.
for col, values in x_cat.items():
    print(col, values.unique())

In [None]:
# Cramer's V
from scipy import stats

def get_cramer_correlation(X, y):
    """Return the bias-corrected Cramér's V between two categorical series.

    The contingency table of ``X`` and ``y`` is built with ``pd.crosstab``
    and its chi-squared statistic is turned into Cramér's V, with the
    correction terms in ``(n - 1)`` applied to phi² and to the table
    dimensions.

    Parameters
    ----------
    X, y : array-like accepted by ``pd.crosstab``
        The two categorical variables, aligned element by element.

    Returns
    -------
    float
        A value in ``[0, 1]``. ``0.0`` is returned when the statistic is
        undefined (either variable has fewer than two observed categories,
        or the corrected table dimension is not positive).
    """
    confusion_matrix = pd.crosstab(X, y)
    r, k = confusion_matrix.shape

    # A table with a single row or column (or no data at all) carries no
    # association information; the formula below would divide by zero.
    if min(r, k) < 2:
        return 0.0

    # Cramér's V is defined on the plain Pearson chi-squared statistic, so
    # Yates' continuity correction (SciPy's default for 2x2 tables) must be
    # switched off; otherwise a binary feature compared with itself scores
    # below 1.
    chi2 = stats.chi2_contingency(confusion_matrix, correction=False)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n

    # Bias correction of phi² and of the table dimensions
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    # With very few observations the corrected dimension can collapse to 1,
    # which would again divide by zero.
    denominator = min((kcorr - 1), (rcorr - 1))
    if denominator <= 0:
        return 0.0
    return np.sqrt(phi2corr / denominator)


def get_cramer_correlation_matrix(X):
    """Build the square matrix of pairwise Cramér's V values for ``X``.

    Entry ``[i, j]`` is ``get_cramer_correlation`` applied to the i-th and
    j-th columns of ``X``; every ordered pair is evaluated, the diagonal
    included. Returns a NumPy array of shape ``(n_columns, n_columns)``.
    """
    names = X.columns
    size = len(names)

    # Start from zeros and fill each cell in row-major order
    coefficients = np.zeros(shape=(size, size))
    for row, col in np.ndindex(size, size):
        coefficients[row, col] = get_cramer_correlation(X[names[row]], X[names[col]])

    return coefficients


def plot_cramer_correlation_matrix(X):
    """Draw the Cramér's V matrix of the columns of ``X`` as an image plot.

    The coefficients come from ``get_cramer_correlation_matrix``; both axes
    are labelled with the column names and a colorbar is attached.
    Returns ``None``.
    """
    labels = X.columns
    positions = range(len(labels))

    # Pairwise coefficients between all columns of X
    corr_matrix = get_cramer_correlation_matrix(X)

    fig, ax = plt.subplots(1, 1)
    img = ax.imshow(corr_matrix, alpha=0.8, cmap='OrRd')

    # One tick per column on each axis; the x labels are rotated by 90 degrees
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=90)
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)

    fig.colorbar(img)

# Show the Cramér's V matrix of the categorical features
plot_cramer_correlation_matrix(x_cat)

# Baseline model

Using a categorical encoder (M-estimate encoding), let's first convert the categorical features into numerical ones. Then, we drop the original categorical columns and feed an XGBoost model with the remaining features.

In [None]:
from category_encoders import MEstimateEncoder

def mee_encode(train_df, test_df, column):
    """Add an M-estimate encoding of ``column`` to both frames, in place.

    A ``MEstimateEncoder`` is fitted on ``train_df[column]`` against
    ``train_df["target"]``; the transformed values are then stored in a new
    ``"<column>_mee"`` column of ``train_df`` and of ``test_df``.

    Returns the name of the new column.
    """
    encoded_name = "{}_mee".format(column)
    encoder = MEstimateEncoder()
    encoder.fit(train_df[column], train_df["target"])

    # Same fitted encoder for both frames: train first, then test
    for frame in (train_df, test_df):
        frame[encoded_name] = encoder.transform(frame[column])
    return encoded_name

# Encode every categorical column and collect the names of the new features
mee_features = [mee_encode(df_train, df_test, col) for col in cat_cols]

In [None]:
# Baseline frames: the raw categorical columns are removed from both sets
df_train_baseline, df_test_baseline = (
    frame.drop(columns=cat_cols) for frame in (df_train, df_test)
)

# Features exclude the target and the row identifier
X_train_baseline = df_train_baseline.drop(['target', 'id'], axis=1)
y_train_baseline = df_train['target']

In [None]:
# Train our model with some basic parameters
import xgboost as xgb

dtrain = xgb.DMatrix(X_train_baseline, label=y_train_baseline)
param = {
    'max_depth': 2,
    'eta': 1,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
}

num_round = 1000
# Only the training set is monitored, so the reported RMSE is a training error
evallist = [(dtrain, 'train')]
# Log the metric every 100 rounds instead of once per boosting round, which
# would fill the cell output with 1000 lines. `evals` is passed by keyword.
bst = xgb.train(param, dtrain, num_round, evals=evallist, verbose_eval=100)

In [None]:
# Predict on the test set, using every baseline column except the identifier
df_test_x = df_test_baseline.drop(['id'], axis=1)
dtest = xgb.DMatrix(data=df_test_x)
predictions = bst.predict(dtest)

In [None]:
# Build the submission frame (test ids + predicted target) and write it out
my_submission = df_test[['id']].assign(target=predictions)
my_submission.to_csv('/kaggle/working/submission.csv', index=False)