# Importing Libraries and Loading datasets

In [None]:
import gc
import numpy as np
import pandas as pd

# Plot
import seaborn as sns
import matplotlib.pyplot as plt

# Encoding
from sklearn.preprocessing import LabelEncoder

# Scaling
from sklearn.preprocessing import RobustScaler

# Neural Network
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks

# Cross-Validation
from sklearn.model_selection import StratifiedKFold

# Scoring
from sklearn.metrics import accuracy_score

In [None]:
# Load the train and test CSV files (in that order).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-dec-2021/train.csv",
        "../input/tabular-playground-series-dec-2021/test.csv",
    )
)

# Explore Data

In [None]:
# Preview the first five rows of the training data.
train.head(n=5)

In [None]:
# Summary statistics with the quartiles spelled out explicitly.
train.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# List every column name of the training frame.
print("Columns: \n{0}".format(train.columns.tolist()))

# Pseudo Labeling

Credits to [remekkinas](https://www.kaggle.com/remekkinas) for his notebook [TPS-12 NN (TPU) + Pseudolabeling](https://www.kaggle.com/remekkinas/tps-12-nn-tpu-pseudolabeling-0-95690) and his dataset [TPS-12 - Pseudolabels](https://www.kaggle.com/remekkinas/tps12-pseudolabels).

In [None]:
pseudo = pd.read_csv("../input/tps12-pseudolabels/tps12-pseudolabels_v2.csv")
# ignore_index=True gives the combined frame a fresh, unique 0..n-1 index.
# Without it the pseudo-label rows keep their own labels, which collide with
# the labels of the original training rows and make any later label-based
# operation (e.g. `drop` by index) hit unintended rows.
train = pd.concat([train, pseudo], axis=0, ignore_index=True)
# Show the end of the frame, where the pseudo-labelled rows were appended.
train.tail()

In [None]:
# The pseudo-label rows have been concatenated into `train`; release the
# standalone frame and trigger a garbage-collection pass.
del pseudo
gc.collect()

# Basic Data Check

In [None]:
# Report (rows, columns) for both frames.
shapes = {'Train data shape:': train.shape, 'Test data shape:': test.shape}
for label, shape in shapes.items():
    print(label, shape)

## Missing values

In [None]:
# Count NaNs per column (a Series indexed by column name) and print only the
# columns that actually contain missing values. An empty Series in the output
# means no column has missing data.
missing_values_train = train.isna().sum()
print('Missing values in train data: {0}'.format(missing_values_train[missing_values_train > 0]))

missing_values_test = test.isna().sum()
print('Missing values in test data: {0}'.format(missing_values_test[missing_values_test > 0]))

## Duplicates

In [None]:
# Number of fully duplicated rows in each frame.
duplicates_train, duplicates_test = (
    frame.duplicated().sum() for frame in (train, test)
)
print('Duplicates in train data: {0}'.format(duplicates_train))
print('Duplicates in test data: {0}'.format(duplicates_test))

In [None]:
# The data-check results are no longer needed; release them in one statement.
del missing_values_train, missing_values_test, duplicates_train, duplicates_test
gc.collect()

# Features

## Categorical Features

In [None]:
# Columns from position 11 up to, but not including, the last column.
categorical_features = train.columns[11:-1]
print("Categorical Columns: \n{0}".format(categorical_features.tolist()))

## Numerical Features

In [None]:
# Columns at positions 1 through 10.
numerical_features = train.columns[1:11]
print("Numerical Columns: \n{0}".format(numerical_features.tolist()))
train.loc[:, numerical_features].describe()

## Target Distribution

In [None]:
# Bar chart of how many rows carry each Cover_Type value, drawn on an
# explicitly created axes.
ax = plt.figure(figsize=(10, 6)).gca()
ax.set_title('Target distribution')
ax = sns.countplot(x='Cover_Type', data=train, ax=ax)

## Dropping rows and columns

In [None]:
# Index labels of the rows whose target is class 5.
is_cover_type_5 = (train['Cover_Type'] == 5).to_numpy()
cType5 = train.index[is_cover_type_5]
print("Number of rows with Cover_Type = 5: {0}".format(cType5.size))

In [None]:
# Distinct values found in the two soil-type columns, first in train ...
print("Unique values in Soil_Type7 column train data: {0}".format(pd.unique(train['Soil_Type7'])))
print("Unique values in Soil_Type15 column train data: {0}".format(pd.unique(train['Soil_Type15'])))

# ... and then in test.
print("Unique values in Soil_Type7 column test data: {0}".format(pd.unique(test['Soil_Type7'])))
print("Unique values in Soil_Type15 column test data: {0}".format(pd.unique(test['Soil_Type15'])))

In [None]:
# Dropping the rows with Cover_Type = 5:
# they cause problems during kfold (least populated class).
# Also, it appears there is no label 5 in test data,
# check out https://www.kaggle.com/baekseungyun/tps-dec-there-is-no-label-5-in-test-data
#
# The rows are removed with a boolean mask instead of dropping by index label:
# a label-based drop removes *every* row carrying a matching label, which
# deletes unrelated rows whenever the index contains duplicate labels.
#
# Columns Soil_Type7 and Soil_Type15 are dropped as well, they are zero.
train = train.loc[train['Cover_Type'] != 5].drop(['Soil_Type7', 'Soil_Type15'], axis=1)
test = test.drop(['Soil_Type7', 'Soil_Type15'], axis=1)

In [None]:
# Release the exploration helpers in a single statement.
del categorical_features, cType5, ax
gc.collect()

# Feature Engineering

Credits to [chryzal](https://www.kaggle.com/chryzal) for his notebook [🥇Features Engineering For You 🥇](https://www.kaggle.com/chryzal/features-engineering-for-you).

## Encoding labels

In [None]:
# Learn the set of target classes, then replace each label with its integer code.
encoder = LabelEncoder().fit(train["Cover_Type"])
train["Cover_Type"] = encoder.transform(train["Cover_Type"])

## Arrange the range of `Aspect` column

Wraps the `Aspect` column's values into the range 0 to 359: negative values are shifted up by 360 and values above 359 are shifted down by 360.

In [None]:
for data in [train, test]:
    # Use a single .loc assignment rather than chained indexing
    # (data["Aspect"][mask] += ...): the chained form operates on an
    # intermediate Series, raises SettingWithCopyWarning and does not modify
    # the frame at all when pandas copy-on-write is enabled.
    data.loc[data["Aspect"] < 0, "Aspect"] += 360
    data.loc[data["Aspect"] > 359, "Aspect"] -= 360

## Arrange the range of `Hillshade` columns

Sets the `Hillshade` columns' value range to 0 to 255.

In [None]:
for data in [train, test]:
    # Clamp each hillshade column to the closed interval [0, 255].
    for column in ("Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"):
        data[column] = data[column].clip(lower=0, upper=255)

## Creating distance based features

In [None]:
# The two components of the offset to the nearest hydrology feature.
hydrology_axes = ["Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology"]
for data in [train, test]:
    # Manhattan distance: sum of the absolute components
    data["Manhhattan_Distance_To_Hydrology"] = data[hydrology_axes].abs().sum(axis=1)
    # Euclidean distance: square root of the sum of squared components
    data["Euclidean_Distance_To_Hydrology"] = data[hydrology_axes].pow(2).sum(axis=1) ** 0.5

## Creating new features

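Creating the following new features:  
* Sum of all the soil types
* Sum of all the wilderness area types
* Mean of the three `Hillshade` columns
* Amplitude (max minus min) of the three `Hillshade` columns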

In [None]:
features_Hillshade = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']
soil_features = [x for x in train.columns if x.startswith("Soil_Type")]
wilderness_features = [x for x in train.columns if x.startswith("Wilderness_Area")]

for data in [train, test]:
    # Row-wise totals use the vectorised DataFrame.sum(axis=1); it yields the
    # same values as .apply(sum, axis=1) without calling a Python function
    # once per row.

    # Thanks @mpwolke : https://www.kaggle.com/mpwolke/tooezy-where-are-you-no-camping-here
    data["Soil_Count"] = data[soil_features].sum(axis=1)

    # Thanks @yannbarthelemy : https://www.kaggle.com/yannbarthelemy/tps-december-first-simple-feature-engineering
    data["Wilderness_Area_Count"] = data[wilderness_features].sum(axis=1)
    # Mean and amplitude (max - min) of the three hillshade readings
    data["Hillshade_mean"] = data[features_Hillshade].mean(axis=1)
    data['amp_Hillshade'] = data[features_Hillshade].max(axis=1) - data[features_Hillshade].min(axis=1)

In [None]:
# Row-wise statistics are computed over the feature columns only. `Id` is a
# row identifier, not a feature; leaving it in lets it dominate the row
# `max`/`mean` and leaks the identifier into the model inputs.
cols = [column for column in test.columns if column != 'Id']
for data in [train, test]:
    # The same column set is used for train and test, so the statistics are
    # comparable (the target column is never part of `cols`).
    data['sum_na'] = data[cols].isna().sum(axis=1)
    data['mean'] = data[cols].mean(axis=1)
    data['min'] = data[cols].min(axis=1)
    data['max'] = data[cols].max(axis=1)

## Scaling features

In [None]:
# Engineered columns that are scaled together with the numerical columns.
new_features = [
    "Manhhattan_Distance_To_Hydrology",
    "Euclidean_Distance_To_Hydrology",
    "Soil_Count",
    "Wilderness_Area_Count",
    "Hillshade_mean",
    "amp_Hillshade",
    "sum_na",
    "mean",
    "min",
    "max"
]
features = np.concatenate((new_features, numerical_features))

# Fit the scaler on the training columns only, then apply the fitted
# transform to both frames.
scaler = RobustScaler().fit(train[features])
train[features] = scaler.transform(train[features])
test[features] = scaler.transform(test[features])

In [None]:
# Feature-engineering helpers are no longer needed; release them together.
del (
    wilderness_features,
    features_Hillshade,
    numerical_features,
    soil_features,
    new_features,
    features,
    scaler,
    cols,
)
gc.collect()

# Reduce memory usage

This code snippet is taken from https://www.kaggle.com/desalegngeb/december-2021-tps-eda-models  
Originally https://www.kaggle.com/c/tabular-playground-series-oct-2021/discussion/275854

In [None]:
# This code snippet is taken from https://www.kaggle.com/desalegngeb/december-2021-tps-eda-models
# Originally https://www.kaggle.com/c/tabular-playground-series-oct-2021/discussion/275854
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Integer columns are cast to the smallest of int8/int16/int32/int64 whose
    range contains the column's minimum and maximum (bounds inclusive, so a
    column spanning exactly -128..127 becomes int8). Float columns are cast to
    float32 when their minimum and maximum fit in float32 (which reduces
    precision), otherwise to float64. Columns whose dtype is not listed in
    ``numerics`` are left untouched.

    Args:
        df: DataFrame to shrink; its columns are reassigned in place.
        verbose: When True, print the final memory usage and the percentage
            saved.

    Returns:
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)  # narrowest first
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Pick the first (narrowest) integer type that can hold the range.
            # If min/max are NaN (e.g. empty column) no candidate matches and
            # the column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero starting size so the report never divides by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Shrink both frames, train first and then test.
train, test = reduce_mem_usage(train), reduce_mem_usage(test)

# Modelling

In [None]:
# Number of distinct target classes; used as the width of the output layer.
UNITS = train.Cover_Type.nunique()
# Keep the test ids aside for the submission file.
TEST_ID = test["Id"].copy()

In [None]:
# Get train data without the target and ids
X = train.drop(['Id', 'Cover_Type'], axis=1).copy()
# Get the target
y = train.Cover_Type.copy()
# Get the test data without ids
test_X = test.drop(['Id'], axis=1).copy()

In [None]:
# X, y and test_X are independent copies, so the source frames can go.
del train, test
gc.collect()

## Callbacks

In [None]:
# https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/EarlyStopping
early_stopping = callbacks.EarlyStopping(
    monitor="val_accuracy",     # Quantity to be monitored
    patience=20,                # How many epochs to wait before stopping
    restore_best_weights=True)

In [None]:
# https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/ReduceLROnPlateau
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss', 
    factor=0.5,                # Factor by which the learning rate will be reduced
    patience=5)                # Number of epochs with no improvement

In [None]:
# Callbacks passed to the `fit` call in the training loop:
# early stopping on validation accuracy and learning-rate reduction on
# validation loss.
CALLBACKS = [early_stopping, reduce_lr]

## Model

In [None]:
# Credits to https://www.kaggle.com/samuelcortinhas/tps-dec-feat-eng-pseudolab-clean-version
# Cross-validation folds, maximum epochs per fit, and mini-batch size.
N_SPLITS, EPOCHS, BATCH_SIZE = 8, 100, 250

In [None]:
# Credits to https://www.kaggle.com/chryzal/features-engineering-for-you
# Credits to https://www.kaggle.com/samuelcortinhas/tps-dec-feat-eng-pseudolab-clean-version
# Four SELU hidden layers (256, 256, 128, 64 units), each followed by batch
# normalisation, then a softmax layer with one unit per target class.
model = keras.Sequential()
for depth, width in enumerate((256, 256, 128, 64)):
    # Only the first layer declares the input width.
    first_layer_args = {"input_shape": [X.shape[1]]} if depth == 0 else {}
    model.add(layers.Dense(
        units=width,
        kernel_initializer="lecun_normal",
        activation="selu",
        **first_layer_args))
    model.add(layers.BatchNormalization())
model.add(layers.Dense(units=UNITS, activation="softmax"))

In [None]:
# The target is integer-encoded (LabelEncoder), so the sparse variant of
# categorical cross-entropy is used against the softmax output.
# Tracking accuracy makes `val_accuracy` available to the callbacks.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'])

## Training

In [None]:
scores = []
# Running sum of the per-fold class probabilities predicted for the test set.
test_predictions = np.zeros((len(test_X), UNITS))
cv = StratifiedKFold(n_splits=N_SPLITS, random_state=48, shuffle=True)
for fold, (train_idx, test_idx) in enumerate(cv.split(X, y), start=1):
    train_X, val_X = X.iloc[train_idx], X.iloc[test_idx]
    train_y, val_y = y.iloc[train_idx], y.iloc[test_idx]

    # Train every fold from scratch. Re-fitting one shared model would carry
    # weights and optimizer state (including a reduced learning rate) from
    # fold to fold, so later folds would be validated on rows the model had
    # already been trained on and the CV accuracy would be inflated.
    # clone_model copies the architecture of `model` with newly initialised
    # weights; the clone must be compiled before fitting.
    fold_model = keras.models.clone_model(model)
    fold_model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'])

    fold_model.fit(
        train_X, train_y,
        validation_data=(val_X, val_y),
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        callbacks=CALLBACKS,        # Put your callbacks in a list
        verbose=0)                  # Turn off training log

    # Out-of-fold accuracy for this split
    predictions = np.argmax(fold_model.predict(val_X), axis=1)
    score = accuracy_score(val_y, predictions)
    scores.append(score)
    print(f"Fold {fold} \t\t Accuracy: {score}")

    # Accumulate this fold's class probabilities for the test set
    test_predictions = test_predictions + fold_model.predict(test_X)

# Turn the running sum into the mean probability across folds
# (the arg-max class per row is unchanged by the division).
test_predictions = test_predictions / N_SPLITS
print('Overall Accuracy: ', np.mean(scores))

# Submission

In [None]:
# Most probable class per test row, mapped back to the original Cover_Type labels.
test_predictions = encoder.inverse_transform(np.argmax(test_predictions, axis=1))
# Two-column submission: test id and predicted cover type.
output = pd.DataFrame({'Id': TEST_ID, 'Cover_Type': test_predictions})
output.to_csv('submission.csv', index=False)