In [None]:
# Mounting my google drive on colab
from google.colab import drive
drive.mount('/content/mydrive')

Mounted at /content/mydrive


In [73]:
import numpy as np
import pandas as pd

In [74]:
train_data = pd.read_csv("mydrive/MyDrive/Cascade Cup 2020/data/train_age_dataset.csv")
test_data = pd.read_csv("mydrive/MyDrive/Cascade Cup 2020/data/test_age_dataset.csv")

In [75]:
column_types = np.array(["r", "r", "c", "c", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "d"])

In [76]:
column_dic = {"r": "redundant", 
              "c": "categorical",
              "n": "numeric",
              "d": "dependant"}

In [77]:
# splitting data into X's and Y's and removing redundant features
X = train_data.iloc[:, [x != "r" and x != "d" for x in column_types]]
Y = train_data.iloc[:, [x == "d" for x in column_types]]
testX = test_data.iloc[:, [x != "r" and x != "d" for x in column_types[:-1]]]
req_cols = column_types[[x != "r" and x != "d" for x in column_types]]

# creating a concatenated dataset for normalizing or standardization
concat = pd.concat([X, testX], axis=0)
X.shape, Y.shape, testX.shape, concat.shape, req_cols.shape

((488877, 24), (488877, 1), (54320, 24), (543197, 24), (24,))

In [78]:
# Creating a normalized version of data
def normalize(con, train, test):
    global req_cols

    train_fin = train.copy()
    test_fin = test.copy()

    max = con.max()
    min = con.min()

    is_num = (req_cols == "n")

    train_fin.loc[:, is_num] = (max[is_num] - train_fin.loc[:, is_num]) / (max[is_num] - min[is_num])
    test_fin.loc[:, is_num] = (max[is_num] - test_fin.loc[:, is_num]) / (max[is_num] - min[is_num])

    return train_fin, test_fin

# Creating a standardized version of data
def standardize(con, train, test):
    global req_cols

    train_fin = train.copy()
    test_fin = test.copy()

    mean = con.mean()
    var = con.var(ddof=0)

    is_num = (req_cols == "n")

    train_fin.loc[:, is_num] = (train_fin.loc[:, is_num] - mean[is_num]) / (var[is_num]**0.5)
    test_fin.loc[:, is_num] = (test_fin.loc[:, is_num] - mean[is_num]) / (var[is_num]**0.5)

    return train_fin, test_fin

In [80]:
X_norm, testX_norm = normalize(concat, X, testX)
X_stand, testX_stand = standardize(concat, X, testX)

In [72]:
# importing modules from sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier

In [81]:
# first we will make models on raw / non processed data
# first splitting into train and valid
trainX, validX, trainY, validY = train_test_split(X, Y, test_size=0.1, stratify=Y)
trainX.shape, validX.shape, trainY.shape, validY.shape

((439989, 24), (48888, 24), (439989, 1), (48888, 1))

In [83]:
# fitting a RandomForestClassifier()
model = RandomForestClassifier()
model.fit(trainX, np.ravel(trainY))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [86]:
# f1 score for training data
trainY_hat = model.predict(trainX)
f1_score(trainY, trainY_hat, average="weighted")

1.0

In [87]:
# f1 score for validation data
validY_hat = model.predict(validX)
f1_score(validY, validY_hat, average="weighted")

0.7168830071135626

In [100]:
# now we will generate predictions for test data and export them
sample = pd.read_csv("mydrive/MyDrive/Cascade Cup 2020/data/sample_submission.csv")
testY_hat = model.predict(testX)
sample[:] = testY_hat.reshape(-1, 1)
sample.to_csv("submission_1.csv", index=False)