In [53]:
# installing catboost
!pip install catboost



In [54]:
# Make Google Drive visible inside the Colab VM; the data cells read from it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/mydrive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/mydrive; to attempt to forcibly remount, call drive.mount("/content/mydrive", force_remount=True).


In [55]:
import numpy as np
import pandas as pd

In [56]:
# Absolute location of the competition files under the Drive mount point
# ('/content/mydrive'), so the reads do not depend on the notebook's current
# working directory.
DATA_DIR = "/content/mydrive/MyDrive/Cascade Cup 2020/data"

train_data = pd.read_csv(f"{DATA_DIR}/train_age_dataset.csv")
test_data = pd.read_csv(f"{DATA_DIR}/test_age_dataset.csv")

In [57]:
#column_types = np.array(["r", "r", "c", "c", "n", "n", "n", "n", "r", "r", "n", "n", "n", "n", "n", "r", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "d"])
# Columns excluded from the feature matrix.
to_drop = [
    'Unnamed: 0',
    'userId',
    'num_of_hashtags_per_action',
    'emoji_count_per_action',
    'avgComments',
    'gender',
    'punctuations_per_action',
    'tier',
]

# One code per training column: "r" for a dropped column, "n" otherwise.
column_types = np.where(train_data.columns.isin(to_drop), "r", "n")
# The last training column is tagged "d" (the dependent variable).
column_types[-1] = "d"
print(column_types)

['r' 'r' 'r' 'r' 'n' 'n' 'n' 'n' 'r' 'r' 'r' 'n' 'n' 'n' 'n' 'r' 'n' 'n'
 'n' 'n' 'n' 'n' 'n' 'n' 'n' 'n' 'd']


In [58]:
# Legend for the one-letter codes used in column_types.
column_dic = dict(
    r="redundant",
    c="categorical",
    n="numeric",
    d="dependant",
)

In [59]:
# Boolean masks over the training columns: features are everything that is
# neither redundant ("r") nor the dependent variable ("d").
is_feature = (column_types != "r") & (column_types != "d")
is_target = column_types == "d"

# Positional selection; the test frame is masked with the same flags minus the
# final (target) position.
X = train_data.iloc[:, is_feature]
Y = train_data.iloc[:, is_target]
testX = test_data.iloc[:, is_feature[:-1]]
req_cols = column_types[is_feature]

# Train and test features stacked row-wise, used to compute scaling statistics.
concat = pd.concat([X, testX], axis=0)
X.shape, Y.shape, testX.shape, concat.shape, req_cols.shape

((488877, 18), (488877, 1), (54320, 18), (543197, 18), (18,))

In [60]:
# Creating a normalized version of data
def normalize(con, train, test, col_types=None):
    """Min-max scale the numeric columns of ``train`` and ``test`` into [0, 1].

    The minimum and range of each column are taken from ``con`` and applied to
    both frames as ``(x - min) / (max - min)``, so the smallest value seen in
    ``con`` maps to 0 and the largest to 1.

    Parameters
    ----------
    con : pandas.DataFrame
        Frame the per-column statistics are computed from.
    train, test : pandas.DataFrame
        Frames to scale; they must carry the same column labels as ``con``.
        Neither is modified.
    col_types : sequence of str, optional
        One code per column of ``con``; columns coded ``"n"`` are scaled and
        all others are returned unchanged. Defaults to the module-level
        ``req_cols``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Scaled copies of ``train`` and ``test``.
    """
    if col_types is None:
        col_types = req_cols

    is_num = np.asarray(col_types) == "n"
    train_fin = train.copy()
    test_fin = test.copy()
    if not is_num.any():
        return train_fin, test_fin

    num_cols = list(con.columns[is_num])
    lo = con[num_cols].min()
    span = con[num_cols].max() - lo
    # A constant column has zero range; dividing by 1 maps it to 0 instead of
    # producing NaN from 0/0.
    span = span.replace(0, 1)

    # Assigning whole columns (rather than writing through .loc) lets integer
    # columns become float without a dtype clash.
    train_fin[num_cols] = (train[num_cols] - lo) / span
    test_fin[num_cols] = (test[num_cols] - lo) / span

    return train_fin, test_fin

# Creating a standardized version of data
def standardize(con, train, test, col_types=None):
    """Z-score the numeric columns of ``train`` and ``test``.

    The mean and population standard deviation (``ddof=0``) of each column are
    taken from ``con`` and applied to both frames as ``(x - mean) / std``.

    Parameters
    ----------
    con : pandas.DataFrame
        Frame the per-column statistics are computed from.
    train, test : pandas.DataFrame
        Frames to scale; they must carry the same column labels as ``con``.
        Neither is modified.
    col_types : sequence of str, optional
        One code per column of ``con``; columns coded ``"n"`` are scaled and
        all others are returned unchanged. Defaults to the module-level
        ``req_cols``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Standardized copies of ``train`` and ``test``.
    """
    if col_types is None:
        col_types = req_cols

    is_num = np.asarray(col_types) == "n"
    train_fin = train.copy()
    test_fin = test.copy()
    if not is_num.any():
        return train_fin, test_fin

    # Statistics are computed on the numeric columns only, so a non-numeric
    # column in ``con`` cannot break the mean/std computation.
    num_cols = list(con.columns[is_num])
    mean = con[num_cols].mean()
    std = con[num_cols].std(ddof=0)
    # A zero-variance column would divide by zero; dividing by 1 leaves it at 0.
    std = std.replace(0, 1)

    # Assigning whole columns (rather than writing through .loc) lets integer
    # columns become float without a dtype clash.
    train_fin[num_cols] = (train[num_cols] - mean) / std
    test_fin[num_cols] = (test[num_cols] - mean) / std

    return train_fin, test_fin

In [61]:
# Scaled copies of the train/test features, with statistics taken from the
# stacked frame `concat`.
X_norm, testX_norm = normalize(con=concat, train=X, test=testX)
X_stand, testX_stand = standardize(con=concat, train=X, test=testX)

In [62]:
# importing the split/metric helpers from sklearn and the boosted-tree classifiers
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [63]:
# Hold out 10% of the training rows for validation, stratified on the target
# and seeded so the split is repeatable.
trainX, validX, trainY, validY = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=123,
    stratify=Y,
)
trainX.shape, validX.shape, trainY.shape, validY.shape

((439989, 18), (48888, 18), (439989, 1), (48888, 1))

In [64]:
# Evaluate XGBoost with its default hyper-parameters.
# A CatBoost GPU variant is kept below, commented out.
# model = CatBoostClassifier(task_type="GPU")
model = XGBClassifier()
model.fit(trainX, trainY.to_numpy().ravel())

# predictions on the rows the model was fitted on and on the held-out rows
trainY_hat = model.predict(trainX)
validY_hat = model.predict(validX)

# weighted F1 for each split
train_f1 = f1_score(trainY, trainY_hat.ravel(), average="weighted")
valid_f1 = f1_score(validY, validY_hat.ravel(), average="weighted")

print(train_f1, valid_f1, sep="\n")

0.724773121825623
0.7199895812980711


In [66]:
from sklearn.neighbors import KNeighborsClassifier

# Evaluate k-nearest-neighbours with its default settings.
# NOTE(review): this is fitted on the unscaled trainX; the X_norm / X_stand
# frames built earlier are not used here — confirm that is intended.
model = KNeighborsClassifier()
model.fit(trainX, trainY.to_numpy().ravel())

# predictions on the rows the model was fitted on and on the held-out rows
trainY_hat = model.predict(trainX)
validY_hat = model.predict(validX)

# weighted F1 for each split
train_f1 = f1_score(trainY, trainY_hat.ravel(), average="weighted")
valid_f1 = f1_score(validY, validY_hat.ravel(), average="weighted")

print(train_f1, valid_f1, sep="\n")

0.7334942288214344
0.616731659797872


In [67]:
from lightgbm import LGBMClassifier

# Evaluate LightGBM with its default hyper-parameters.
model = LGBMClassifier()
model.fit(trainX, trainY.to_numpy().ravel())

# predictions on the rows the model was fitted on and on the held-out rows
trainY_hat = model.predict(trainX)
validY_hat = model.predict(validX)

# weighted F1 for each split
train_f1 = f1_score(trainY, trainY_hat.ravel(), average="weighted")
valid_f1 = f1_score(validY, validY_hat.ravel(), average="weighted")

print(train_f1, valid_f1, sep="\n")

0.7521388181500145
0.7429732524730877


In [None]:
# Disabled experiment, kept as a bare string literal so none of it executes:
# a small Keras classifier (Dense 64 -> Dense 32 -> 4-way softmax) trained on
# one-hot encoded labels.
# NOTE(review): OneHotEncoder is not imported in any visible cell, and the
# encoder is re-fitted on validY rather than reusing the fit from trainY —
# address both before re-enabling.
'''
import tensorflow as tf
ohe = OneHotEncoder()
trainY_ohe = ohe.fit_transform(trainY).toarray()
validY_ohe = ohe.fit_transform(validY).toarray()

input = tf.keras.Input(shape=(trainX.shape[1],))
x = tf.keras.layers.Dense(64, activation="relu")(input)
x = tf.keras.layers.Dense(32, activation="relu")(x)
x = tf.keras.layers.Dense(4, activation="softmax")(x)
model = tf.keras.Model(inputs=input, outputs=x)
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(), optimizer=tf.keras.optimizers.Adam())
model.fit(tf.convert_to_tensor(trainX.values), tf.convert_to_tensor(trainY_ohe), batch_size=64, epochs=10, validation_split=0.1)
preds = model.predict(trainX)
preds = ohe.inverse_transform(preds)
preds
'''

In [None]:
# Disabled helper, kept as a bare string literal so it is never defined:
# get_y_hats(models, data) calls predict on every model and stacks the
# flattened predictions into one array, one row per model.
'''
def get_y_hats(models, data):
    trainY_hats = []
    for model in models:
        temp = model.predict(data)
        trainY_hats.append(np.ravel(temp))
    trainY_hats = np.array(trainY_hats)
    return trainY_hats
'''

In [None]:
# Disabled helper, kept as a bare string literal so it is never defined:
# predict_from_multiple(hats) takes a (num_models, num_samples) array and, for
# each sample, keeps the label returned first by Counter.most_common(1).
'''
from collections import Counter
def predict_from_multiple(hats): # must be of the shape (num_models, num_samples)
    fin = np.zeros(hats.shape[1])
    for idx in range(hats.shape[1]):
        max_voted = list(dict(Counter(hats[:, idx]).most_common(1)).keys())[0]
        fin[idx] = max_voted
    return fin
'''

In [69]:
# now we will generate predictions for test data and export them
# The sample file is read through the absolute Drive mount path so this cell
# does not depend on the notebook's current working directory.
sample = pd.read_csv("/content/mydrive/MyDrive/Cascade Cup 2020/data/sample_submission.csv")
# testY_hats = get_y_hats(models, testX)
# testY_hat = predict_from_multiple(testY_hats)
# NOTE(review): `model` is whichever estimator was fitted most recently in the
# kernel — confirm it is the intended one before submitting.
testY_hat = model.predict(testX)
# Fail with a clear message if the predictions do not line up with the template.
assert len(sample) == len(testY_hat), "prediction count does not match sample_submission rows"
# Every column of the template is overwritten with the predictions; assumes the
# sample file holds a single prediction column — TODO confirm.
sample[:] = testY_hat.reshape(-1, 1)
sample.to_csv("submission_7.csv", index=False)