In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import utils
from time import time

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_confusion_matrix, f1_score


In [2]:
def split(X, y, test_size=0.2):
    X = X.to_numpy()
    y = y.to_numpy()
    sss = StratifiedShuffleSplit(test_size = test_size, random_state=0)
    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
    return (X_train, y_train, X_test, y_test)

In [3]:
def PlotConfusionMatrix(classifier, X, y, normalize='true', axes=None):
    '''
    Plots the confusion matrix of X based on predictions
    from classifier.
    
    @param classifier Classifier for doing predictions.
    @param X Input data.
    @param y Output labels.
    @param normalize Specifies the mode of normalization.
        It takes values 'true', 'pred' and 'all'.
    '''
    plot = plot_confusion_matrix(classifier, X, y, normalize = normalize,
                                 display_labels=CLASS_NAMES, cmap=plt.cm.Blues, ax=axes)

In [4]:
def TrainingPipeline(model, X_train, y_train, X_val, y_val):
    
    # A dictionary to store the history of the trained model.
    history = {}
    
    # Training and Predicting.
    model.fit(X_train, y_train)
    y_train_hat = model.predict(X_train)
    y_val_hat = model.predict(X_val)
    history["model"] = model
    history["y_train_hat"] = y_train_hat
    history["y_val_hat"] = y_val_hat
    
    # Plotting confusion matrices.
    fig, axes = plt.subplots(1, 2, figsize=(13,5))
    PlotConfusionMatrix(model, X_train, y_train, axes=axes[0])
    PlotConfusionMatrix(model, X_val, y_val, axes=axes[1])
    axes[0].set_title("Confustion matrix of Train Data")
    axes[1].set_title("Confustion matrix of Test Data")
    
    # Calculating the scores.
    train_score = utils.Score(f1_score, y_train, y_train_hat, average="weighted").round(4)
    val_score = utils.Score(f1_score, y_val, y_val_hat, average="weighted").round(4)
    history["train_score"] = train_score
    history["val_score"] = val_score
    
    print("Training Score is", train_score)
    print("Validation Score is", val_score)
    return history

In [5]:
train = pd.read_csv("train_age_dataset.csv")
test = pd.read_csv("test_age_dataset.csv")

In [6]:
CLASS_NAMES = list(train["age_group"].unique())

In [7]:
drop = ["Unnamed: 0" ]
train = utils.DropFeatures(drop, train)
test = utils.DropFeatures(drop, test)

In [8]:
features=train.columns
features

Index(['userId', 'tier', 'gender', 'following_rate', 'followers_avg_age',
       'following_avg_age', 'max_repetitive_punc',
       'num_of_hashtags_per_action', 'emoji_count_per_action',
       'punctuations_per_action', 'number_of_words_per_action',
       'avgCompletion', 'avgTimeSpent', 'avgDuration', 'avgComments',
       'creations', 'content_views', 'num_of_comments',
       'weekends_trails_watched_per_day', 'weekdays_trails_watched_per_day',
       'slot1_trails_watched_per_day', 'slot2_trails_watched_per_day',
       'slot3_trails_watched_per_day', 'slot4_trails_watched_per_day', 'avgt2',
       'age_group'],
      dtype='object')

In [9]:
X, y = utils.SplitLabels(train, "age_group")
# X_train, y_train, X_val, y_val = split(X, y)


In [10]:
X_train=X
y_train=y

In [11]:
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
# X_val = scaler.transform(X_val)
X_train=pd.DataFrame(X_train,columns=features[:-1])
X_train["new"]=X_train['userId']+X_train['creations']
X_train["new1"]=X_train['userId']+2*X_train['creations']
X_train["new2"]=2*X_train['userId']+X_train['creations']
X_train["total_slots"]=X_train['slot1_trails_watched_per_day']+X_train['slot2_trails_watched_per_day']+X_train['slot3_trails_watched_per_day']+X_train['slot4_trails_watched_per_day']

In [12]:
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(n_estimators=100,max_depth=12)
model.fit(X_train,y_train)



GradientBoostingClassifier(max_depth=12)

In [13]:
model.feature_importances_

array([2.74577134e-01, 1.81134025e-03, 2.28262835e-03, 1.01873551e-02,
       6.06503386e-03, 7.48681947e-03, 5.63363242e-03, 3.00225573e-04,
       1.02684342e-03, 3.91143396e-03, 1.83745584e-02, 1.55058110e-02,
       3.07009427e-03, 1.60526203e-02, 1.19052132e-03, 4.78562279e-01,
       1.79562450e-02, 3.40906924e-03, 6.24070303e-03, 9.54974391e-03,
       6.41821491e-03, 8.63438490e-03, 8.10256672e-03, 1.05789031e-02,
       1.67872475e-02, 1.17992455e-02, 1.24318872e-02, 2.54325713e-02,
       8.81183519e-03, 7.80905176e-03])

In [14]:
X_test=scaler.transform(test)
X_test=pd.DataFrame(X_test,columns=features[:-1])
X_test["new"]=X_test['userId']+X_test['creations']
X_test["new1"]=X_test['userId']+2*X_test['creations']
X_test["new2"]=2*X_test['userId']+X_test['creations']
X_test["total_slots"]=X_test['slot1_trails_watched_per_day']+X_test['slot2_trails_watched_per_day']+X_test['slot3_trails_watched_per_day']+X_test['slot4_trails_watched_per_day']
test_pred=model.predict(X_test)
test_pred = pd.Series(test_pred, name="prediction")
test_pred.to_csv("baseline.csv", index=False)

KeyError: "'weekdays_trails_watched_per_day'"