In [1]:
from time import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_confusion_matrix, f1_score

%matplotlib inline


In [2]:
def SplitLabels(data, label):
    '''
    Separates a dataframe into its input columns and its label column.
    
    @param data The pandas dataframe to separate.
    @param label String name of the column holding the output label.
    
    @return X Every column of data except the label column.
    @return y The label column.
    '''
    
    targets = data[label]
    inputs = data.drop(columns=label)
    return inputs, targets

def DropFeatures(features, data):
    '''
    Returns a copy of data without the given feature columns.
    
    @param features List of column names to remove.
    @param data Dataframe the columns are removed from (left unmodified).
    
    @return dropped_data The dataframe without the listed columns.
    '''
    
    return data.drop(columns=features)
    
def Score(score_func, y_true, y_pred, **args):
    '''
    Evaluates predictions with the supplied scoring function.
    
    @param score_func Callable invoked as score_func(y_true, y_pred, **args).
    @param y_true True labels.
    @param y_pred Predicted labels.
    @param args Extra keyword arguments forwarded unchanged to score_func.
    
    @return calc_score Whatever score_func returns.
    '''
    
    return score_func(y_true, y_pred, **args)

In [3]:
def PlotConfusionMatrix(classifier, X, y, normalize='true', axes=None):
    '''
    Plots the confusion matrix of X based on predictions
    from classifier.
    
    @param classifier Fitted classifier for doing predictions.
    @param X Input data.
    @param y Output labels.
    @param normalize Specifies the mode of normalization.
        It takes values 'true', 'pred' and 'all'.
    @param axes Matplotlib axes to draw on; forwarded as the `ax` argument
        of plot_confusion_matrix.
    
    @return display The object returned by plot_confusion_matrix.
    '''
    # display_labels is intentionally not passed. A label list in
    # first-appearance order (e.g. from Series.unique()) does not necessarily
    # match the order of the matrix rows/columns, which would put the wrong
    # class name on each tick. Letting plot_confusion_matrix derive the labels
    # itself keeps ticks and cells aligned and removes the dependency on a
    # module-level global.
    display = plot_confusion_matrix(classifier, X, y, normalize=normalize,
                                    cmap=plt.cm.Blues, ax=axes)
    return display

In [4]:
def split(X, y, test_size=0.2):
    '''
    Splits the dataset into a single stratified train/test partition.
    
    @param X Inputs; a pandas DataFrame or any array-like with one row per sample.
    @param y Labels; a pandas Series or array-like, used for stratification.
    @param test_size Size of the held-out part, forwarded to StratifiedShuffleSplit.
    
    @return (X_train, y_train, X_test, y_test) as numpy arrays.
    '''
    # np.asarray accepts pandas objects as well as plain arrays, so the
    # function also works on inputs that are already ndarrays (for example
    # the output of a scaler's transform), where .to_numpy() would raise.
    X = np.asarray(X)
    y = np.asarray(y)

    # Only one partition is needed; n_splits=1 avoids generating the default
    # ten shuffles and discarding all but one of them.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=0)
    train_index, test_index = next(sss.split(X, y))

    return (X[train_index], y[train_index], X[test_index], y[test_index])

In [5]:
def TrainingPipeline(model, X_train, y_train, X_val, y_val):
    '''
    Fits the model, plots the train and validation confusion matrices
    and reports the weighted F1 score on both sets.
    
    @param model Estimator exposing fit and predict; it is fitted in place.
    @param X_train Training inputs.
    @param y_train Training labels.
    @param X_val Validation inputs.
    @param y_val Validation labels.
    
    @return history Dictionary with the keys "model", "y_train_hat",
        "y_val_hat", "train_score" and "val_score".
    '''
    
    # A dictionary to store the history of the trained model.
    history = {}
    
    # Training and Predicting.
    model.fit(X_train, y_train)
    y_train_hat = model.predict(X_train)
    y_val_hat = model.predict(X_val)
    history["model"] = model
    history["y_train_hat"] = y_train_hat
    history["y_val_hat"] = y_val_hat
    
    # Plotting confusion matrices side by side: train on the left,
    # validation on the right.
    fig, axes = plt.subplots(1, 2, figsize=(13,5))
    PlotConfusionMatrix(model, X_train, y_train, axes=axes[0])
    PlotConfusionMatrix(model, X_val, y_val, axes=axes[1])
    axes[0].set_title("Confusion matrix of Train Data")
    # The right-hand plot shows the validation set passed in as X_val/y_val,
    # so it is labelled consistently with the printed "Validation Score".
    axes[1].set_title("Confusion matrix of Validation Data")
    
    # Calculating the scores (weighted F1, rounded to 4 decimals).
    train_score = Score(f1_score, y_train, y_train_hat, average="weighted").round(4)
    val_score = Score(f1_score, y_val, y_val_hat, average="weighted").round(4)
    history["train_score"] = train_score
    history["val_score"] = val_score
    
    print("Training Score is", train_score)
    print("Validation Score is", val_score)
    return history

In [6]:
# Load the training set and the test set from the local data/ folder.
train, test = (
    pd.read_csv("data/train_age_dataset.csv"),
    pd.read_csv("data/test_age_dataset.csv"),
)

In [7]:
# Names of classes, sorted so that their order matches the sorted class order
# scikit-learn uses for confusion-matrix rows/columns. unique() alone returns
# them in order of first appearance, which would mislabel the ticks when this
# list is used as display labels.
CLASS_NAMES = sorted(train["age_group"].unique())

In [8]:
# Separate the "age_group" target column from the remaining input columns.
X_train, y_train = SplitLabels(data=train, label="age_group")

In [9]:
# Fit the min-max scaler on the training inputs and rescale them in one call;
# the fitted scaler is kept so the same transform can be reused later.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)

In [10]:
# GradientBoostingClassifier is imported in the import cell at the top of the
# notebook so that all dependencies are declared in one place.
# NOTE(review): 100 boosting stages with max_depth=12 trees is a costly
# configuration; the recorded run of this cell ended in a KeyboardInterrupt,
# so later cells need this fit to complete before `model` can predict.
model = GradientBoostingClassifier(n_estimators=100, max_depth=12, random_state=0)
model.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Making predictions.
# The model was fitted on MinMax-scaled inputs, so the test features must go
# through the same fitted scaler first; predicting on the raw frame would feed
# the model values on a different scale than it was trained on.
# Assumes `test` holds exactly the feature columns the scaler was fitted on.
test_scaled = scaler.transform(test)
test_pred = model.predict(test_scaled)
test_pred = pd.Series(test_pred, name="prediction")
test_pred.to_csv("final_model.csv", index=False)