# Dataset


In [3]:
# Imports


from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import joblib

import math
import seaborn as sns
import warnings                   # To ignore the warnings
import pandas as pd
import numpy as np          # For mathematical calculations
import matplotlib.pyplot as plt  # For plotting graphs
from datetime import datetime    # To access datetime
from pandas import Series        # To work on series

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

from xgboost import XGBClassifier

# import RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV

%matplotlib inline
warnings.filterwarnings("ignore")


In [4]:
# Data extraction

train_layer_10 = pd.read_csv("../data/layer10/train_10.csv")
test_layer_10 = pd.read_csv("../data/layer10/test_10.csv")
valid_layer_10 = pd.read_csv("../data/layer10/valid_10.csv")


In [5]:
# Extracting x and y

x_train = train_layer_10.copy().drop(
    columns=["label_1", "label_2", "label_3", "label_4"]
)
x_valid = valid_layer_10.copy().drop(
    columns=["label_1", "label_2", "label_3", "label_4"]
)
x_feature_names = ["feature_" + str(i) for i in range(1, 769)]

id_train = train_layer_10["label_1"].to_frame()
age_train = train_layer_10["label_2"].to_frame()  # id has NaN
gender_train = train_layer_10["label_3"].to_frame()
accent_train = train_layer_10["label_4"].to_frame()  # Accent has bias to 6

id_valid = valid_layer_10["label_1"].to_frame()
age_valid = valid_layer_10["label_2"].to_frame()
gender_valid = valid_layer_10["label_3"].to_frame()
accent_valid = valid_layer_10["label_4"].to_frame()


In [6]:
# Scaling using RobustScaler
scaler_robust = RobustScaler()
scaler_robust.fit(x_train)

x_train_scaled = pd.DataFrame(
    scaler_robust.transform(x_train), columns=x_feature_names)
x_valid_scaled = pd.DataFrame(
    scaler_robust.transform(x_valid), columns=x_feature_names)

x_test = test_layer_10.copy().drop(
    columns=["ID"]
)
x_test_scaled = pd.DataFrame(
    scaler_robust.transform(x_test), columns=x_feature_names)


### Support Functions


In [7]:
def validate_model(model, x_valid, y_valid):
    y_pred = model.predict(x_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    print("Accuracy: ", accuracy)
    cm = confusion_matrix(y_valid, y_pred)
    print("Confusion matrix: ")
    print(cm)
    print("Precision, recall, f1-score: ")
    prfs_ = precision_recall_fscore_support(
        y_valid, y_pred, average="weighted")
    return prfs_, cm


In [8]:
def do_pca(train, valid, variance: float = None, n_components: int = None):
    pca_obj = PCA(n_components=0.95, svd_solver="full")
    if variance:
        pca_obj = PCA(n_components=variance, svd_solver="full")
    elif n_components:
        pca_obj = PCA(n_components=n_components)
    pca_obj.fit(train)
    n_components = pca_obj.components_.shape[0]

    x_train = pd.DataFrame(
        pca_obj.transform(train),
        columns=["feature_pca_" + str(i) for i in range(1, n_components + 1)],
    )
    x_valid = pd.DataFrame(
        pca_obj.transform(valid),
        columns=["feature_pca_" + str(i) for i in range(1, n_components + 1)],
    )

    return x_train, x_valid, n_components, pca_obj


In [9]:
def fit_and_score(classifier_models, x_train, x_valid, y_train, y_valid):
    """
    Fits and evaluates given classifier models.
    classifier_models : a dict of different Scikit-Learn classifier models
    x_train : training data (no labels)
    x_valid : validation data (no labels)
    y_train : training labels
    y_valid : validation labels
    """
    # Set random seed
    np.random.seed(42)
    # Make a dictionary to keep model scores
    classifier_model_scores = {}
    # Loop through classifier_models
    for name, model in classifier_models.items():
        print(f"Fitting {name}...")
        # Fit the model to the data
        model.fit(x_train, y_train)
        # Evaluate the model and append its score to classifier_model_scores
        classifier_model_scores[name] = model.score(x_valid, y_valid)
        print("Done fitting and scoring model.")
    return classifier_model_scores


In [10]:
def tune_hyperparameters(classifier_models, grid, x_train, x_valid, y_train, y_valid):
    """
    Fits and evaluates given classifier models.
    classifier_models : a dict of different Scikit-Learn classifier models
    grid : a dict of hyperparameters to tune
    x_train : training data (no labels)
    x_valid : validation data (no labels)
    y_train : training labels
    y_valid : validation labels
    """
    # Set random seed
    np.random.seed(42)
    # Make a dictionary to keep model scores
    classifier_model_scores = {}
    trained_searches = {}
    # Loop through classifier_models
    for name, model in classifier_models.items():
        # Setup random hyperparameter search for model
        rs_model = RandomizedSearchCV(
            estimator=model,
            param_distributions=grid,
            n_iter=20,
            cv=5,
            verbose=3,
            random_state=42,
            n_jobs=-1,
        )
        # Fit random hyperparameter search model
        fileName = name + "_rs_model" + ".pkl"
        rs_model.fit(x_train, y_train)
        # Export rs_model using joblib
        joblib.dump(rs_model, fileName)
        trained_searches[name] = rs_model

        # Evaluate the model and append its score to classifier_model_scores
        classifier_model_scores[name] = rs_model.score(x_valid, y_valid)
        print("classifier model scores", classifier_model_scores)
    return trained_searches


In [11]:
def write_test_pred_to_csv(model, x_test, csv_name="test_pred.csv"):
    y_pred = model.predict(x_test)
    y_pred = pd.DataFrame(y_pred, columns=["label"])
    y_pred.to_csv(csv_name, index=False)


In [12]:
def find_min_pca_components(
    x_train, x_valid, y_train, y_valid, threshold: float = 0.90
):
    # Find min number of components for 90% accuracy
    score = 1

    while score > threshold:
        x_train, x_valid, id_n_components, id_pca_cat = do_pca(
            x_train, x_valid, variance=0.95
        )

        print("Number of components: ", id_n_components)

        id_final = SVC()
        id_final.fit(x_train, y_train)

        precision_recall_fscore_svc_final, cm_final = validate_model(
            id_final, x_valid, y_valid
        )

        print("Precision, Recall and F1 Score:",
              precision_recall_fscore_svc_final)

        y_pred = id_final.predict(x_valid)
        score = accuracy_score(y_valid, y_pred)
        print("Accuracy: ", score)

    print("Final number of components: ", id_n_components)
    print("Final Precision, Recall and F1 Score:",
          precision_recall_fscore_svc_final)
    return id_n_components, id_pca_cat


# Label 1


use {'kernel': 'rbf', 'gamma': 0.001, 'degree': 5, 'C': 10}

In [11]:
# Initial Data
id_data_train_cat = pd.concat([x_train_scaled, id_train], axis=1)
id_data_valid_cat = pd.concat([x_valid_scaled, id_valid], axis=1)

# Remove rows with null values
id_data_cleaned_train_cat = id_data_train_cat.dropna()
id_data_cleaned_valid_cat = id_data_valid_cat.dropna()

# Separate X and y again
id_x_train_cat = id_data_cleaned_train_cat.drop(columns=["label_1"])
id_y_train_cat = id_data_cleaned_train_cat["label_1"]
id_x_valid_cat = id_data_cleaned_valid_cat.drop(columns=["label_1"])
id_y_valid_cat = id_data_cleaned_valid_cat["label_1"].to_frame()

id_x_test_cat = x_test_scaled


### Initial Model SVC


In [12]:
id_initial = SVC(kernel="rbf", C=20, gamma="scale")
id_initial.fit(id_x_train_cat, id_y_train_cat)


In [13]:
precision_recall_fscore_svc_initial, cm_initial = validate_model(
    id_initial, id_x_valid_cat, id_y_valid_cat
)

print("Precision, Recall and F1 Score:", precision_recall_fscore_svc_initial)
# Accuracy:  0.9573333333333334 ->


Accuracy:  0.968
Confusion matrix: 
[[12  0  0 ...  0  0  0]
 [ 0  9  0 ...  0  0  0]
 [ 0  0 12 ...  0  0  0]
 ...
 [ 0  0  0 ... 20  0  0]
 [ 0  0  0 ...  0 10  0]
 [ 0  0  0 ...  0  0 10]]
Precision, recall, f1-score: 
Precision, Recall and F1 Score: (0.9710426720052107, 0.968, 0.9681729568549567, None)


In [14]:
id_initial_test_preds = id_initial.predict(id_x_test_cat)
pd.DataFrame(id_initial_test_preds, columns=["label_1"]).to_csv(
    "id_initial_test_preds.csv")


In [1]:
from tuner10 import get_pca_model
%load_ext autoreload
%autoreload 2


Function Initialised


In [13]:
model_params = {
    "C": 30,
    "gamma": "scale",
    "kernel": "rbf",
}

id_pca_model, id_pca_score, id_pca_obj = get_pca_model(
    "id", model_params, 0.99)


Final Accuracy:  0.968


### Data Processing - PCA


In [17]:
id_x_train_pca, id_x_valid_pca, id_n_components, id_pca_cat = do_pca(
    id_x_train_cat, id_x_valid_cat, n_components=70
)
id_x_test_pca = pd.DataFrame(
    id_pca_cat.transform(id_x_test_cat),
    columns=["feature_pca_" + str(i) for i in range(1, id_n_components + 1)],
)


In [78]:
print("Number of components: ", id_n_components)


Number of components:  9


### Final Model SVC


In [20]:
id_final = SVC(kernel="rbf", C=10, gamma=0.001, degree=5)
id_final.fit(id_x_train_pca, id_y_train_cat)


In [21]:
precision_recall_fscore_svc_final, cm_final = validate_model(
    id_final, id_x_valid_pca, id_y_valid_cat
)

print("Precision, Recall and F1 Score:", precision_recall_fscore_svc_final)


Accuracy:  0.9266666666666666
Confusion matrix: 
[[11  0  0 ...  0  0  0]
 [ 0  8  0 ...  0  0  0]
 [ 0  0 12 ...  0  0  0]
 ...
 [ 0  0  0 ... 20  0  0]
 [ 0  0  0 ...  0 10  0]
 [ 0  0  0 ...  0  0 10]]
Precision, recall, f1-score: 
Precision, Recall and F1 Score: (0.9329381666264019, 0.9266666666666666, 0.9273365017324677, None)


In [None]:
# Find min number of components for 90% accuracy
score = 1

id_x_train_pca = id_x_train_cat
id_x_valid_pca = id_x_valid_cat

while score > 0.90:
    id_x_train_pca, id_x_valid_pca, id_n_components, id_pca_cat = do_pca(
        id_x_train_pca, id_x_valid_pca, variance=0.95
    )

    print("Number of components: ", id_n_components)

    id_final = SVC()
    id_final.fit(id_x_train_pca, id_y_train_cat)

    precision_recall_fscore_svc_final, cm_final = validate_model(
        id_final, id_x_valid_pca, id_y_valid_cat
    )

    print("Precision, Recall and F1 Score:", precision_recall_fscore_svc_final)

    y_pred = id_final.predict(id_x_valid_pca)
    score = accuracy_score(id_y_valid_cat, y_pred)
    print("Accuracy: ", score)

print("Final number of components: ", id_n_components)
print("Final Precision, Recall and F1 Score:",
      precision_recall_fscore_svc_final)


In [49]:
from tuner import tune_hyperparameters_svc
%load_ext autoreload
%autoreload 2


Function Initialised
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [50]:
# FUNCTION TUNER
id_grid = {
    "C": [18, 20, 30, 40, 50],
    "gamma": ["scale"],
    "kernel": ["rbf"],
}
final_accuracy, final_model, pca_obj = tune_hyperparameters_svc("id", id_grid)


Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [55]:
final_model.best_params_


{'kernel': 'rbf', 'gamma': 'scale', 'C': 30}

In [56]:
final_final_model = SVC(**final_model.best_params_)
final_final_model.fit(id_x_train_cat, id_y_train_cat)

precision_recall_fscore_svc_final_final, cm_final_final = validate_model(
    final_final_model, id_x_valid_cat, id_y_valid_cat
)


Accuracy:  0.9626666666666667
Confusion matrix: 
[[12  0  0 ...  0  0  0]
 [ 0  9  0 ...  0  0  0]
 [ 0  0 12 ...  0  0  0]
 ...
 [ 0  0  0 ... 20  0  0]
 [ 0  0  0 ...  0 10  0]
 [ 0  0  0 ...  0  0 10]]
Precision, recall, f1-score: 


In [51]:
precision_recall_fscore_svc_final, cm_final = validate_model(
    final_model.best_estimator_, id_x_valid_pca, id_y_valid_cat
)

print("Precision, Recall and F1 Score:", precision_recall_fscore_svc_final)


Accuracy:  0.912
Confusion matrix: 
[[11  0  0 ...  0  0  0]
 [ 0  9  0 ...  0  0  0]
 [ 0  0 11 ...  0  0  0]
 ...
 [ 0  0  0 ... 20  0  0]
 [ 0  0  0 ...  0 10  0]
 [ 0  0  0 ...  0  0 10]]
Precision, recall, f1-score: 
Precision, Recall and F1 Score: (0.9170892152853453, 0.912, 0.9122559054597893, None)


In [37]:
# hyperparameter tuning for svc
id_final_models = {
    "id_SVC": SVC(),
}

id_grid = {
    "C": [18, 20, 30, 40, 50],
    "gamma": ["scale"],
    "kernel": ["rbf"],
}


id_rs_model = tune_hyperparameters(
    id_final_models,
    id_grid,
    id_x_train_pca,
    id_x_valid_pca,
    id_y_train_cat,
    id_y_valid_cat,
)

id_precision_recall_fscore_svc_final, id_cm_final = validate_model(
    id_rs_model["id_SVC"].best_estimator_, id_x_valid_pca, id_y_valid_cat)

print(id_rs_model["id_SVC"].best_params_)

id_label_1_final_preds = id_rs_model["id_SVC"].best_estimator_.predict(
    id_x_test_pca_85)
pd.DataFrame(id_label_1_final_preds, columns=["label_1"]).to_csv(
    "id_label_1_final_preds.csv")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
classifier model scores {'id_SVC': 0.932}
Accuracy:  0.932
Confusion matrix: 
[[12  0  0 ...  0  0  0]
 [ 0  8  0 ...  0  0  0]
 [ 0  0 11 ...  0  0  0]
 ...
 [ 0  0  0 ... 20  0  0]
 [ 0  0  0 ...  0 10  0]
 [ 0  0  0 ...  0  0 10]]
Precision, recall, f1-score: 
{'kernel': 'rbf', 'gamma': 'scale', 'C': 20}


NameError: name 'id_x_test_pca_85' is not defined

In [None]:
id_initial_test_preds = id_rs_model["id_SVC"].predict(id_x_test_cat)
pd.DataFrame(id_initial_test_preds, columns=["label_1"]).to_csv(
    "id_initial_test_preds.csv")


# Label 2


{'kernel': 'rbf', 'gamma': 0.001, 'degree': 5, 'C': 10}

In [14]:
# Handling NaN values in the age
# Combine X and y into a single DataFrame

age_data_train_cat = pd.concat([x_train_scaled, age_train], axis=1)
age_data_valid_cat = pd.concat([x_valid_scaled, age_valid], axis=1)

# Remove rows with null values
age_data_cleaned_train_cat = age_data_train_cat.dropna()
age_data_cleaned_valid_cat = age_data_valid_cat.dropna()

# Separate X and y again
age_x_train_cat = age_data_cleaned_train_cat.drop(columns=["label_2"])
age_y_train_cat = age_data_cleaned_train_cat["label_2"]
age_x_valid_cat = age_data_cleaned_valid_cat.drop(columns=["label_2"])
age_y_valid_cat = age_data_cleaned_valid_cat["label_2"].to_frame()

age_x_test_cat = x_test_scaled


In [16]:
age_x_test_cat = x_test_scaled

age_initial = SVC(kernel="rbf", C=20, gamma="scale")

age_initial.fit(age_x_train_cat, age_y_train_cat)

age_initial_test_preds = age_initial.predict(age_x_test_cat)
pd.DataFrame(age_initial_test_preds, columns=["label_2"]).to_csv(
    "age_initial_test_preds.csv")


In [95]:
n_pca, pca_cat = find_min_pca_components(
    age_x_train_cat, age_x_valid_cat, age_y_train_cat, age_y_valid_cat
)


Number of components:  307
Accuracy:  0.9089673913043478
Confusion matrix: 
[[ 34   0   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0  66   0   0   3   0   1   0   1   0   0   0   0   0   0   0   0]
 [  1   0  44   0   1   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   2   0  65  10   0   1   0   0   1   0   0   0   0   0   0   0]
 [  0   1   0   2 110   1   1   0   0   0   0   0   0   0   0   0   0]
 [  0   1   0   1   7  69   0   0   1   2   0   0   0   0   0   0   0]
 [  1   0   0   0   1   1  43   0   0   0   0   0   0   0   0   0   0]
 [  1   0   0   0   2   0   0  41   0   1   0   0   0   0   0   0   0]
 [  0   0   0   0   3   0   0   0  45   0   0   0   0   0   0   0   0]
 [  1   2   0   0   1   0   1   0   0  60   0   0   0   0   0   0   0]
 [  0   0   0   0   1   0   0   0   0   0  10   0   0   0   0   0   0]
 [  0   0   1   1   2   0   0   0   0   0   0  26   0   0   0   0   0]
 [  0   0   1   0   0   0   0   0   0   0   0   0  10   0   0   0   0]
 

In [96]:
age_x_train_pca, age_x_valid_pca, age_n_components, age_pca_cat = do_pca(
    age_x_train_cat, age_x_valid_cat, n_components=180
)


In [97]:
age_final = SVC()
age_final.fit(age_x_train_pca, age_y_train_cat)


In [98]:
age_precision_recall_fscore_svc_final, age_cm_final = validate_model(
    age_final, age_x_valid_pca, age_y_valid_cat
)

print("Precision, Recall and F1 Score:", age_precision_recall_fscore_svc_final)


Accuracy:  0.9021739130434783
Confusion matrix: 
[[ 34   0   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0]
 [  1  65   0   0   3   0   1   0   1   0   0   0   0   0   0   0   0]
 [  1   1  42   0   2   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   2   0  65   9   0   1   0   0   2   0   0   0   0   0   0   0]
 [  0   1   0   2 111   0   0   0   0   1   0   0   0   0   0   0   0]
 [  0   0   0   1   8  69   0   1   1   1   0   0   0   0   0   0   0]
 [  1   0   0   0   1   1  43   0   0   0   0   0   0   0   0   0   0]
 [  1   0   0   0   1   0   0  41   1   1   0   0   0   0   0   0   0]
 [  0   0   0   0   2   0   0   0  46   0   0   0   0   0   0   0   0]
 [  1   2   0   0   3   0   1   0   0  58   0   0   0   0   0   0   0]
 [  0   0   0   0   2   0   0   0   0   0   9   0   0   0   0   0   0]
 [  0   0   1   1   3   0   0   0   0   0   0  25   0   0   0   0   0]
 [  0   0   1   0   0   0   0   0   0   0   0   0  10   0   0   0   0]
 [  0   0   0   0   0   0   

In [57]:
id_grid = {
    "C": [30],
    "gamma": ["scale"],
    "kernel": ["rbf", "poly"],
    "degree": [5, 6, 7],
}
final_accuracy, final_model, pca_obj = tune_hyperparameters_svc("age", id_grid)


Function Initialised
Fitting 5 folds for each of 6 candidates, totalling 30 fits


ValueError: y should be a 1d array, got an array of shape (736, 768) instead.

In [None]:
# hyperparameter tuning for svc
age_final_models = {
    "age_SVC": SVC(),
}

age_grid = {
    "C": [0.1, 1, 10],
    "gamma": [1, 0.1, 0.01],
    "kernel": ["rbf", "poly", "sigmoid"],
    "degree": [1, 2, 3, 4, 5],
}

age_grid = {
    "C": [0.1, 1],
    "gamma": [1, 0.01],
    "kernel": ["rbf"],
    "degree": [1, 2],
}


age_rs_model = tune_hyperparameters(
    age_final_models,
    age_grid,
    age_x_train_pca,
    age_x_valid_pca,
    age_y_train_cat,
    age_y_valid_cat,
)


# Label 3 - Gender


In [17]:
# Handling NaN values in the age
# Combine X and y into a single DataFrame

gender_data_train_cat = pd.concat([x_train_scaled, gender_train], axis=1)
gender_data_valid_cat = pd.concat([x_valid_scaled, gender_valid], axis=1)

# Remove rows with null values
gender_data_cleaned_train_cat = gender_data_train_cat.dropna()
gender_data_cleaned_valid_cat = gender_data_valid_cat.dropna()

# Separate X and y again
gender_x_train_cat = gender_data_cleaned_train_cat.drop(columns=["label_3"])
gender_y_train_cat = gender_data_cleaned_train_cat["label_3"]
gender_x_valid_cat = gender_data_cleaned_valid_cat.drop(columns=["label_3"])
gender_y_valid_cat = gender_data_cleaned_valid_cat["label_3"].to_frame()


In [18]:
gender_x_test_cat = x_test_scaled

gender_initial = SVC(kernel="rbf", C=20, gamma="scale")
gender_initial.fit(gender_x_train_cat, gender_y_train_cat)

gender_initial_test_preds = gender_initial.predict(gender_x_test_cat)
pd.DataFrame(gender_initial_test_preds, columns=["label_3"]).to_csv(
    "gender_initial_test_preds.csv")


In [101]:
gender_initial = SVC()
gender_initial.fit(gender_x_train_cat, gender_y_train_cat)


In [19]:
gender_precision_recall_fscore_svc_initial, gender_cm_initial = validate_model(
    gender_initial, gender_x_valid_cat, gender_y_valid_cat
)

print("Precision, Recall and F1 Score:",
      gender_precision_recall_fscore_svc_initial)


Accuracy:  1.0
Confusion matrix: 
[[142   0]
 [  0 608]]
Precision, recall, f1-score: 
Precision, Recall and F1 Score: (1.0, 1.0, 1.0, None)


In [104]:
gender_n_pca, gender_pca_cat = find_min_pca_components(
    gender_x_train_cat,
    gender_x_valid_cat,
    gender_y_train_cat,
    gender_y_valid_cat,
    threshold=0.95,
)


Number of components:  308
Accuracy:  0.9973333333333333
Confusion matrix: 
[[141   1]
 [  1 607]]
Precision, recall, f1-score: 
Precision, Recall and F1 Score: (0.9973333333333333, 0.9973333333333333, 0.9973333333333333, None)
Accuracy:  0.9973333333333333
Number of components:  189
Accuracy:  0.9973333333333333
Confusion matrix: 
[[141   1]
 [  1 607]]
Precision, recall, f1-score: 
Precision, Recall and F1 Score: (0.9973333333333333, 0.9973333333333333, 0.9973333333333333, None)
Accuracy:  0.9973333333333333
Number of components:  128
Accuracy:  0.9973333333333333
Confusion matrix: 
[[140   2]
 [  0 608]]
Precision, recall, f1-score: 
Precision, Recall and F1 Score: (0.9973420765027322, 0.9973333333333333, 0.9973260664500576, None)
Accuracy:  0.9973333333333333
Number of components:  92
Accuracy:  0.9946666666666667
Confusion matrix: 
[[139   3]
 [  1 607]]
Precision, recall, f1-score: 
Precision, Recall and F1 Score: (0.9946607338017173, 0.9946666666666667, 0.9946521329001151, None)

In [105]:
gender_x_train_pca, gender_x_valid_pca, gender_n_components, gender_pca_cat = do_pca(
    gender_x_train_cat, gender_x_valid_cat, n_components=32
)


In [106]:
gender_final = SVC()
gender_final.fit(gender_x_train_pca, gender_y_train_cat)


In [107]:
gender_precision_recall_fscore_svc_final, gender_cm_final = validate_model(
    gender_final, gender_x_valid_pca, gender_y_valid_cat
)

print("Precision, Recall and F1 Score:",
      gender_precision_recall_fscore_svc_final)


Accuracy:  0.9933333333333333
Confusion matrix: 
[[137   5]
 [  0 608]]
Precision, recall, f1-score: 
Precision, Recall and F1 Score: (0.9933877107123437, 0.9933333333333333, 0.993287246190472, None)


In [None]:
# hyperparameter tuning for svc
gender_final_models = {
    "gender_SVC": SVC(),
}

gender_grid = {
    "C": [0.1, 1, 10],
    "gamma": [1, 0.1, 0.01],
    "kernel": ["rbf", "poly", "sigmoid"],
    "degree": [1, 2, 3, 4, 5],
}

gender_grid = {
    "C": [0.1, 1],
    "gamma": [1, 0.01],
    "kernel": ["rbf"],
    "degree": [1, 2],
}


gender_rs_model = tune_hyperparameters(
    gender_final_models,
    gender_grid,
    gender_x_train_pca,
    gender_x_valid_pca,
    gender_y_train_cat,
    gender_y_valid_cat,
)


In [None]:
# validate model
gender_precision_recall_fscore_svc_final, gender_cm_final = validate_model(
    gender_rs_model._best_estimator, gender_x_valid_pca, gender_y_valid_cat
)

print("Precision, Recall and F1 Score:",
      gender_precision_recall_fscore_svc_final)


In [None]:
id_data_test_cat = pd.concat([x_test_scaled, id_test], axis=1)

# Remove rows with null values
id_data_cleaned_test_cat = id_data_test_cat.dropna()

# Separate X and y again
id_x_test_cat = id_data_cleaned_test_cat.drop(columns=["label_1"])
id_y_test_cat = id_data_cleaned_test_cat["label_1"].to_frame()


# Label 4


In [20]:
# Handling NaN values in the age
# Combine X and y into a single DataFrame

accent_data_train_cat = pd.concat([x_train_scaled, accent_train], axis=1)
accent_data_valid_cat = pd.concat([x_valid_scaled, accent_valid], axis=1)

# Remove rows with null values
accent_data_cleaned_train_cat = accent_data_train_cat.dropna()
accent_data_cleaned_valid_cat = accent_data_valid_cat.dropna()

# Separate X and y again
accent_x_train_cat = accent_data_cleaned_train_cat.drop(columns=["label_4"])
accent_y_train_cat = accent_data_cleaned_train_cat["label_4"]
accent_x_valid_cat = accent_data_cleaned_valid_cat.drop(columns=["label_4"])
accent_y_valid_cat = accent_data_cleaned_valid_cat["label_4"].to_frame()


In [21]:
accent_x_test_cat = x_test_scaled

accent_initial = SVC(kernel="rbf", C=20, gamma="scale")
accent_initial.fit(accent_x_train_cat, accent_y_train_cat)

accent_initial_test_preds = accent_initial.predict(accent_x_test_cat)
pd.DataFrame(accent_initial_test_preds, columns=["label_4"]).to_csv(
    "accent_initial_test_preds.csv")


In [22]:
accent_precision_recall_fscore_svc_initial, accent_cm_initial = validate_model(
    accent_initial, accent_x_valid_cat, accent_y_valid_cat)

print("Precision, Recall and F1 Score:",
      accent_precision_recall_fscore_svc_initial)


Accuracy:  0.98
Confusion matrix: 
[[ 21   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0  11   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0  27   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   8   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0  13   0   2   0   0   0   0   0   0   0]
 [  0   0   0   0   0  10   1   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0 532   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   3  29   0   0   0   0   0   0]
 [  0   0   0   0   0   0   1   0  18   0   0   0   0   0]
 [  0   0   0   0   0   0   3   0   0  14   0   0   0   0]
 [  0   0   0   0   0   0   1   0   0   0   9   0   0   0]
 [  0   0   0   0   0   0   1   0   0   0   0  10   0   0]
 [  0   0   0   0   0   0   2   0   0   0   0   0  24   0]
 [  0   0   0   0   0   0   1   0   0   0   0   0   0   9]]
Precision, recall, f1-score: 
Precision, Recall and F1 Score: (0.9805484460694699, 0.98, 0.9795468516173848, None)
