## 1. Importing Packages

In [102]:
import re
import os
import random
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import mlflow
from datetime import datetime
import matplotlib.pyplot as plt
# from prettytable import PrettyTable
# from gensim.models import KeyedVectors
# from nltk.tokenize import word_tokenize
# from wordcloud import WordCloud, STOPWORDS
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

## 2. Utility Functions

In [103]:
def vectorizing_to_word2vec(X, word2vec_model):
    """Encode every text in ``X`` as the sum of its word2vec word vectors.

    Parameters:
    - X: iterable of strings; each one is tokenised with ``str.split()``.
    - word2vec_model: lookup supporting ``word in model`` and ``model[word]``
      (presumably returning 300-d vectors, since they are added to a
      length-300 accumulator).

    Returns:
    - numpy array with one summed vector per text. Words that are not in the
      model contribute a zero vector. The elapsed time is printed.
    """
    started_at = datetime.now()
    zero_vec = np.zeros(300)  # contribution of an out-of-vocabulary word
    doc_vectors = []
    for text in X:
        doc_vec = np.zeros(300)
        for token in text.split():
            doc_vec += word2vec_model[token] if token in word2vec_model else zero_vec
        doc_vectors.append(doc_vec)
    doc_vectors = np.array(doc_vectors)
    print("Time taken: ", datetime.now() - started_at)
    return doc_vectors

In [104]:
def vectorizing_to_glove(X, glove_model):
    """Encode every text in ``X`` as the sum of its GloVe word vectors.

    Parameters:
    - X: iterable of strings; each one is tokenised with ``str.split()``.
    - glove_model: mapping from word to vector that exposes ``keys()``.

    Returns:
    - numpy array with one summed vector per text (length-300 accumulator).
      Words missing from the mapping are skipped. The elapsed time is printed.
    """
    started_at = datetime.now()
    doc_vectors = []
    for text in X:
        doc_vec = np.zeros(300)
        known_vectors = (
            glove_model[token]
            for token in text.split()
            if token in glove_model.keys()
        )
        for word_vec in known_vectors:
            doc_vec += word_vec
        doc_vectors.append(doc_vec)
    doc_vectors = np.array(doc_vectors)
    print("Time taken: ", datetime.now() - started_at)
    return doc_vectors

In [105]:
def vectorizing_to_fasttext(X, fasttext_model):
    """Encode every text in ``X`` as the sum of its fastText word vectors.

    Parameters:
    - X: iterable of strings; each one is tokenised with ``str.split()``.
    - fasttext_model: object exposing ``get_word_vector(word)``; it is called
      for every token, with no vocabulary check.

    Returns:
    - numpy array with one summed vector per text (length-300 accumulator).
      The elapsed time is printed.
    """
    started_at = datetime.now()
    doc_vectors = []
    for text in X:
        doc_vec = np.zeros(300)
        token_vectors = (
            fasttext_model.get_word_vector(token) for token in text.split()
        )
        for word_vec in token_vectors:
            doc_vec += word_vec
        doc_vectors.append(doc_vec)
    doc_vectors = np.array(doc_vectors)
    print("Time taken: ", datetime.now() - started_at)
    return doc_vectors

In [106]:
def save_embeddings(save_folder, file_name, embeddings):
    """Pickle ``embeddings`` to ``<save_folder>/<file_name>``.

    Parameters:
    - save_folder: existing directory to write into.
    - file_name: name of the pickle file to create or overwrite.
    - embeddings: any picklable object.

    Raises:
    - OSError if the file cannot be opened for writing, or whatever
      ``pickle.dump`` raises for an unpicklable object. Errors propagate
      unchanged; the former ``except Exception as e: raise e`` was a no-op.
    """
    # Lower-case local so it is not confused with the notebook-level SAVE_PATH.
    target_path = os.path.join(save_folder, file_name)
    with open(target_path, 'wb') as f:
        pickle.dump(embeddings, f)
    print(f"Successfully saved at : {target_path}")

In [107]:
def load_embeddings(save_folder, file_name):
    """Load and return the object pickled at ``<save_folder>/<file_name>``.

    Parameters:
    - save_folder: directory containing the pickle file.
    - file_name: name of the pickle file.

    Returns:
    - The unpickled object (in this notebook, an embeddings array).

    Raises:
    - OSError if the file cannot be opened, or whatever ``pickle.load``
      raises for a corrupt file. Errors propagate unchanged; the former
      ``except Exception as e: raise e`` was a no-op.
    """
    source_path = os.path.join(save_folder, file_name)
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only load pickles that were produced by a trusted source.
    with open(source_path, 'rb') as f:
        embeddings = pickle.load(f)
    print(f"Loaded from : {source_path}")
    return embeddings

In [108]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix for ``word_index`` from a text vector file.

    Each non-blank line of the file is expected to be a word followed by its
    whitespace-separated vector components.

    Parameters:
    - filepath: path to the UTF-8 text file of pre-trained vectors.
    - word_index: mapping from word to its integer row index (1-based, since
      row 0 is reserved).
    - embedding_dim: number of leading vector components to keep per word.

    Returns:
    - numpy array of shape ``(len(word_index) + 1, embedding_dim)``; rows for
      words absent from the file stay all-zero.
    """
    vocab_size = len(word_index) + 1  # +1 because index 0 is reserved
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Explicit encoding: the platform default may not be UTF-8, and the
    # sibling loader loadGloveModel already reads these files as utf8.
    with open(filepath, encoding="utf8") as f:
        for line in f:
            parts = line.split()
            if not parts:
                # A blank line would otherwise break the unpacking below.
                continue
            word, *vector = parts
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

## 3. Load the Dataset

In [109]:
# Input CSV folder (DATA_PATH) and pickled-embeddings folder (SAVE_PATH);
# both currently point at the same relative directory.
DATA_PATH = SAVE_PATH = "data"

In [110]:
# Read the cleaned training articles and preview the first rows.
train_csv_path = os.path.join(DATA_PATH, "train_clean_data.csv")
train_df = pd.read_csv(train_csv_path)
train_df.head()

Unnamed: 0,ArticleId,Text,Category,clean_text,Label
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom ex boss launches defence lawyers defe...,0
1,154,german business confidence slides german busin...,business,german business confidence slides german busin...,0
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizens maj...,0
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster better ...,1
4,917,enron bosses in $168m payout eighteen former e...,business,enron bosses payout eighteen former enron dire...,0


In [111]:
# Features: everything except identifiers, raw text and the label columns.
X = train_df.drop(columns=['ArticleId', 'Label', 'Text', 'Category'])
# The CSV was written with its index, which read_csv surfaces as 'Unnamed: 0';
# drop it too so X holds only the text feature. errors='ignore' keeps this
# working if the CSV is regenerated without that column.
X = X.drop(columns=['Unnamed: 0'], errors='ignore')
# Target: integer class label as a numpy array.
y = train_df['Label'].values

## 4. Cross Validation

In [112]:
def stratified_k_fold_cv_f1(model, X, y, n_splits=5, shuffle=True, random_state=98):
    """
    Perform Stratified K-Fold Cross Validation and return the mean and
    standard deviation of the weighted F1 score.

    Parameters:
    - model: The machine learning model to evaluate. It is re-fit in place on
      every fold, so after the call it holds the fit from the last fold.
    - X: The feature matrix; must support indexing with an integer index array
      (e.g. a numpy array).
    - y: The target vector (labels); same indexing requirement as X.
    - n_splits: Number of splits/folds for cross-validation (default is 5).
    - shuffle: Whether to shuffle the data before splitting (default is True).
    - random_state: Seed used for shuffling; ignored when shuffle is False.

    Returns:
    - avg_f1: The average weighted F1 score across all folds.
    - std_f1: The standard deviation of the per-fold weighted F1 scores.
    """

    # StratifiedKFold raises ValueError if random_state is set while
    # shuffle=False, so only forward the seed when shuffling is requested.
    skf = StratifiedKFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )

    f1_scores = []  # List to store F1 score for each fold

    # Split the data into train and test sets for each fold
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Train the model on the training data
        model.fit(X_train, y_train)

        # Make predictions on the test data
        y_pred = model.predict(X_test)

        # Calculate and store the weighted F1 score for this fold
        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

    # Aggregate the per-fold scores
    avg_f1 = sum(f1_scores) / len(f1_scores)
    std_f1 = np.std(f1_scores)
    return avg_f1, std_f1

In [113]:
def train_performance(model, X, y, n_splits=5, shuffle=True, random_state=98):
    """Cross-validate ``model`` with stratified k-fold and print the F1 mean and std.

    All arguments are forwarded unchanged to ``stratified_k_fold_cv_f1``.
    Nothing is returned.
    """
    cv_mean, cv_std = stratified_k_fold_cv_f1(
        model,
        X,
        y,
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state,
    )
    print(f"Train Avg F1 Score: {cv_mean} Std: {cv_std}")

## 5. Word2Vec Encoding

In [114]:
# pretrained word2vec embeddings
# WORK2VEC_EMBEDDING_PATH = "/kaggle/input/google-word2vec/GoogleNews-vectors-negative300.bin"
# word2vec_model = KeyedVectors.load_word2vec_format(WORK2VEC_EMBEDDING_PATH, binary=True)

### 5.1. Vectorizing text using Word2Vec embeddings

In [115]:
# word2vec_data = vectorizing_to_word2vec(X['clean_text'], word2vec_model)

In [116]:
# save_embeddings(SAVE_PATH, 'word2vec_embeddings.pkl', word2vec_data)

In [117]:
# word2vec_data.shape

### 5.2. Loading Saved word2vec embeddings

In [118]:
# Reload the cached word2vec document vectors and display their shape.
word2vec_data = load_embeddings(
    save_folder=SAVE_PATH,
    file_name='word2vec_embeddings.pkl',
)
word2vec_data.shape

Loaded from : data/word2vec_embeddings.pkl


(1490, 300)

### 5.3 Scaling Word2Vec Embeddings

In [119]:
# NOTE: both scalers are fit on the full dataset, and the cross-validation
# cells later split these already-scaled arrays, so held-out folds influence
# the scaling statistics. Fit the scaler inside each fold (e.g. via a
# Pipeline) if leak-free scores are needed.

# standardize each embedding dimension to zero mean and unit variance
scaler = StandardScaler()
word2vec_std_scaled = scaler.fit_transform(word2vec_data)
print('Data Shape:', word2vec_std_scaled.shape)

# rescale each embedding dimension to the range 0-1
scaler = MinMaxScaler()
word2vec_minmax_scaled = scaler.fit_transform(word2vec_data)
print('Data Shape:', word2vec_minmax_scaled.shape)

Data Shpe: (1490, 300)
Data Shpe: (1490, 300)


In [121]:
# Keep MLflow runs in ./mlruns relative to the kernel's working directory
# rather than a machine-specific absolute path. Set MLFLOW_TRACKING_URI to
# point at a different store.
mlflow.set_tracking_uri(
    os.environ.get("MLFLOW_TRACKING_URI", os.path.abspath("mlruns"))
)

### 5.4 Logistic Regression on Word2Vec Embeddings

In [125]:
%time
# Start an experiment (if not already created)
experiment_name = "Word2Vec_Embeddings"
mlflow.set_experiment(experiment_name)

model_name = "Logistic Regression"
model_save_name = "logistic_regression_model"

# Start MLflow run
with mlflow.start_run(run_name=f"{experiment_name} + {model_name}"):

    # initializing model
    classifier = LogisticRegression(max_iter=2000)

    # cross validation
    avg_f1_score, avg_f1_std = stratified_k_fold_cv_f1(classifier, word2vec_std_scaled, y)

    # Log metrics (e.g., f1 score, precision, recall, F1-score)
    mlflow.log_metric("avg f1 score", avg_f1_score)
    mlflow.log_metric("avg f1 std", avg_f1_std)
    mlflow.log_param("models", model_name)

    print(f"Logged {model_name} model with avg f1 score: {avg_f1_score}")
    print(f"Logged {model_name} model with avg f1 std: {avg_f1_std}")

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.77 µs
Logged Logistic Regression model with avg f1 score: 0.9684976075457431
Logged Logistic Regression model with avg f1 std: 0.006185096595692143


### 5.5 Naive Bayes on word2vec embeddings

In [126]:
%time
model_name = "Naive Bayes"
model_save_name = "naive_bayes_model"

# Start MLflow run
with mlflow.start_run(run_name=f"{experiment_name} + {model_name}"):

    # initializing model
    classifier = MultinomialNB()

    # cross validation
    avg_f1_score, avg_f1_std = stratified_k_fold_cv_f1(classifier, word2vec_minmax_scaled, y)

    # Log metrics (e.g., f1 score, precision, recall, F1-score)
    mlflow.log_metric("avg f1 score", avg_f1_score)
    mlflow.log_metric("avg f1 std", avg_f1_std)
    mlflow.log_param("models", model_name)

    print(f"Logged {model_name} model with avg f1 score: {avg_f1_score}")
    print(f"Logged {model_name} model with avg f1 std: {avg_f1_std}")

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 2.15 µs
Logged Naive Bayes model with avg f1 score: 0.8390521476927397
Logged Naive Bayes model with avg f1 std: 0.027083752273604995


### 5.6 SVC on word2vec embeddings

In [127]:
%time
model_name = "SVC"
model_save_name = "svc_model"

# Start MLflow run
with mlflow.start_run(run_name=f"{experiment_name} + {model_name}"):

    # initializing model
    classifier = SVC()

    # cross validation
    avg_f1_score, avg_f1_std = stratified_k_fold_cv_f1(classifier, word2vec_minmax_scaled, y)

    # Log metrics (e.g., f1 score, precision, recall, F1-score)
    mlflow.log_metric("avg f1 score", avg_f1_score)
    mlflow.log_metric("avg f1 std", avg_f1_std)
    mlflow.log_param("models", model_name)

    print(f"Logged {model_name} model with avg f1 score: {avg_f1_score}")
    print(f"Logged {model_name} model with avg f1 std: {avg_f1_std}")

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 2.86 µs
Logged SVC model with avg f1 score: 0.9495994624826165
Logged SVC model with avg f1 std: 0.012994664045244868


### 5.7 Random Forest on word2vec embeddings

In [128]:
%time
model_name = "Random Forest"
model_save_name = "random_forest_model"

# Start MLflow run
with mlflow.start_run(run_name=f"{experiment_name} + {model_name}"):

    # initializing model
    classifier = RandomForestClassifier()

    # cross validation
    avg_f1_score, avg_f1_std = stratified_k_fold_cv_f1(classifier, word2vec_minmax_scaled, y)

    # Log metrics (e.g., f1 score, precision, recall, F1-score)
    mlflow.log_metric("avg f1 score", avg_f1_score)
    mlflow.log_metric("avg f1 std", avg_f1_std)
    mlflow.log_param("models", model_name)

    print(f"Logged {model_name} model with avg f1 score: {avg_f1_score}")
    print(f"Logged {model_name} model with avg f1 std: {avg_f1_std}")

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs
Logged Random Forest model with avg f1 score: 0.9516302673406869
Logged Random Forest model with avg f1 std: 0.015207650281093953


### 5.8 Gradient Boost on word2vec embeddings

In [129]:
%time
model_name = "Gradient Boost"
model_save_name = "gradient_boosting_model"

# Start MLflow run
with mlflow.start_run(run_name=f"{experiment_name} + {model_name}"):

    # initializing model
    classifier = GradientBoostingClassifier()

    # cross validation
    avg_f1_score, avg_f1_std = stratified_k_fold_cv_f1(classifier, word2vec_minmax_scaled, y)

    # Log metrics (e.g., f1 score, precision, recall, F1-score)
    mlflow.log_metric("avg f1 score", avg_f1_score)
    mlflow.log_metric("avg f1 std", avg_f1_std)
    mlflow.log_param("models", model_name)

    print(f"Logged {model_name} model with avg f1 score: {avg_f1_score}")
    print(f"Logged {model_name} model with avg f1 std: {avg_f1_std}")

CPU times: user 1e+03 ns, sys: 1e+03 ns, total: 2 µs
Wall time: 1.67 µs
Logged Gradient Boost model with avg f1 score: 0.9543309475920368
Logged Gradient Boost model with avg f1 std: 0.01457738321229346


## 6. Glove Encoding

In [130]:
# GLOVE_EMBEDDING_PATH = "/kaggle/input/glove6b300dtxt/glove.6B.300d.txt"

### 6.1. Vectorizing text using glove embeddings

In [131]:
# loading pre-trained glove embeddings
def loadGloveModel(gloveFile):
    """Load GloVe vectors from a UTF-8 text file into a dict.

    Each non-blank line is expected to be a word followed by its
    whitespace-separated float components.

    Parameters:
    - gloveFile: path to the GloVe text file.

    Returns:
    - dict mapping each word to a numpy float array of its components.
      If a word appears more than once, the last occurrence wins.
    """
    print ("Loading Glove Model")
    model = {}
    # Context manager so the file handle is closed even if parsing fails.
    with open(gloveFile, 'r', encoding="utf8") as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # A blank line has no word token; skip it instead of raising
                # IndexError.
                continue
            word = splitLine[0]
            embedding = np.array([float(val) for val in splitLine[1:]])
            model[word] = embedding
    print ("Done.",len(model)," words loaded!")
    return model

In [132]:
# glove_model = loadGloveModel(GLOVE_EMBEDDING_PATH)

In [133]:
# glove_data = vectorizing_to_glove(X['clean_text'], glove_model)

In [134]:
# shape of train data embeddings
# glove_data.shape

In [135]:
# saving embeddings for data column comment
# save_embeddings(SAVE_PATH, "glove_embeddings.pkl", glove_data)

### 6.2 Loading saved glove embeddings

In [136]:
# Reload the cached GloVe document vectors from disk.
glove_data = load_embeddings(
    save_folder=SAVE_PATH,
    file_name="glove_embeddings.pkl",
)

Loaded from : data/glove_embeddings.pkl


### 6.3 Scaling on Glove Embeddings

In [137]:
# Two scaled views of the GloVe vectors: standardized (zero mean, unit
# variance per dimension) and min-max scaled (range 0-1 per dimension).
std_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()

glove_std_scaled = std_scaler.fit_transform(glove_data)
glove_minmax_scaled = minmax_scaler.fit_transform(glove_data)

### 6.4 Logistic Regression on Glove Embeddings

In [138]:
%time
# Start an experiment (if not already created)
experiment_name = "GloVe_Embeddings"
mlflow.set_experiment(experiment_name)

model_name = "Logistic Regression"
model_save_name = "logistic_regression_model"

# Start MLflow run
with mlflow.start_run(run_name=f"{experiment_name} + {model_name}"):

    # initializing model
    classifier = LogisticRegression(max_iter=2000)

    # cross validation
    avg_f1_score, avg_f1_std = stratified_k_fold_cv_f1(classifier, glove_std_scaled, y)

    # Log metrics (e.g., f1 score, precision, recall, F1-score)
    mlflow.log_metric("avg f1 score", avg_f1_score)
    mlflow.log_metric("avg f1 std", avg_f1_std)
    mlflow.log_param("models", model_name)

    print(f"Logged {model_name} model with avg f1 score: {avg_f1_score}")
    print(f"Logged {model_name} model with avg f1 std: {avg_f1_std}")

2024/12/27 01:20:49 INFO mlflow.tracking.fluent: Experiment with name 'GloVe_Embeddings' does not exist. Creating a new experiment.


CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 8.11 µs
Logged Logistic Regression model with avg f1 score: 0.9718482530789798
Logged Logistic Regression model with avg f1 std: 0.006860856964094403


### 6.5 Naive Bayes on glove embeddings

In [139]:
%time
model_name = "Naive Bayes"
model_save_name = "naive_bayes_model"

# Start MLflow run
with mlflow.start_run(run_name=f"{experiment_name} + {model_name}"):

    # initializing model
    classifier = MultinomialNB()

    # cross validation
    avg_f1_score, avg_f1_std = stratified_k_fold_cv_f1(classifier, glove_minmax_scaled, y)

    # Log metrics (e.g., f1 score, precision, recall, F1-score)
    mlflow.log_metric("avg f1 score", avg_f1_score)
    mlflow.log_metric("avg f1 std", avg_f1_std)
    mlflow.log_param("models", model_name)

    print(f"Logged {model_name} model with avg f1 score: {avg_f1_score}")
    print(f"Logged {model_name} model with avg f1 std: {avg_f1_std}")

CPU times: user 4 µs, sys: 3 µs, total: 7 µs
Wall time: 10 µs
Logged Naive Bayes model with avg f1 score: 0.9155249265919974
Logged Naive Bayes model with avg f1 std: 0.014480953723875412


### 6.6 SVC on glove embeddings

In [140]:
%time
model_name = "SVC"
model_save_name = "svc_model"

# Start MLflow run
with mlflow.start_run(run_name=f"{experiment_name} + {model_name}"):

    # initializing model
    classifier = SVC()

    # cross validation
    avg_f1_score, avg_f1_std = stratified_k_fold_cv_f1(classifier, glove_std_scaled, y)

    # Log metrics (e.g., f1 score, precision, recall, F1-score)
    mlflow.log_metric("avg f1 score", avg_f1_score)
    mlflow.log_metric("avg f1 std", avg_f1_std)
    mlflow.log_param("models", model_name)

    print(f"Logged {model_name} model with avg f1 score: {avg_f1_score}")
    print(f"Logged {model_name} model with avg f1 std: {avg_f1_std}")

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 2.62 µs
Logged SVC model with avg f1 score: 0.9678993850593406
Logged SVC model with avg f1 std: 0.006180751324692117


### 6.7 Random Forest on glove embeddings

In [141]:
%time
model_name = "Random Forest"
model_save_name = "random_forest_model"

# Start MLflow run
with mlflow.start_run(run_name=f"{experiment_name} + {model_name}"):

    # initializing model
    classifier = RandomForestClassifier()

    # cross validation
    avg_f1_score, avg_f1_std = stratified_k_fold_cv_f1(classifier, glove_std_scaled, y)

    # Log metrics (e.g., f1 score, precision, recall, F1-score)
    mlflow.log_metric("avg f1 score", avg_f1_score)
    mlflow.log_metric("avg f1 std", avg_f1_std)
    mlflow.log_param("models", model_name)

    print(f"Logged {model_name} model with avg f1 score: {avg_f1_score}")
    print(f"Logged {model_name} model with avg f1 std: {avg_f1_std}")

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.1 µs
Logged Random Forest model with avg f1 score: 0.9576199020561529
Logged Random Forest model with avg f1 std: 0.008164490390919038


### 6.8 Gradient Boost on glove embeddings

In [142]:
%time
model_name = "Gradient Boost"
model_save_name = "gradient_boosting_model"

# Start MLflow run
with mlflow.start_run(run_name=f"{experiment_name} + {model_name}"):

    # initializing model
    classifier = GradientBoostingClassifier()

    # cross validation
    avg_f1_score, avg_f1_std = stratified_k_fold_cv_f1(classifier, glove_std_scaled, y)

    # Log metrics (e.g., f1 score, precision, recall, F1-score)
    mlflow.log_metric("avg f1 score", avg_f1_score)
    mlflow.log_metric("avg f1 std", avg_f1_std)
    mlflow.log_param("models", model_name)

    print(f"Logged {model_name} model with avg f1 score: {avg_f1_score}")
    print(f"Logged {model_name} model with avg f1 std: {avg_f1_std}")

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs
Logged Gradient Boost model with avg f1 score: 0.955777765182425
Logged Gradient Boost model with avg f1 std: 0.006830385211894745


## 7. FastText Encoding

In [143]:
# FASTTEXT_EMBEDDING_PATH = "/kaggle/input/fasttext-english-300/cc.en.300.bin"

In [144]:
# import fasttext.util
# fasttext.util.download_model('en', if_exists='ignore')  # English
# ft = fasttext.load_model(FASTTEXT_EMBEDDING_PATH)

In [145]:
# ft.get_dimension()

### 7.1. Vectorizing text using fasttext embeddings

In [146]:
# fasttext_data = vectorizing_to_fasttext(X['clean_text'], ft)

In [147]:
# save_embeddings(SAVE_PATH, 'fasttext_data.pkl', fasttext_data)

In [148]:
# fasttext_data.shape

### 7.2. Loading Saved fasttext embeddings

In [149]:
# Reload the cached fastText document vectors and display their shape.
fasttext_data = load_embeddings(
    save_folder=SAVE_PATH,
    file_name='fasttext_data.pkl',
)
fasttext_data.shape

Loaded from : data/fasttext_data.pkl


(1490, 300)

### 7.3 Scaling fasttext embeddings

In [150]:
# standardize each embedding dimension to zero mean and unit variance
fasttext_std_scaled = StandardScaler().fit_transform(fasttext_data)

# rescale each embedding dimension to the range 0-1
scaler = MinMaxScaler()
fasttext_minmax_scaled = scaler.fit_transform(fasttext_data)

### 7.4 Logistic Regression on fasttext embeddings

In [151]:
%time
# Start an experiment (if not already created)
experiment_name = "FastText_Embeddings"
mlflow.set_experiment(experiment_name)

model_name = "Logistic Regression"
model_save_name = "logistic_regression_model"

# Start MLflow run
with mlflow.start_run(run_name=f"{experiment_name} + {model_name}"):

    # initializing model
    classifier = LogisticRegression(max_iter=2000)

    # cross validation
    avg_f1_score, avg_f1_std = stratified_k_fold_cv_f1(classifier, fasttext_std_scaled, y)

    # Log metrics (e.g., f1 score, precision, recall, F1-score)
    mlflow.log_metric("avg f1 score", avg_f1_score)
    mlflow.log_metric("avg f1 std", avg_f1_std)
    mlflow.log_param("models", model_name)

    print(f"Logged {model_name} model with avg f1 score: {avg_f1_score}")
    print(f"Logged {model_name} model with avg f1 std: {avg_f1_std}")

2024/12/27 01:24:46 INFO mlflow.tracking.fluent: Experiment with name 'FastText_Embeddings' does not exist. Creating a new experiment.


CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 12.9 µs
Logged Logistic Regression model with avg f1 score: 0.9624145089184226
Logged Logistic Regression model with avg f1 std: 0.011343718947799557


### 7.5 Naive Bayes on fasttext embeddings

In [152]:
%time
model_name = "Naive Bayes"
model_save_name = "naive_bayes_model"

# Start MLflow run
with mlflow.start_run(run_name=f"{experiment_name} + {model_name}"):

    # initializing model
    classifier = MultinomialNB()

    # cross validation
    avg_f1_score, avg_f1_std = stratified_k_fold_cv_f1(classifier, fasttext_minmax_scaled, y)

    # Log metrics (e.g., f1 score, precision, recall, F1-score)
    mlflow.log_metric("avg f1 score", avg_f1_score)
    mlflow.log_metric("avg f1 std", avg_f1_std)
    mlflow.log_param("models", model_name)

    print(f"Logged {model_name} model with avg f1 score: {avg_f1_score}")
    print(f"Logged {model_name} model with avg f1 std: {avg_f1_std}")

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.77 µs
Logged Naive Bayes model with avg f1 score: 0.7509317153832529
Logged Naive Bayes model with avg f1 std: 0.023732069110960975


### 7.6 SVC on fasttext embeddings

In [153]:
%time
model_name = "SVC"
model_save_name = "svc_model"

# Start MLflow run
with mlflow.start_run(run_name=f"{experiment_name} + {model_name}"):

    # initializing model
    classifier = SVC()

    # cross validation
    avg_f1_score, avg_f1_std = stratified_k_fold_cv_f1(classifier, fasttext_std_scaled, y)

    # Log metrics (e.g., f1 score, precision, recall, F1-score)
    mlflow.log_metric("avg f1 score", avg_f1_score)
    mlflow.log_metric("avg f1 std", avg_f1_std)
    mlflow.log_param("models", model_name)

    print(f"Logged {model_name} model with avg f1 score: {avg_f1_score}")
    print(f"Logged {model_name} model with avg f1 std: {avg_f1_std}")

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 4.05 µs
Logged SVC model with avg f1 score: 0.9599365089808403
Logged SVC model with avg f1 std: 0.009565816728661237


### 7.7 Random Forest on fasttext embeddings

In [154]:
%time
model_name = "Random Forest"
model_save_name = "random_forest_model"

# Start MLflow run
with mlflow.start_run(run_name=f"{experiment_name} + {model_name}"):

    # initializing model
    classifier = RandomForestClassifier()

    # cross validation
    avg_f1_score, avg_f1_std = stratified_k_fold_cv_f1(classifier, fasttext_std_scaled, y)

    # Log metrics (e.g., f1 score, precision, recall, F1-score)
    mlflow.log_metric("avg f1 score", avg_f1_score)
    mlflow.log_metric("avg f1 std", avg_f1_std)
    mlflow.log_param("models", model_name)

    print(f"Logged {model_name} model with avg f1 score: {avg_f1_score}")
    print(f"Logged {model_name} model with avg f1 std: {avg_f1_std}")

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 2.15 µs
Logged Random Forest model with avg f1 score: 0.9510224096687907
Logged Random Forest model with avg f1 std: 0.009197314985749239


### 7.8 Gradient Boost on fasttext embeddings

In [155]:
%time
model_name = "Gradient Boost"
model_save_name = "gradient_boosting_model"

# Start MLflow run
with mlflow.start_run(run_name=f"{experiment_name} + {model_name}"):

    # initializing model
    classifier = GradientBoostingClassifier()

    # cross validation
    avg_f1_score, avg_f1_std = stratified_k_fold_cv_f1(classifier, fasttext_std_scaled, y)

    # Log metrics (e.g., f1 score, precision, recall, F1-score)
    mlflow.log_metric("avg f1 score", avg_f1_score)
    mlflow.log_metric("avg f1 std", avg_f1_std)
    mlflow.log_param("models", model_name)

    print(f"Logged {model_name} model with avg f1 score: {avg_f1_score}")
    print(f"Logged {model_name} model with avg f1 std: {avg_f1_std}")

CPU times: user 1 µs, sys: 1 µs, total: 2 µs
Wall time: 1.91 µs
Logged Gradient Boost model with avg f1 score: 0.9463989861852939
Logged Gradient Boost model with avg f1 std: 0.00631698464348885


## 8. Final Inference

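- GloVe embeddings with Logistic Regression gave the best cross-validated result: a weighted F1 score of about **97%** (0.972 ± 0.007 across 5 stratified folds).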