In [1]:
# Mount Google Drive at /content/drive; Config.BASE_DIR points to a folder under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [49]:
from pathlib import Path

import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
class Config:
    """Filesystem locations used by this notebook, all rooted at BASE_DIR."""

    # Root folder of this submission on the mounted Drive
    BASE_DIR = Path('/content/drive/MyDrive/College/machine_learning_2/submissions/1')

    # Sub-folders derived from BASE_DIR
    DATA_DIR = BASE_DIR.joinpath('datasets')
    MODEL_DIR = BASE_DIR.joinpath('models')
    RESULT_DIR = BASE_DIR.joinpath('results')

In [4]:
# Names of the feature representations to compare; each name is interpolated
# into the X_train_/X_valid_/X_test_ CSV file names read later in the notebook.
embedding_types = ['bow', 'tf_idf', 'word2vec', 'doc2vec', 'glove', 'fasttext']
# Folder containing the feature and label CSV files.
feat_embedding_path = Config.DATA_DIR / 'feature_embeddings'

In [35]:
# Load the train/valid/test feature matrix of every embedding and expose each
# one as a notebook-level variable named X_<split>_<embedding>.
for embedding in embedding_types:
    globals().update({
        f'X_{split}_{embedding}': pd.read_csv(
            feat_embedding_path / f'X_{split}_{embedding}.csv',
            index_col='Unnamed: 0',
        )
        for split in ('train', 'valid', 'test')
    })

# The labels are shared by all embeddings; flatten each single-column frame to a 1-D array.
y_train, y_valid, y_test = (
    pd.read_csv(feat_embedding_path / f'y_{split}.csv', index_col='Unnamed: 0').values.ravel()
    for split in ('train', 'valid', 'test')
)

In [36]:
# Preview the first rows of the fasttext validation features as a sanity check on the load.
X_valid_fasttext.head()

Unnamed: 0,ft_0,ft_1,ft_2,ft_3,ft_4,ft_5,ft_6,ft_7,ft_8,ft_9,...,ft_290,ft_291,ft_292,ft_293,ft_294,ft_295,ft_296,ft_297,ft_298,ft_299
0,0.0266,-0.058274,-0.03715,-0.008496,-0.043268,0.022806,-0.02719,-0.013126,0.015388,-0.029606,...,0.049044,0.008006,-0.138068,0.070202,0.013148,0.040256,0.012674,0.154632,-0.038404,-0.00635
1,0.003783,-0.050405,-0.007295,0.026546,-0.057042,-0.006896,-0.006758,-0.00709,0.001566,-0.014961,...,0.089443,0.014989,-0.120021,0.027724,-0.011859,0.012658,0.021848,0.152754,-0.042313,-0.014981
2,0.013212,-0.019749,0.004684,0.040349,-0.037132,-0.002997,-0.017274,-0.025137,0.036647,-0.011,...,0.049363,0.008604,-0.10798,0.021079,-0.003386,0.00337,0.004006,0.147433,-0.024507,0.009397
3,0.003111,-0.047756,0.011943,0.028673,-0.044645,0.000664,-0.029045,-0.004196,0.005563,-0.024541,...,0.047063,0.008573,-0.15596,0.018957,-0.010385,0.030238,0.028034,0.151245,-0.054295,-0.023097
4,-0.004656,-0.010687,0.007601,0.030204,-0.061191,-0.024184,-0.007868,-0.003525,0.000835,-0.008185,...,0.044986,-0.008437,-0.114051,0.006132,-0.006885,0.004187,0.0142,0.142946,-0.028651,0.002034


In [37]:
# Print the flattened training label array.
print(y_train)

[7 4 4 ... 3 1 8]


In [38]:
# Function to train and evaluate models
def train_and_evaluate(X_train, X_valid, y_train, y_valid, random_state=42):
    """Fit the baseline classifiers on the training split and score them on the validation split.

    Parameters
    ----------
    X_train, X_valid : array-like
        Feature matrices for the training and validation splits.
    y_train, y_valid : array-like
        Labels for the training and validation splits.
    random_state : int or None, default 42
        Seed passed to DecisionTreeClassifier so repeated runs produce the
        same tree. Pass None to restore unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns 'Model', 'Accuracy', 'Precision',
        'Recall' and 'F1 Score'. Precision, recall and F1 are weighted averages.
    """
    # MultinomialNB is not in this dict: the notebook trains it in separate
    # cells because it needs non-negative features.
    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'SVM': SVC(kernel='linear'),
        'Decision Tree': DecisionTreeClassifier(max_depth=10, random_state=random_state),
    }

    metric_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
    rows = []

    for model_name, model in models.items():
        print(f'started training for {model_name}')
        model.fit(X_train, y_train)

        # Predict on the validation set
        y_pred_valid = model.predict(X_valid)

        # zero_division=0 yields the same score as the default for classes that
        # are never predicted, but without the UndefinedMetricWarning.
        rows.append({
            'Model': model_name,
            'Accuracy': accuracy_score(y_valid, y_pred_valid),
            'Precision': precision_score(y_valid, y_pred_valid, average='weighted', zero_division=0),
            'Recall': recall_score(y_valid, y_pred_valid, average='weighted', zero_division=0),
            'F1 Score': f1_score(y_valid, y_pred_valid, average='weighted', zero_division=0),
        })

        print(f'Finished training for {model_name}')

    # Explicit columns keep the schema stable even if no rows were collected.
    return pd.DataFrame(rows, columns=metric_columns)

In [39]:
print("Starting training for all embedding types...\n")

# Collect one results frame per embedding and concatenate once at the end,
# instead of re-copying a growing DataFrame on every iteration.
per_embedding_results = []

# Iterate through each embedding type and train/evaluate models
for embedding in embedding_types:
    # Only the train and validation splits are fetched; the test split is not
    # used by this loop.
    X_train = globals()[f'X_train_{embedding}']
    X_valid = globals()[f'X_valid_{embedding}']

    print(f"Training models for {embedding} embedding...")

    # Train and evaluate models on this embedding type
    embedding_results = train_and_evaluate(X_train, X_valid, y_train, y_valid)

    print(f"Finished training for {embedding} embedding.\n")

    # Add the embedding type as a column in the results
    embedding_results['Embedding'] = embedding
    per_embedding_results.append(embedding_results)

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
all_results = (
    pd.concat(per_embedding_results, ignore_index=True)
    if per_embedding_results
    else pd.DataFrame()
)

print("Training complete for all embedding types!")

Starting training for all embedding types...

Training models for bow embedding...
started training for Logistic Regression
Finished training for Logistic Regression
started training for SVM
Finished training for SVM
started training for Decision Tree
Finished training for Decision Tree
Finished training for bow embedding.

Training models for tf_idf embedding...
started training for Logistic Regression
Finished training for Logistic Regression
started training for SVM
Finished training for SVM
started training for Decision Tree
Finished training for Decision Tree
Finished training for tf_idf embedding.

Training models for word2vec embedding...
started training for Logistic Regression
Finished training for Logistic Regression
started training for SVM
Finished training for SVM
started training for Decision Tree
Finished training for Decision Tree
Finished training for word2vec embedding.

Training models for doc2vec embedding...
started training for Logistic Regression
Finished trainin

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Finished training for Logistic Regression
started training for SVM
Finished training for SVM
started training for Decision Tree
Finished training for Decision Tree
Finished training for fasttext embedding.

Training complete for all embedding types!


In [40]:
# Preview the first rows of the combined results table.
all_results.head()

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Embedding
0,Logistic Regression,0.685608,0.691856,0.685608,0.686694,bow
1,SVM,0.46495,0.554272,0.46495,0.474234,bow
2,Decision Tree,0.512746,0.52105,0.512746,0.515106,bow
3,Logistic Regression,0.744822,0.74875,0.744822,0.74384,tf_idf
4,SVM,0.749071,0.760012,0.749071,0.750662,tf_idf


In [41]:
# (rows, columns) of the combined results table.
all_results.shape

(18, 6)

In [47]:
# Order the results by the 'F1 Score' column, highest first, and display them.
all_results = all_results.sort_values(by='F1 Score', ascending=False)
all_results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Embedding
4,SVM,0.749071,0.760012,0.749071,0.750662,tf_idf
3,Logistic Regression,0.744822,0.74875,0.744822,0.74384,tf_idf
7,SVM,0.69145,0.69395,0.69145,0.686917,word2vec
0,Logistic Regression,0.685608,0.691856,0.685608,0.686694,bow
6,Logistic Regression,0.668879,0.660275,0.668879,0.657294,word2vec
12,Logistic Regression,0.649495,0.64423,0.649495,0.643579,glove
9,Logistic Regression,0.642592,0.640437,0.642592,0.640184,doc2vec
10,SVM,0.642857,0.647161,0.642857,0.63722,doc2vec
16,SVM,0.632767,0.632747,0.632767,0.623904,fasttext
13,SVM,0.613914,0.607661,0.613914,0.605152,glove


## Training Multinomial Naive Bayes
Multinomial Naive Bayes only accepts non-negative feature values, but embeddings such as glove, word2vec, doc2vec and fasttext also produce negative values. I will therefore train this model directly on bow and tf_idf, which contain only non-negative values, and scale the other embeddings to a non-negative range before training the model on them.

In [44]:
# Embeddings whose features are already non-negative, so MultinomialNB can use them unscaled.
nb_embeddings = ['bow', 'tf_idf']
nb_results = []

print("\nStarting Multinomial Naive Bayes training on BoW and TF-IDF...\n")

for embedding in nb_embeddings:
    # Report a missing embedding instead of skipping it silently, so a gap in
    # the results table can be traced back to its cause.
    if f'X_train_{embedding}' not in globals():
        print(f"Skipping MultinomialNB on {embedding}: X_train_{embedding} is not loaded.\n")
        continue

    X_train = globals()[f'X_train_{embedding}']
    X_valid = globals()[f'X_valid_{embedding}']

    print(f"Training MultinomialNB on {embedding}...")

    # Initialize and train MultinomialNB
    nb_model = MultinomialNB()
    nb_model.fit(X_train, y_train)

    # Predictions
    y_pred_valid = nb_model.predict(X_valid)

    # Evaluate; zero_division=0 keeps the scores unchanged but avoids
    # UndefinedMetricWarning when a class is never predicted.
    nb_results.append({
        'Model': 'Multinomial Naive Bayes',
        'Embedding': embedding,
        'Accuracy': accuracy_score(y_valid, y_pred_valid),
        'Precision': precision_score(y_valid, y_pred_valid, average='weighted', zero_division=0),
        'Recall': recall_score(y_valid, y_pred_valid, average='weighted', zero_division=0),
        'F1 Score': f1_score(y_valid, y_pred_valid, average='weighted', zero_division=0)
    })

    print(f"Finished training MultinomialNB on {embedding}.\n")


Starting Multinomial Naive Bayes training on BoW and TF-IDF...

Training MultinomialNB on bow...
Finished training MultinomialNB on bow.

Training MultinomialNB on tf_idf...
Finished training MultinomialNB on tf_idf.



In [46]:
# Convert results to a dataframe
nb_results_df = pd.DataFrame(nb_results)
nb_results_df

Unnamed: 0,Model,Embedding,Accuracy,Precision,Recall,F1 Score
0,Multinomial Naive Bayes,bow,0.712161,0.7253,0.712161,0.71284
1,Multinomial Naive Bayes,tf_idf,0.740308,0.755364,0.740308,0.73448


In [48]:
# Append to all results
all_results = pd.concat([all_results, nb_results_df], ignore_index=True)
all_results = all_results.sort_values(by='F1 Score', ascending=False)
all_results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Embedding
0,SVM,0.749071,0.760012,0.749071,0.750662,tf_idf
1,Logistic Regression,0.744822,0.74875,0.744822,0.74384,tf_idf
19,Multinomial Naive Bayes,0.740308,0.755364,0.740308,0.73448,tf_idf
18,Multinomial Naive Bayes,0.712161,0.7253,0.712161,0.71284,bow
2,SVM,0.69145,0.69395,0.69145,0.686917,word2vec
3,Logistic Regression,0.685608,0.691856,0.685608,0.686694,bow
4,Logistic Regression,0.668879,0.660275,0.668879,0.657294,word2vec
5,Logistic Regression,0.649495,0.64423,0.649495,0.643579,glove
6,Logistic Regression,0.642592,0.640437,0.642592,0.640184,doc2vec
7,SVM,0.642857,0.647161,0.642857,0.63722,doc2vec


### Training Naive Bayes on the other embeddings after scaling them

In [50]:
# Every embedding except the two already trained on unscaled features.
other_embeddings = [embedding for embedding in embedding_types if embedding not in ['bow', 'tf_idf']]
nb_scaled_results = []

print("\nStarting Multinomial Naive Bayes training on scaled embeddings...\n")

for embedding in other_embeddings:
    X_train = globals()[f'X_train_{embedding}']
    X_valid = globals()[f'X_valid_{embedding}']

    print(f"Scaling and training MultinomialNB on {embedding}...")

    # Scale embeddings to 0-1. The scaler is fitted on the training split only
    # and then applied to the validation split.
    # NOTE(review): validation values outside the training min/max will map
    # outside [0, 1] (possibly below 0); consider MinMaxScaler(clip=True) if
    # strictly non-negative inputs are wanted at prediction time - confirm.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_valid_scaled = scaler.transform(X_valid)

    # Initialize and train MultinomialNB
    nb_model = MultinomialNB()
    nb_model.fit(X_train_scaled, y_train)

    # Predictions
    y_pred_valid = nb_model.predict(X_valid_scaled)

    # Evaluate; zero_division=0 keeps the scores unchanged but avoids the
    # UndefinedMetricWarning raised when a class is never predicted.
    nb_scaled_results.append({
        'Model': 'Multinomial Naive Bayes',
        'Embedding': f"{embedding}_scaled",  # Mark as scaled
        'Accuracy': accuracy_score(y_valid, y_pred_valid),
        'Precision': precision_score(y_valid, y_pred_valid, average='weighted', zero_division=0),
        'Recall': recall_score(y_valid, y_pred_valid, average='weighted', zero_division=0),
        'F1 Score': f1_score(y_valid, y_pred_valid, average='weighted', zero_division=0)
    })

    print(f"Finished training MultinomialNB on {embedding}_scaled.\n")


Starting Multinomial Naive Bayes training on scaled embeddings...

Scaling and training MultinomialNB on word2vec...
Finished training MultinomialNB on word2vec_scaled.

Scaling and training MultinomialNB on doc2vec...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Finished training MultinomialNB on doc2vec_scaled.

Scaling and training MultinomialNB on glove...
Finished training MultinomialNB on glove_scaled.

Scaling and training MultinomialNB on fasttext...
Finished training MultinomialNB on fasttext_scaled.



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [51]:
# Convert the list of metric dicts for the scaled embeddings into a DataFrame
# (one row per embedding) and display it.
nb_scaled_results_df = pd.DataFrame(nb_scaled_results)
nb_scaled_results_df

Unnamed: 0,Model,Embedding,Accuracy,Precision,Recall,F1 Score
0,Multinomial Naive Bayes,word2vec_scaled,0.542485,0.55063,0.542485,0.52147
1,Multinomial Naive Bayes,doc2vec_scaled,0.499469,0.560541,0.499469,0.469285
2,Multinomial Naive Bayes,glove_scaled,0.448221,0.488704,0.448221,0.415881
3,Multinomial Naive Bayes,fasttext_scaled,0.366171,0.400454,0.366171,0.342037


In [52]:
# Append the scaled Naive Bayes results to the combined table.
# Keep only the latest row per (Model, Embedding) so that re-running this cell
# does not add the same rows a second time; on a first run nothing is dropped.
all_results = (
    pd.concat([all_results, nb_scaled_results_df], ignore_index=True)
    .drop_duplicates(subset=['Model', 'Embedding'], keep='last')
    .sort_values(by='F1 Score', ascending=False)
)
all_results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Embedding
0,SVM,0.749071,0.760012,0.749071,0.750662,tf_idf
1,Logistic Regression,0.744822,0.74875,0.744822,0.74384,tf_idf
2,Multinomial Naive Bayes,0.740308,0.755364,0.740308,0.73448,tf_idf
3,Multinomial Naive Bayes,0.712161,0.7253,0.712161,0.71284,bow
4,SVM,0.69145,0.69395,0.69145,0.686917,word2vec
5,Logistic Regression,0.685608,0.691856,0.685608,0.686694,bow
6,Logistic Regression,0.668879,0.660275,0.668879,0.657294,word2vec
7,Logistic Regression,0.649495,0.64423,0.649495,0.643579,glove
8,Logistic Regression,0.642592,0.640437,0.642592,0.640184,doc2vec
9,SVM,0.642857,0.647161,0.642857,0.63722,doc2vec


In [53]:
# to_csv does not create missing parent folders, so make sure the results
# directory exists before writing.
Config.RESULT_DIR.mkdir(parents=True, exist_ok=True)
all_results.to_csv(Config.RESULT_DIR / 'results.csv')