In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from gensim.models import Word2Vec
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
import yaml



In [2]:

# Location of the YAML configuration: the CONFIG_PATH environment variable
# wins, otherwise fall back to the file one directory above the notebook.
config_path = os.environ.get("CONFIG_PATH", "../config.yaml")

# Parse the YAML document into `config`; the cells below index it by key.
with open(config_path, mode="r") as config_file:
    config = yaml.safe_load(config_file)

# Loader for the preprocessed dataset splits
def load_processed_data(split=None):
    """Load the preprocessed parquet splits described by the global ``config``.

    Parameters
    ----------
    split : str or iterable of str, optional
        Name(s) of the split(s) to load: "train", "dev" and/or "test".
        A single name may be passed as a plain string. ``None`` (the
        default) requests all three splits.

    Returns
    -------
    dict
        Maps each requested split name whose file exists to the DataFrame
        read from it. A split whose file is missing, or whose name is not
        one of the three known splits, is reported with a printed warning
        and left out of the result, so callers should check for the keys
        they need.
    """
    if split is None:
        split = ["train", "dev", "test"]
    elif isinstance(split, str):
        # A bare string would otherwise be iterated character by character,
        # yielding one bogus warning per letter and an empty result.
        split = [split]

    output_dir = config['paths']['output_dir']
    subsets = config["files"]["subsets"]
    paths = {
        name: os.path.join(output_dir, subsets[name]["parquet"])
        for name in ("train", "dev", "test")
    }

    split_dataframes = {}
    for split_type in split:
        split_dataset_path = paths.get(split_type)
        if split_dataset_path and os.path.exists(split_dataset_path):
            split_dataframes[split_type] = pd.read_parquet(split_dataset_path)
            print(f"df: {split_type.capitalize()} split loaded.")
        else:
            # Best-effort: warn and carry on rather than abort the whole load.
            print(f"Warning: {split_type} split file not found.")

    return split_dataframes


# Load every split that is available; the result maps split name -> DataFrame.
df_all = load_processed_data()

# Features come from the 'text' column and targets from the 'label' column
# of the train and test splits.
train_df, test_df = df_all['train'], df_all['test']
X_train, y_train = train_df['text'], train_df['label']
X_test, y_test = test_df['text'], test_df['label']

df: Train split loaded.
df: Dev split loaded.
df: Test split loaded.


### TF-IDF with Logistic Regression

In [3]:
# TF-IDF over unigrams and bigrams, capped at 5000 features with min_df=5.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_features=5000)

# The vocabulary is fitted on the training text only; the test text is
# projected onto that same vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Oversample with SMOTE; only the training matrix is resampled.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_tfidf, y_train)

# Fit Logistic Regression on the resampled training data.
logistic_model = LogisticRegression(class_weight='balanced', max_iter=500)
logistic_model.fit(X_resampled, y_resampled)

# Hard predictions and per-class probabilities for the test split.
y_pred_logistic = logistic_model.predict(X_test_tfidf)
y_pred_proba_logistic = logistic_model.predict_proba(X_test_tfidf)

# Per-example output table. The logit columns are written as NaN here.
df_output_logistic = pd.DataFrame(
    {'y_true': y_test, 'y_pred': y_pred_logistic}
).assign(
    prob_0=y_pred_proba_logistic[:, 0],  # probability of class 0
    prob_1=y_pred_proba_logistic[:, 1],  # probability of class 1
    logit_0=np.nan,
    logit_1=np.nan,
)

# Report the confusion matrix and per-class metrics on the test split.
print("Logistic Regression Results:")
print(confusion_matrix(y_test, y_pred_logistic))
print(classification_report(y_test, y_pred_logistic))

# Persist the per-example predictions (row index is not written).
df_output_logistic.to_csv('logistic_predictions.csv', index=False)



Logistic Regression Results:
[[2584  446]
 [ 330  640]]
              precision    recall  f1-score   support

           0       0.89      0.85      0.87      3030
           1       0.59      0.66      0.62       970

    accuracy                           0.81      4000
   macro avg       0.74      0.76      0.75      4000
weighted avg       0.81      0.81      0.81      4000



### TF-IDF with SVM

In [4]:
# Fit an SVC on the SMOTE-resampled TF-IDF features from the previous cell.
# probability=True is required for the predict_proba call below.
svm_model = SVC(probability=True, class_weight='balanced', random_state=42)
svm_model.fit(X_resampled, y_resampled)

# Hard predictions and per-class probabilities for the test split.
y_pred_svm = svm_model.predict(X_test_tfidf)
y_pred_proba_svm = svm_model.predict_proba(X_test_tfidf)

# Per-example output table. The logit columns are written as NaN here.
df_output_svm = pd.DataFrame(
    {'y_true': y_test, 'y_pred': y_pred_svm}
).assign(
    prob_0=y_pred_proba_svm[:, 0],  # probability of class 0
    prob_1=y_pred_proba_svm[:, 1],  # probability of class 1
    logit_0=np.nan,
    logit_1=np.nan,
)

# Report the confusion matrix and per-class metrics on the test split.
print("SVM Results:")
print(confusion_matrix(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

# Persist the per-example predictions (row index is not written).
df_output_svm.to_csv('svm_predictions.csv', index=False)

SVM Results:
[[2952   78]
 [ 623  347]]
              precision    recall  f1-score   support

           0       0.83      0.97      0.89      3030
           1       0.82      0.36      0.50       970

    accuracy                           0.82      4000
   macro avg       0.82      0.67      0.70      4000
weighted avg       0.82      0.82      0.80      4000



### Word2vec

In [5]:

# Whitespace-tokenise every document into a list of tokens.
X_processed_train = [document.split() for document in X_train]
X_processed_test = [document.split() for document in X_test]

# Fit 100-dimensional Word2Vec embeddings on the training tokens only.
word2vec_model = Word2Vec(
    sentences=X_processed_train,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Sentence embedding helper: mean of the word vectors of in-vocabulary tokens
def vectorize_sentences(sentences, model):
    """Embed each tokenised sentence as the mean of its word vectors.

    Parameters
    ----------
    sentences : iterable of list of str
        Tokenised sentences, one list of tokens per sentence.
    model : object
        Trained embedding model exposing ``wv`` (supporting
        ``word in model.wv`` and ``model.wv[word]``) and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        One row per sentence with ``model.vector_size`` columns. A sentence
        with no token in the vocabulary becomes an all-zero row. Empty input
        yields an array of shape ``(0, model.vector_size)``.
    """
    keyed_vectors = model.wv
    dim = model.vector_size

    vectors = []
    for sentence in sentences:
        # Tokens missing from the vocabulary are skipped.
        word_vectors = [keyed_vectors[word] for word in sentence if word in keyed_vectors]
        if word_vectors:
            vectors.append(np.mean(word_vectors, axis=0))
        else:
            # Nothing to average: fall back to a zero vector.
            vectors.append(np.zeros(dim))

    if not vectors:
        # np.array([]) would be 1-D with shape (0,); keep the result 2-D so
        # callers always receive (n_sentences, vector_size).
        return np.empty((0, dim))
    return np.array(vectors)

# Turn both tokenised splits into fixed-length sentence vectors.
X_train_vectors, X_test_vectors = (
    vectorize_sentences(tokenised, word2vec_model)
    for tokenised in (X_processed_train, X_processed_test)
)

# Oversample with SMOTE; only the training vectors are resampled.
smote = SMOTE(random_state=42)
X_resampled_w2v, y_resampled_w2v = smote.fit_resample(X_train_vectors, y_train)

# Fit Logistic Regression on the resampled Word2Vec features.
logistic_model_w2v = LogisticRegression(class_weight='balanced', max_iter=500)
logistic_model_w2v.fit(X_resampled_w2v, y_resampled_w2v)




#### Logistic Regression (W2V)

In [6]:
# Hard predictions and per-class probabilities from the Word2Vec-based
# Logistic Regression model on the test split.
y_pred_logistic_w2v = logistic_model_w2v.predict(X_test_vectors)
y_pred_proba_logistic_w2v = logistic_model_w2v.predict_proba(X_test_vectors)

# Per-example output table. The logit columns are written as NaN here.
df_output_logistic_w2v = pd.DataFrame(
    {'y_true': y_test, 'y_pred': y_pred_logistic_w2v}
).assign(
    prob_0=y_pred_proba_logistic_w2v[:, 0],  # probability of class 0
    prob_1=y_pred_proba_logistic_w2v[:, 1],  # probability of class 1
    logit_0=np.nan,
    logit_1=np.nan,
)

# Report the confusion matrix and per-class metrics on the test split.
print("Logistic Regression Results (Word2Vec):")
print(confusion_matrix(y_test, y_pred_logistic_w2v))
print(classification_report(y_test, y_pred_logistic_w2v))

# Persist the per-example predictions (row index is not written).
df_output_logistic_w2v.to_csv('logistic_w2v_predictions.csv', index=False)


Logistic Regression Results (Word2Vec):
[[1664 1366]
 [ 364  606]]
              precision    recall  f1-score   support

           0       0.82      0.55      0.66      3030
           1       0.31      0.62      0.41       970

    accuracy                           0.57      4000
   macro avg       0.56      0.59      0.53      4000
weighted avg       0.70      0.57      0.60      4000



#### SVM Prediction (W2V)

In [8]:
# Train the SVM on the SMOTE-resampled Word2Vec features.
# probability=True is what makes predict_proba available below.
svm_model_w2v = SVC(random_state=42, class_weight='balanced', probability=True)
svm_model_w2v.fit(X_resampled_w2v, y_resampled_w2v)

# Hard predictions for the test split.
y_pred_svm_w2v = svm_model_w2v.predict(X_test_vectors)

# NOTE: despite its name, this variable holds the signed decision_function
# scores (one value per test example), not probabilities. The name is kept
# unchanged in case other cells read it.
y_pred_proba_svm_w2v = svm_model_w2v.decision_function(X_test_vectors)

# Per-class probabilities, reported the same way as the TF-IDF SVM cell.
svm_w2v_class_proba = svm_model_w2v.predict_proba(X_test_vectors)

# Per-example output table.
# For a binary SVC a positive decision value favours the second class in
# classes_ (label 1 for 0/1 labels), so the raw score is the class-1 logit
# and its negation the class-0 logit.
df_output_svm_w2v = pd.DataFrame({
    'y_true': y_test,
    'y_pred': y_pred_svm_w2v,
    'prob_0': svm_w2v_class_proba[:, 0],  # Probability of class 0
    'prob_1': svm_w2v_class_proba[:, 1],  # Probability of class 1
    'logit_0': -y_pred_proba_svm_w2v,
    'logit_1': y_pred_proba_svm_w2v
})


# Evaluate the SVM model (Word2Vec)
print("SVM Results (Word2Vec):")
print(confusion_matrix(y_test, y_pred_svm_w2v))
print(classification_report(y_test, y_pred_svm_w2v))
# Save the output DataFrame to a CSV file
df_output_svm_w2v.to_csv('svm_w2v_predictions.csv', index=False)

SVM Results (Word2Vec):
[[1434 1596]
 [ 284  686]]
              precision    recall  f1-score   support

           0       0.83      0.47      0.60      3030
           1       0.30      0.71      0.42       970

    accuracy                           0.53      4000
   macro avg       0.57      0.59      0.51      4000
weighted avg       0.71      0.53      0.56      4000

