# SVM

In [15]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import yaml
import wandb

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score, balanced_accuracy_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from gensim.models import Word2Vec

AttributeError: module 'numpy' has no attribute 'round'

import sys
# Import functions from preprocessing module
sys.path.append('..')
from utils.load_data import load_processed_data

In [None]:
import sys
import os

# Root of the local project checkout. Every path below is derived from this
# single value so the cell can be pointed at another checkout in one place.
PROJECT_ROOT = '/Users/mac/Downloads/online_sexism_detection'
UTILS_DIR = f'{PROJECT_ROOT}/utils'

# Make the utils directory importable. The membership check keeps this cell
# idempotent: re-running it no longer appends duplicate entries to sys.path.
if UTILS_DIR not in sys.path:
    sys.path.append(UTILS_DIR)

# Now try importing the function
try:
    from load_data import load_processed_data
    print("Import successful!")
except ModuleNotFoundError as e:
    # Best-effort: report the failure and carry on. The call at the bottom of
    # this cell still requires `load_processed_data` to be bound by some import.
    print("Error importing module:", e)

# Define the paths to your parquet files
train_parquet = f'{PROJECT_ROOT}/data/train.parquet'
dev_parquet = f'{PROJECT_ROOT}/data/dev.parquet'
test_parquet = f'{PROJECT_ROOT}/data/test.parquet'

# NOTE(review): the three names above live in the notebook namespace only.
# The former `globals()[...] = ...` lines re-assigned these same names in that
# same namespace, so they had no additional effect and were dropped.
# load_processed_data() is called without arguments; whether it consumes these
# paths cannot be determined from this notebook -- confirm against load_data.py.

# Now call the function
data = load_processed_data()

Import successful!
Data loaded


## Load data

In [None]:
# Load the processed dataset and pull out the three partitions it is indexed by.
df = load_processed_data()
train_data, test_data, dev_data = (df[split] for split in ("train", "test", "dev"))

# For every partition the "lemma" column is the model input and "label" the target.
X_train, X_test, X_dev = (part["lemma"] for part in (train_data, test_data, dev_data))
y_train, y_test, y_dev = (part["label"] for part in (train_data, test_data, dev_data))

Data loaded


## TF-IDF

In [None]:
# TF-IDF search grid: each vocabulary size is tried with unigrams only and with
# unigrams + bigrams; min_df is held at 5 for every configuration.
tfidf_params_list = [
    {'max_features': vocab_size, 'ngram_range': ngrams, 'min_df': 5}
    for vocab_size in (3000, 5000, 10000)
    for ngrams in ((1, 1), (1, 2))
]

In [None]:
# Sweep over the TF-IDF configurations: for each one, vectorise the text,
# oversample the training set with SMOTE, fit an SVM and log its metrics on
# the test split to a single Weights & Biases run.
#
# NOTE(review): configurations are compared on the test split; X_dev / y_dev
# are loaded earlier in the notebook but not used here. Consider selecting the
# configuration on the dev split and reporting the test score once.
wandb.init(project="online_sexism_detection", name="tfidf_svm")
try:
    for tfidf_params in tfidf_params_list:
        # The vocabulary and IDF weights are fitted on the training split only;
        # the test split is transformed with the already-fitted vectorizer.
        vectorizer = TfidfVectorizer(**tfidf_params)
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)

        # Apply SMOTE to handle class imbalance (training data only)
        smote = SMOTE(random_state=42)
        X_resampled, y_resampled = smote.fit_resample(X_train_tfidf, y_train)

        # Train the SVM model
        # NOTE(review): probability=True is kept so `svm_model` stays usable
        # for probability estimates elsewhere, but only predict() is called in
        # this cell; scikit-learn documents that the option slows down fit.
        svm_model = SVC(random_state=42, class_weight='balanced', probability=True)
        svm_model.fit(X_resampled, y_resampled)

        # Make predictions with SVM
        y_pred_svm = svm_model.predict(X_test_tfidf)

        # Evaluate the SVM model. The parameter line identifies which
        # configuration each report belongs to (same as the Word2Vec cell).
        print(f"TF-IDF Params: {tfidf_params}")
        print("SVM Results:")
        print(confusion_matrix(y_test, y_pred_svm))
        print(classification_report(y_test, y_pred_svm))

        # Log results
        wandb.log({
            'tfidf_params': tfidf_params,
            "f1": f1_score(y_test, y_pred_svm),
            "balanced_accuracy": balanced_accuracy_score(y_test, y_pred_svm),
            "accuracy": accuracy_score(y_test, y_pred_svm),
        })
finally:
    # Close the run even when an iteration raises, so a failed sweep does not
    # leave an open W&B run behind for the next cell.
    wandb.finish()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33me12409115[0m ([33monline_sexism_detection[0m). Use [1m`wandb login --relogin`[0m to force relogin




SVM Results:
[[2965   65]
 [ 600  370]]
              precision    recall  f1-score   support

           0       0.83      0.98      0.90      3030
           1       0.85      0.38      0.53       970

    accuracy                           0.83      4000
   macro avg       0.84      0.68      0.71      4000
weighted avg       0.84      0.83      0.81      4000





SVM Results:
[[2948   82]
 [ 574  396]]
              precision    recall  f1-score   support

           0       0.84      0.97      0.90      3030
           1       0.83      0.41      0.55       970

    accuracy                           0.84      4000
   macro avg       0.83      0.69      0.72      4000
weighted avg       0.83      0.84      0.81      4000





SVM Results:
[[2965   65]
 [ 607  363]]
              precision    recall  f1-score   support

           0       0.83      0.98      0.90      3030
           1       0.85      0.37      0.52       970

    accuracy                           0.83      4000
   macro avg       0.84      0.68      0.71      4000
weighted avg       0.83      0.83      0.81      4000





SVM Results:
[[2963   67]
 [ 588  382]]
              precision    recall  f1-score   support

           0       0.83      0.98      0.90      3030
           1       0.85      0.39      0.54       970

    accuracy                           0.84      4000
   macro avg       0.84      0.69      0.72      4000
weighted avg       0.84      0.84      0.81      4000





SVM Results:
[[2965   65]
 [ 607  363]]
              precision    recall  f1-score   support

           0       0.83      0.98      0.90      3030
           1       0.85      0.37      0.52       970

    accuracy                           0.83      4000
   macro avg       0.84      0.68      0.71      4000
weighted avg       0.83      0.83      0.81      4000





SVM Results:
[[2963   67]
 [ 598  372]]
              precision    recall  f1-score   support

           0       0.83      0.98      0.90      3030
           1       0.85      0.38      0.53       970

    accuracy                           0.83      4000
   macro avg       0.84      0.68      0.71      4000
weighted avg       0.84      0.83      0.81      4000



0,1
accuracy,▄█▁█▁▄
balanced_accuracy,▃█▁▆▁▃
f1,▃█▁▆▁▃

0,1
accuracy,0.83375
balanced_accuracy,0.6807
f1,0.52803


## Word2vec

In [None]:
# Word2Vec search grid expressed as (vector_size, window) pairs; min_count is
# held at 1 for every configuration.
word2vec_params_list = [
    {'vector_size': dim, 'window': context, 'min_count': 1}
    for dim, context in ((200, 5), (300, 5), (300, 10), (300, 20))
]

In [None]:
def vectorize_sentences(sentences, model):
    """Turn tokenised sentences into fixed-size vectors by averaging word vectors.

    Args:
        sentences: iterable of token lists.
        model: trained Word2Vec model; `model.wv` is used for lookups and
            `model.vector_size` for the fallback vector length.

    Returns:
        np.ndarray with one row per sentence. A sentence with no token found
        in `model.wv` is represented by an all-zero vector.
    """
    vectors = []
    for sentence in sentences:
        word_vectors = [model.wv[word] for word in sentence if word in model.wv]
        if word_vectors:
            vectors.append(np.mean(word_vectors, axis=0))
        else:
            vectors.append(np.zeros(model.vector_size))
    return np.array(vectors)


# Whitespace tokenisation does not depend on the Word2Vec hyper-parameters, so
# it is done once here instead of once per configuration.
X_processed_train = X_train.apply(lambda x: x.split()).tolist()
X_processed_test = X_test.apply(lambda x: x.split()).tolist()

# NOTE(review): the output recorded under this cell lists vector_size 50 and
# 100, which are not in word2vec_params_list, and every configuration reports
# the same metrics with recall 1.00 on class 1 and 0.00 on class 0. Re-run the
# cell and investigate before relying on these numbers.
#
# The run name previously started with a stray space (" word2vec_svm").
wandb.init(project="online_sexism_detection", name="word2vec_svm")
try:
    for word2vec_params in word2vec_params_list:
        # Embeddings are trained on the training split only.
        word2vec_model = Word2Vec(sentences=X_processed_train, **word2vec_params)

        X_train_vectors = vectorize_sentences(X_processed_train, word2vec_model)
        X_test_vectors = vectorize_sentences(X_processed_test, word2vec_model)

        # Oversample the minority class in the training data only.
        smote = SMOTE(random_state=42)
        X_resampled_w2v, y_resampled_w2v = smote.fit_resample(X_train_vectors, y_train)

        # Train the SVM model with Word2Vec
        svm_model_w2v = SVC(random_state=42, class_weight='balanced', probability=True)
        svm_model_w2v.fit(X_resampled_w2v, y_resampled_w2v)

        # Make predictions with SVM (Word2Vec)
        y_pred_svm_w2v = svm_model_w2v.predict(X_test_vectors)

        # Evaluate
        print(f"Word2Vec Params: {word2vec_params}")
        print("Accuracy:", accuracy_score(y_test, y_pred_svm_w2v))
        print(classification_report(y_test, y_pred_svm_w2v))

        # Log results to Weights & Biases
        wandb.log({
            'word2vec_params': word2vec_params,
            "f1": f1_score(y_test, y_pred_svm_w2v),
            "balanced_accuracy": balanced_accuracy_score(y_test, y_pred_svm_w2v),
            "accuracy": accuracy_score(y_test, y_pred_svm_w2v),
        })
finally:
    # Close the run even when an iteration raises, so a failed sweep does not
    # leave an open W&B run behind.
    wandb.finish()



Word2Vec Params: {'vector_size': 50, 'window': 5, 'min_count': 1}
Accuracy: 0.24275
              precision    recall  f1-score   support

           0       1.00      0.00      0.00      3030
           1       0.24      1.00      0.39       970

    accuracy                           0.24      4000
   macro avg       0.62      0.50      0.20      4000
weighted avg       0.82      0.24      0.10      4000





Word2Vec Params: {'vector_size': 100, 'window': 5, 'min_count': 1}
Accuracy: 0.24275
              precision    recall  f1-score   support

           0       1.00      0.00      0.00      3030
           1       0.24      1.00      0.39       970

    accuracy                           0.24      4000
   macro avg       0.62      0.50      0.20      4000
weighted avg       0.82      0.24      0.10      4000





Word2Vec Params: {'vector_size': 200, 'window': 5, 'min_count': 1}
Accuracy: 0.24275
              precision    recall  f1-score   support

           0       1.00      0.00      0.00      3030
           1       0.24      1.00      0.39       970

    accuracy                           0.24      4000
   macro avg       0.62      0.50      0.20      4000
weighted avg       0.82      0.24      0.10      4000





Word2Vec Params: {'vector_size': 300, 'window': 5, 'min_count': 1}
Accuracy: 0.24275
              precision    recall  f1-score   support

           0       1.00      0.00      0.00      3030
           1       0.24      1.00      0.39       970

    accuracy                           0.24      4000
   macro avg       0.62      0.50      0.20      4000
weighted avg       0.82      0.24      0.10      4000





Word2Vec Params: {'vector_size': 300, 'window': 10, 'min_count': 1}
Accuracy: 0.24275
              precision    recall  f1-score   support

           0       1.00      0.00      0.00      3030
           1       0.24      1.00      0.39       970

    accuracy                           0.24      4000
   macro avg       0.62      0.50      0.20      4000
weighted avg       0.82      0.24      0.10      4000





Word2Vec Params: {'vector_size': 300, 'window': 20, 'min_count': 1}
Accuracy: 0.24275
              precision    recall  f1-score   support

           0       1.00      0.00      0.00      3030
           1       0.24      1.00      0.39       970

    accuracy                           0.24      4000
   macro avg       0.62      0.50      0.20      4000
weighted avg       0.82      0.24      0.10      4000



0,1
accuracy,▁▁▁▁▁▁
balanced_accuracy,▁▁▁▁▁▁
f1,▁▁▁▁▁▁

0,1
accuracy,0.24275
balanced_accuracy,0.50017
f1,0.39042
