# SVM

In [20]:
import numpy as np
import pandas as pd
import os

import seaborn as sns
import yaml
import wandb

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score, balanced_accuracy_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from gensim.models import Word2Vec

In [21]:
import sys
# Add the parent directory to the module search path before importing the
# project-local data loader from `utils.load_data` (presumably `utils` lives
# one level above this notebook — confirm).
sys.path.append('..')
from utils.load_data import load_processed_data

## Load data

In [22]:
# Fetch the processed dataset and pick out its three splits by key.
df = load_processed_data()
train_data, test_data, dev_data = df["train"], df["test"], df["dev"]

# Inputs come from the "lemma" column, targets from the "label" column.
X_train, X_test, X_dev = train_data["lemma"], test_data["lemma"], dev_data["lemma"]
y_train, y_test, y_dev = train_data["label"], test_data["label"], dev_data["label"]

Data loaded


## TF-IDF

In [23]:
# TF-IDF settings to sweep: every combination of vocabulary size and n-gram
# range, with the minimum document frequency fixed at 5. The outer loop runs
# over vocabulary size so the order is 3000, 5000, 10000, each with unigrams
# first and then unigrams + bigrams.
tfidf_params_list = [
    {'max_features': vocab_size, 'ngram_range': ngrams, 'min_df': 5}
    for vocab_size in (3000, 5000, 10000)
    for ngrams in ((1, 1), (1, 2))
]

In [30]:
# Sweep over the TF-IDF configurations. For each one: vectorise the text,
# oversample the training set with SMOTE, fit an SVM, print the evaluation and
# log the metrics to Weights & Biases.
#
# The run is opened as a context manager so it is finished even if one of the
# iterations raises; previously `wandb.finish()` was skipped on error.
#
# NOTE(review): metrics are computed on the test split. If this sweep is used
# to choose a configuration, consider scoring on X_dev / y_dev instead and
# keeping the test split for the final report.
with wandb.init(project="online_sexism_detection", name="tfidf_svm") as run:
    for tfidf_params in tfidf_params_list:
        # Fit the vocabulary on the training text only, then reuse it for test.
        vectorizer = TfidfVectorizer(**tfidf_params)
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)

        # Apply SMOTE to handle class imbalance (training data only).
        smote = SMOTE(random_state=42)
        X_resampled, y_resampled = smote.fit_resample(X_train_tfidf, y_train)

        # Train the SVM model. `probability=True` is intentionally not set:
        # only `predict` is used below, and enabling probability estimates
        # makes `fit` run an extra internal 5-fold cross-validation.
        svm_model = SVC(random_state=42, class_weight='balanced')
        svm_model.fit(X_resampled, y_resampled)

        # Make predictions with SVM
        y_pred_svm = svm_model.predict(X_test_tfidf)

        # Compute each metric once so the logged values are built in one place.
        metrics = {
            "f1": f1_score(y_test, y_pred_svm),
            "balanced_accuracy": balanced_accuracy_score(y_test, y_pred_svm),
            "accuracy": accuracy_score(y_test, y_pred_svm),
        }

        # Evaluate the SVM model; print the configuration so that the six
        # result blocks can be told apart in the cell output.
        print(f"TF-IDF params: {tfidf_params}")
        print("SVM Results:")
        print(confusion_matrix(y_test, y_pred_svm))
        print(classification_report(y_test, y_pred_svm))

        # Log results
        run.log({'tfidf_params': tfidf_params, **metrics})


AttributeError: module 'wandb' has no attribute 'init'