In [6]:

import json
import pandas as pd
import yaml
import pickle
import numpy as np
import wandb
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, balanced_accuracy_score, accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from gensim.models import Word2Vec

# Import functions from preprocessing module
sys.path.append('..')
from utils.load_data import load_processed_data



NameError: name 'sys' is not defined

## Load data

In [None]:
df = load_processed_data()
train_data = df["train"]
test_data = df["test"]
dev_data = df["dev"]

X_train, y_train = train_data["lemma"], train_data["label"]
X_test, y_test = test_data["lemma"], test_data["label"]
X_dev, y_dev = dev_data["lemma"], dev_data["label"]

## TF-IDF

In [None]:
# Initialize the TF-IDF Vectorizer
tfidf_params_list = [
    {'max_features': 3000, 'ngram_range': (1, 1), 'min_df': 5},
    {'max_features': 3000, 'ngram_range': (1, 2), 'min_df': 5},
    {'max_features': 5000, 'ngram_range': (1, 1), 'min_df': 5},
    {'max_features': 5000, 'ngram_range': (1, 2), 'min_df': 5},
    {'max_features': 10000, 'ngram_range': (1, 1), 'min_df': 5},
    {'max_features': 10000, 'ngram_range': (1, 2), 'min_df': 5}
]

In [None]:
wandb.init(project="online_sexism_detection", name="tfidf_logistic_regression") 

for tfidf_params in tfidf_params_list:
    vectorizer = TfidfVectorizer(**tfidf_params)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_tfidf, y_train)

    logistic_model = LogisticRegression(max_iter=500, class_weight='balanced')
    logistic_model.fit(X_resampled, y_resampled)

    y_pred_logistic = logistic_model.predict(X_test_tfidf)



    # Evaluate
    print(f"TF-IDF Params: {tfidf_params}")
    print("Accuracy:", accuracy_score(y_test, y_pred_logistic))
    print(classification_report(y_test, y_pred_logistic))

    wandb.log({
        'tfidf_params': tfidf_params,
        "f1": f1_score(y_test, y_pred_logistic),
        "balanced_accuracy": balanced_accuracy_score(y_test, y_pred_logistic),
        "accuracy": accuracy_score(y_test, y_pred_logistic),
    })

    df_output_logistic.to_csv('logistic_predictions.csv', index=False)

# Finish Weights & Biases session
wandb.finish()


## Word2Vec

In [None]:
word2vec_params_list = [
    {'vector_size': 50, 'window': 5, 'min_count': 1},
    {'vector_size': 100, 'window': 5, 'min_count': 1},
    {'vector_size': 200, 'window': 5, 'min_count': 1},
    {'vector_size': 300, 'window': 5, 'min_count': 1},
    {'vector_size': 300, 'window':10 , 'min_count': 1},
    {'vector_size': 300, 'window':20 , 'min_count': 1}
]

In [None]:

wandb.init(project="online_sexism_detection", name="word2vec_logistic_regression")

for word2vec_params in word2vec_params_list:
    X_processed_train = X_train.apply(lambda x: x.split()).tolist()
    X_processed_test = X_test.apply(lambda x: x.split()).tolist()

    word2vec_model = Word2Vec(sentences=X_processed_train, **word2vec_params)

    def vectorize_sentences(sentences, model):
        vectors = []
        for sentence in sentences:
            word_vectors = [model.wv[word] for word in sentence if word in model.wv]
            if word_vectors:
                vectors.append(np.mean(word_vectors, axis=0))
            else:
                vectors.append(np.zeros(model.vector_size))
        return np.array(vectors)

    X_train_vectors = vectorize_sentences(X_processed_train, word2vec_model)
    X_test_vectors = vectorize_sentences(X_processed_test, word2vec_model)

    smote = SMOTE(random_state=42)
    X_resampled_w2v, y_resampled_w2v = smote.fit_resample(X_train_vectors, y_train)

    logistic_model_w2v = LogisticRegression(max_iter=500, class_weight='balanced')
    logistic_model_w2v.fit(X_resampled_w2v, y_resampled_w2v)


    y_pred_logistic_w2v = logistic_model_w2v.predict(X_test_vectors)
    
    # Evaluate
    print(f"Word2Vec Params: {word2vec_params}")
    print("Accuracy:", accuracy_score(y_test, y_pred_logistic_w2v))
    print(classification_report(y_test , y_pred_logistic_w2v))

    # Log results to Weights & Biases
    wandb.log({
        'word2vec_params': word2vec_params,
        "f1": f1_score(y_test, y_pred_logistic_w2v),
        "balanced_accuracy": balanced_accuracy_score(y_test, y_pred_logistic_w2v),
        "accuracy": accuracy_score(y_test, y_pred_logistic_w2v),
    })

wandb.finish()