In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from gensim.models import Word2Vec
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Directory that holds the train/test parquet splits. Kept in a single constant so
# the notebook can be pointed at another checkout by editing one line.
DATA_DIR = '/workspaces/online_sexism_detection/data_submission'

# Load the training dataset
train_data = pd.read_parquet(f'{DATA_DIR}/train_dataset.parquet')

# Load the testing dataset
test_data = pd.read_parquet(f'{DATA_DIR}/test_dataset.parquet')

# Fail early, with a readable message, if either split lacks a column used below
for split_name, split_frame in (('train', train_data), ('test', test_data)):
    missing_columns = {'text', 'label'} - set(split_frame.columns)
    if missing_columns:
        raise KeyError(f"{split_name} dataset is missing columns: {sorted(missing_columns)}")

# Define features and labels
X_train = train_data['text']  # Text data for training
y_train = train_data['label']  # Labels for training
X_test = test_data['text']     # Text data for testing
y_test = test_data['label']    # Labels for testing

### TF-IDF with Logistic Regression

In [5]:
# TF-IDF over unigrams and bigrams; vocabulary capped at 5000 features and
# restricted to terms with a document frequency of at least 5
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_features=5000)

# Fit the vectorizer on the training text only, then reuse it for the test text
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Rebalance the training features with SMOTE (the test split is left untouched)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_tfidf, y_train)

# Fit Logistic Regression on the resampled data; fit() returns the estimator itself.
# NOTE(review): the classes should already be balanced after SMOTE, so
# class_weight='balanced' presumably has little effect here - confirm.
logistic_model = LogisticRegression(class_weight='balanced', max_iter=500).fit(
    X_resampled, y_resampled
)

# Predict labels for the test split
y_pred_logistic = logistic_model.predict(X_test_tfidf)

# Report: header, confusion matrix, then per-class metrics (one item per line)
print(
    "Logistic Regression Results:",
    confusion_matrix(y_test, y_pred_logistic),
    classification_report(y_test, y_pred_logistic),
    sep="\n",
)

Logistic Regression Results:
[[2588  442]
 [ 331  639]]
              precision    recall  f1-score   support

           0       0.89      0.85      0.87      3030
           1       0.59      0.66      0.62       970

    accuracy                           0.81      4000
   macro avg       0.74      0.76      0.75      4000
weighted avg       0.81      0.81      0.81      4000



### TF-IDF with SVM

In [6]:
# Fit an SVM on the SMOTE-resampled TF-IDF features produced by the previous cell;
# fit() returns the estimator itself, so construction and training can be chained
svm_model = SVC(class_weight='balanced', random_state=42).fit(X_resampled, y_resampled)

# Predict labels for the TF-IDF test matrix
y_pred_svm = svm_model.predict(X_test_tfidf)

# Report: header, confusion matrix, then per-class metrics (one item per line)
print(
    "SVM Results:",
    confusion_matrix(y_test, y_pred_svm),
    classification_report(y_test, y_pred_svm),
    sep="\n",
)

SVM Results:
[[2953   77]
 [ 624  346]]
              precision    recall  f1-score   support

           0       0.83      0.97      0.89      3030
           1       0.82      0.36      0.50       970

    accuracy                           0.82      4000
   macro avg       0.82      0.67      0.70      4000
weighted avg       0.82      0.82      0.80      4000



### Word2Vec with Logistic Regression and SVM

In [7]:
# --- Word2Vec Feature Extraction ---
# Tokenize sentences for Word2Vec (plain whitespace split; case and punctuation are kept as-is)
X_processed_train = X_train.apply(lambda x: x.split()).tolist()  # Tokenize sentences for training
X_processed_test = X_test.apply(lambda x: x.split()).tolist()     # Tokenize sentences for testing

# Train Word2Vec model on the training tokens only.
# seed fixes the model's random number generator, and workers=1 removes the
# thread-scheduling jitter that otherwise makes the embeddings (and every metric
# computed from them) differ between runs. Exact reproducibility across fresh
# interpreter launches additionally requires a fixed PYTHONHASHSEED.
word2vec_model = Word2Vec(
    sentences=X_processed_train,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)

# Function to vectorize sentences using Word2Vec
def vectorize_sentences(sentences, model):
    """Embed each tokenized sentence as the mean of its in-vocabulary word vectors.

    Args:
        sentences: Iterable of token lists, one list per sentence.
        model: Object exposing ``wv`` (supports ``token in wv`` and ``wv[token]``)
            and ``vector_size``, e.g. a trained gensim ``Word2Vec`` model.

    Returns:
        A float32 array of shape ``(n_sentences, model.vector_size)``. A sentence
        with no in-vocabulary tokens (including an empty one) gets an all-zero row.
    """
    keyed_vectors = model.wv
    dim = model.vector_size

    rows = []
    for sentence in sentences:
        word_vectors = [keyed_vectors[word] for word in sentence if word in keyed_vectors]
        if word_vectors:
            # Average the word vectors of the tokens the model knows
            rows.append(np.mean(word_vectors, axis=0, dtype=np.float32))
        else:
            # Zero fallback uses the same dtype as the averaged rows, so a single
            # out-of-vocabulary sentence no longer upcasts the whole matrix to float64
            rows.append(np.zeros(dim, dtype=np.float32))

    if not rows:
        # Keep the result 2-D for empty input so downstream estimators see (0, dim)
        return np.empty((0, dim), dtype=np.float32)
    return np.vstack(rows)

# Turn every tokenized text into a single Word2Vec-based feature vector
X_train_vectors = vectorize_sentences(X_processed_train, word2vec_model)
X_test_vectors = vectorize_sentences(X_processed_test, word2vec_model)

# Rebalance the training vectors with SMOTE (the test split is left untouched)
smote = SMOTE(random_state=42)
X_resampled_w2v, y_resampled_w2v = smote.fit_resample(X_train_vectors, y_train)

# --- Logistic Regression on the Word2Vec features ---
# fit() returns the estimator itself, so construction and training can be chained
logistic_model_w2v = LogisticRegression(class_weight='balanced', max_iter=500).fit(
    X_resampled_w2v, y_resampled_w2v
)
y_pred_logistic_w2v = logistic_model_w2v.predict(X_test_vectors)

# Report: header, confusion matrix, then per-class metrics (one item per line)
print(
    "Logistic Regression Results (Word2Vec):",
    confusion_matrix(y_test, y_pred_logistic_w2v),
    classification_report(y_test, y_pred_logistic_w2v),
    sep="\n",
)

# --- SVM on the Word2Vec features ---
svm_model_w2v = SVC(class_weight='balanced', random_state=42).fit(
    X_resampled_w2v, y_resampled_w2v
)
y_pred_svm_w2v = svm_model_w2v.predict(X_test_vectors)

# Same report layout as above, for the SVM predictions
print(
    "SVM Results (Word2Vec):",
    confusion_matrix(y_test, y_pred_svm_w2v),
    classification_report(y_test, y_pred_svm_w2v),
    sep="\n",
)

Logistic Regression Results (Word2Vec):
[[1612 1418]
 [ 361  609]]
              precision    recall  f1-score   support

           0       0.82      0.53      0.64      3030
           1       0.30      0.63      0.41       970

    accuracy                           0.56      4000
   macro avg       0.56      0.58      0.53      4000
weighted avg       0.69      0.56      0.59      4000

SVM Results (Word2Vec):
[[1349 1681]
 [ 266  704]]
              precision    recall  f1-score   support

           0       0.84      0.45      0.58      3030
           1       0.30      0.73      0.42       970

    accuracy                           0.51      4000
   macro avg       0.57      0.59      0.50      4000
weighted avg       0.70      0.51      0.54      4000

