In [None]:
!pip install -U sentence-transformers -q

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import spacy
import string

# Seed NumPy's global random number generator
np.random.seed(2022)

# Sentence Transformers checkpoint used to embed the comments
model = SentenceTransformer('all-MiniLM-L6-v2')

# Read the training CSV, then keep a seeded 10% random sample of its rows to
# reduce computation (change the path or `frac` to suit your environment)
data = pd.read_csv("/content/train.csv.zip")
data = data.sample(frac=0.1, random_state=42)

# spaCy pipeline plus the stop-word and punctuation collections read by
# spacy_tokenizer
nlp = spacy.load("en_core_web_sm")
punctuations = string.punctuation
stop_words = nlp.Defaults.stop_words

def spacy_tokenizer(sentence):
    """Normalize text into a space-joined string of filtered lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Each
    token's lemma is lower-cased and stripped, then discarded when it is
    empty, consists only of characters from ``punctuations``, or appears in
    ``stop_words``.

    Args:
        sentence: Raw text to normalize. Any non-string value (for example a
            NaN produced by a missing CSV cell) is treated as empty text.

    Returns:
        The surviving lemmas joined by single spaces; ``""`` if none survive.
    """
    if not isinstance(sentence, str):
        return ""

    kept = []
    for token in nlp(sentence):
        lemma = token.lemma_.lower().strip()
        # Check character by character: `lemma in punctuations` is a
        # substring test on the punctuation string, so multi-character
        # lemmas such as "..." or "--" would slip through it.
        if not lemma or all(ch in punctuations for ch in lemma):
            continue
        if lemma in stop_words:
            continue
        kept.append(lemma)
    return " ".join(kept)

# Apply tokenizer to comment_text column
data['tokenize'] = data['comment_text'].apply(spacy_tokenizer)

# Encode every tokenized comment with a single encode() call so Sentence
# Transformers can batch the inputs itself, instead of running one forward
# pass per row through Series.apply. The returned 2-D array is split into one
# vector per row (same order as the frame) so the column keeps holding a
# single embedding per comment.
data['embeddings'] = list(model.encode(data['tokenize'].tolist()))

# Features are the per-comment embeddings; the target is the 'toxic' column
X = data['embeddings'].tolist()
y = data['toxic'].tolist()

# Hold out 20% of the rows for testing, stratified on the target
split_parts = train_test_split(X, y, stratify=y, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

# Fit a Logistic Regression classifier on the training portion
LR = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out rows
predicted = LR.predict(X_test)

# Report accuracy, precision and recall on the held-out rows
for label, scorer in (
    ("Logistic Regression Accuracy:", metrics.accuracy_score),
    ("Logistic Regression Precision:", metrics.precision_score),
    ("Logistic Regression Recall:", metrics.recall_score),
):
    print(label, scorer(y_test, predicted))