In [None]:
import psycopg2
import pandas as pd

DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
TABLE_NAME = os.getenv("TABLE_NAME")

def fetch_data_from_db():
    try:
        # Connect to the database
        connection = psycopg2.connect(
            host=DB_HOST,
            port=DB_PORT,
            dbname=DB_NAME,
            user=DB_USER,
            password=DB_PASSWORD,
            sslmode='require'
        )
        print("Connected to the database successfully!")

        # Query data
        query = 'SELECT sentence, "isToxic" FROM vectorize.sentences;'
        df = pd.read_sql_query(query, connection)

        # Close the connection
        connection.close()
        return df

    except Exception as e:
        print(f"Error connecting to the database: {e}")
        return None

# Fetch data
df = fetch_data_from_db()
df.head()

Connected to the database successfully!


  df = pd.read_sql_query(query, connection)


Unnamed: 0,sentence,isToxic
0,Epstein and trump were best buds!!! Pedophiles...,True
1,Hang from the ceiling,True
2,Kill yourself,True
3,Go buy a rope,True
4,I love you,False


In [3]:
csv = pd.read_csv('toxicity_en.csv')
#csv.head()
csv = csv.replace('Toxic', True)
csv = csv.replace('Not Toxic', False)
#print(csv.head())
print(csv.info())
csv.rename(columns={'text': 'sentence', 'is_toxic': 'isToxic'}, inplace=True)
print(csv.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      1000 non-null   object
 1   is_toxic  1000 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 8.9+ KB
None
                                            sentence  isToxic
0  Elon Musk is a piece of shit, greedy capitalis...     True
1  The senile credit card shrill from Delaware ne...     True
2  He does that a lot -- makes everyone look good...     True
3                                         F*ck Lizzo     True
4  Epstein and trump were best buds!!! Pedophiles...     True


  csv = csv.replace('Not Toxic', False)


In [None]:
from sklearn.model_selection import train_test_split

dataset = pd.concat([df, csv])
dataset['isToxic'] = dataset['isToxic'].astype(int)
print(dataset.info())
print(dataset.head())

# Split the data into train and validation sets
train_df, valid_df = train_test_split(dataset, test_size=0.2, random_state=42)



<class 'pandas.core.frame.DataFrame'>
Index: 1211 entries, 0 to 999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  1211 non-null   object
 1   isToxic   1211 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 28.4+ KB
None
                                            sentence  isToxic
0  Epstein and trump were best buds!!! Pedophiles...        1
1                              Hang from the ceiling        1
2                                      Kill yourself        1
3                                      Go buy a rope        1
4                                         I love you        0


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
import torch.nn.functional as F
from sklearn.metrics import f1_score, precision_score, recall_score


# Preprocess the data: tokenize and prepare it for GPT-2
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add a padding token to the tokenizer (GPT2 does not have a padding token by default)
tokenizer.pad_token = tokenizer.eos_token

# Convert text into token IDs and truncate/pad accordingly
def tokenize_function(examples):
    return tokenizer(examples['sentence'], padding="max_length", truncation=True, max_length=128)

# Define the custom dataset class
class ToxicityDataset(Dataset):
    def __init__(self, df):
        self.texts = df['sentence'].tolist()
        self.labels = df['isToxic'].tolist()
        self.encodings = tokenizer(self.texts, truncation=True, padding=True, max_length=128)


    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Create custom dataset and DataLoader
train_dataset = ToxicityDataset(train_df)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Load GPT-2 model with a classification head
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)

# Ensure the model's pad token is set correctly
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = model.config.eos_token_id

# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
def train():
    model.train()
    total_loss = 0
    total_correct = 0
    total_samples = 0

    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Calculate training accuracy
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        total_correct += (predictions == labels).sum().item()
        total_samples += labels.size(0)

    avg_loss = total_loss / len(train_loader)
    accuracy = total_correct / total_samples

    # Print metrics
    print(f"Training Loss: {avg_loss}")
    print(f"Training Accuracy: {accuracy}")
    model.train()
   
valid_dataset = ToxicityDataset(valid_df)
valid_loader = DataLoader(valid_dataset, batch_size=8, shuffle=False)

# Modify the evaluation loop to print validation loss and F1 score
def evaluate():
    model.eval()
    total_loss = 0
    total_correct = 0
    total_samples = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in valid_loader:  # Use validation loader here
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)

            total_correct += (predictions == labels).sum().item()
            total_samples += labels.size(0)

            # Store predictions and labels for metric calculations
            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    avg_loss = total_loss / len(valid_loader)
    accuracy = total_correct / total_samples
    f1 = f1_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)

    # Print metrics
    print(f"Validation Loss: {avg_loss}")
    print(f"Validation Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
# Training and evaluation
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train()
    evaluate()

# Save the trained model
model.save_pretrained("gpt2_toxic_classifier_custom")
tokenizer.save_pretrained("gpt2_toxic_classifier_custom")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


In [34]:
def predict(model, tokenizer, sentence, device):
    """
    Predict whether the input sentence is toxic or not using the trained model.
    
    Parameters:
    - model: The trained GPT-2 model for sequence classification.
    - tokenizer: The tokenizer used to tokenize the input text.
    - sentence: The input sentence to classify.
    - device: The device (CPU or GPU) to run the model on.

    Returns:
    - A string indicating whether the sentence is 'Toxic' or 'Not Toxic'.
    """
    
    # Preprocess the sentence (tokenize it)
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)
    
    # Move input tensors to the same device as the model
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Set the model to evaluation mode
    model.eval()

    # Disable gradients for inference
    with torch.no_grad():
        # Pass the input through the model
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Get the predicted class (0 or 1)
        prediction = torch.argmax(logits, dim=-1).item()

    # Convert the prediction to a label (0 -> 'Not Toxic', 1 -> 'Toxic')
    return prediction

# Example usage:
# Assuming your model and tokenizer are already loaded and on the correct device (GPU/CPU)
# Replace these with actual model and tokenizer, and ensure they're loaded on the correct device (CPU/GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sentence = "I love you"

print(type(valid_df))

correct_pred = 0
for index, row in valid_df.iterrows():
    sentence = row['sentence']

    # Call the predict function
    result = predict(model, tokenizer, sentence, device)
    if row['isToxic'] == result:
        correct_pred += 1

print(correct_pred)
print(len(valid_df))
print(float(correct_pred)/float(len(valid_df)))

<class 'pandas.core.frame.DataFrame'>
224
243
0.9218106995884774
