In [None]:
import psycopg2
import pandas as pd
import os
import psycopg2
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel
from dotenv import load_dotenv


# Load variables from a .env file (python-dotenv) into the process environment.
load_dotenv()

# Database connection settings. Each lookup yields None when the variable is unset.
DB_HOST = os.environ.get("DB_HOST")
DB_PORT = os.environ.get("DB_PORT")
DB_NAME = os.environ.get("DB_NAME")
DB_USER = os.environ.get("DB_USER")
DB_PASSWORD = os.environ.get("DB_PASSWORD")
# NOTE(review): TABLE_NAME is read here but not referenced by the visible cells.
TABLE_NAME = os.environ.get("TABLE_NAME")



def fetch_data_from_db():
    """Load the labelled sentences from PostgreSQL into a DataFrame.

    Connects with the module-level ``DB_*`` settings (SSL required) and reads
    the ``sentence`` and ``"isToxic"`` columns of ``vectorize.sentences``.

    Returns:
        pandas.DataFrame: the query result, or ``None`` when connecting or
        querying fails (the error is printed instead of being raised).
    """
    query = 'SELECT sentence, "isToxic" FROM vectorize.sentences;'

    try:
        connection = psycopg2.connect(
            host=DB_HOST,
            port=DB_PORT,
            dbname=DB_NAME,
            user=DB_USER,
            password=DB_PASSWORD,
            sslmode='require'
        )
    except Exception as e:
        print(f"Error connecting to the database: {e}")
        return None

    print("Connected to the database successfully!")

    try:
        return pd.read_sql_query(query, connection)
    except Exception as e:
        # Distinguish query failures from connection failures in the message.
        print(f"Error reading from the database: {e}")
        return None
    finally:
        # Always release the connection, including when the query raises.
        # (psycopg2's `with connection:` only ends the transaction; it does
        # not close the connection, so close it explicitly.)
        connection.close()


df = fetch_data_from_db()
if df is None:
    # fetch_data_from_db() prints the underlying error and returns None on
    # failure; stop here instead of failing later with an AttributeError.
    raise RuntimeError("Could not load sentences from the database; see the error printed above.")
df.head()


Connected to the database successfully!


  df = pd.read_sql_query(query, connection)


Unnamed: 0,sentence,isToxic
0,Epstein and trump were best buds!!! Pedophiles...,True
1,Hang from the ceiling,True
2,Kill yourself,True
3,Go buy a rope,True
4,I love you,False


In [None]:
# The path is resolved relative to the kernel's working directory.
csv = pd.read_csv('toxicity_en.csv').rename(
    columns={'text': 'sentence', 'is_toxic': 'isToxic'}
)

# Convert the string labels to booleans in the label column only. A
# DataFrame-wide replace would also rewrite any sentence whose full text is
# exactly "Toxic" or "Not Toxic". Values outside the mapping are left as-is.
label_to_bool = {'Toxic': True, 'Not Toxic': False}
csv['isToxic'] = csv['isToxic'].map(lambda label: label_to_bool.get(label, label))

# DataFrame.info() prints its summary itself and returns None, so it is not
# wrapped in print().
csv.info()
csv.head()

FileNotFoundError: [Errno 2] No such file or directory: '/content/toxicity_en.csv'

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import torch.nn as nn



# Hold out 20% of the rows for validation; random_state fixes the shuffle.
train_df, valid_df = train_test_split(csv, test_size=0.2, random_state=42)

# Cast the labels to integer class ids. assign() builds a new frame rather than
# writing into the frames returned by the split, which avoids
# SettingWithCopyWarning.
train_df = train_df.assign(isToxic=train_df['isToxic'].astype(int))
valid_df = valid_df.assign(isToxic=valid_df['isToxic'].astype(int))

class BERTDataset:
    """Indexable dataset that tokenizes one text per lookup for BERT.

    Args:
        texts: indexable collection of texts; each item is passed through
            ``str()`` before tokenization.
        labels: indexable collection of labels, aligned with ``texts``.
        max_len: length every example is padded or truncated to.
    """

    def __init__(self, texts, labels, max_len=128):
        self.texts = texts
        self.labels = labels
        self.max_len = max_len
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return its tensors.

        Returns:
            dict with ``ids``, ``mask``, ``token_type_ids`` and ``target``,
            each a ``torch.long`` tensor.
        """
        raw_text = str(self.texts[idx])
        raw_label = self.labels[idx]

        encoding = self.tokenizer(
            raw_text,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_len,
            truncation=True,
        )

        # (output key, tokenizer key) pairs, in the order the keys are emitted.
        field_pairs = (
            ('ids', 'input_ids'),
            ('mask', 'attention_mask'),
            ('token_type_ids', 'token_type_ids'),
        )
        sample = {
            out_key: torch.tensor(encoding[enc_key], dtype=torch.long)
            for out_key, enc_key in field_pairs
        }
        sample['target'] = torch.tensor(raw_label, dtype=torch.long)
        return sample

# Wrap each split's sentence and label arrays in a tokenizing dataset.
train_dataset = BERTDataset(train_df['sentence'].values, train_df['isToxic'].values)
valid_dataset = BERTDataset(valid_df['sentence'].values, valid_df['isToxic'].values)

# Batches of 32; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=32, shuffle=False)

class ToxicModel(nn.Module):
    """BERT encoder followed by dropout and a linear head producing 2 logits."""

    def __init__(self):
        super().__init__()
        # return_dict=False makes the encoder return a tuple, which forward()
        # unpacks as (sequence_output, pooled_output).
        self.bert = BertModel.from_pretrained("bert-base-uncased", return_dict=False)
        self.dropout = nn.Dropout(0.2)
        # Size the head from the encoder config rather than hard-coding 768.
        self.classifier = nn.Linear(self.bert.config.hidden_size, 2)

    def forward(self, input_ids, token_type_ids, attention_mask, labels=None):
        """Return class logits for a batch.

        Args:
            input_ids: token id tensor.
            token_type_ids: segment id tensor.
            attention_mask: tensor marking real tokens versus padding.
            labels: unused; kept so existing callers remain compatible.

        Returns:
            Tensor of logits with 2 values per example.
        """
        # Pass by keyword: BertModel.forward's positional order is
        # (input_ids, attention_mask, token_type_ids), so passing
        # (input_ids, token_type_ids, attention_mask) positionally swaps the
        # mask and the segment ids and leaves padding unmasked.
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits


def _run_batch(model, batch, criterion, device):
    """Move one batch to ``device`` and return ``(loss, predicted, target)``."""
    ids = batch['ids'].to(device)
    mask = batch['mask'].to(device)
    token_type_ids = batch['token_type_ids'].to(device)
    target = batch['target'].to(device)

    # The model is called positionally as (ids, token_type_ids, mask).
    logits = model(ids, token_type_ids, mask)
    loss = criterion(logits, target)
    return loss, logits.argmax(dim=1), target


def train_model(model, train_loader, valid_loader, epochs=1, lr=1e-5, class_weights=(0.7, 1.3)):
    """Train ``model`` and print per-epoch training and validation metrics.

    Args:
        model: module called as ``model(ids, token_type_ids, mask)`` and
            returning per-class logits.
        train_loader: loader yielding dict batches with ``ids``, ``mask``,
            ``token_type_ids`` and ``target``.
        valid_loader: loader with the same batch layout, used for evaluation.
        epochs: number of passes over ``train_loader``.
        lr: Adam learning rate.
        class_weights: per-class weights for the cross-entropy loss, in class
            index order.

    Returns:
        None. Metrics are printed once per epoch.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    weights = torch.as_tensor(class_weights, dtype=torch.float, device=device)
    criterion = nn.CrossEntropyLoss(weight=weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        for batch in train_loader:
            optimizer.zero_grad()
            loss, predicted, target = _run_batch(model, batch, criterion, device)
            loss.backward()
            optimizer.step()

            batch_size = target.size(0)
            train_correct += (predicted == target).sum().item()
            train_total += batch_size
            # criterion returns a batch mean; weight it by batch size so the
            # epoch figure is a per-example average.
            train_loss += loss.item() * batch_size

        train_loss = train_loss / len(train_loader.dataset)
        train_accuracy = 100 * train_correct / train_total

        # ---- validation pass ----
        model.eval()
        valid_loss = 0.0
        valid_correct = 0
        valid_total = 0
        valid_pred = []
        valid_true = []

        with torch.no_grad():
            for batch in valid_loader:
                loss, predicted, target = _run_batch(model, batch, criterion, device)

                batch_size = target.size(0)
                valid_correct += (predicted == target).sum().item()
                valid_total += batch_size
                valid_loss += loss.item() * batch_size
                valid_pred.extend(predicted.cpu().tolist())
                valid_true.extend(target.cpu().tolist())

        valid_loss = valid_loss / len(valid_loader.dataset)
        valid_accuracy = 100 * valid_correct / valid_total

        # zero_division=0 reports 0 instead of warning when a class is never
        # predicted or never present.
        valid_precision = precision_score(valid_true, valid_pred, zero_division=0)
        valid_recall = recall_score(valid_true, valid_pred, zero_division=0)
        valid_f1 = f1_score(valid_true, valid_pred, zero_division=0)

        # The sklearn scores are fractions in [0, 1]; the ".2%" format scales
        # them so the printed "%" is accurate.
        print(f"Epoch {epoch+1}/{epochs}.. Train loss: {train_loss:.3f}.. Train accuracy: {train_accuracy:.2f}%.. "
              f"Validation loss: {valid_loss:.3f}.. Validation accuracy: {valid_accuracy:.2f}%.. "
              f"Precision: {valid_precision:.2%}.. Recall: {valid_recall:.2%}.. F1: {valid_f1:.2%}")


# Seed PyTorch's global RNG so the classifier-head initialisation, dropout
# masks and DataLoader shuffling are repeatable between runs (42 matches the
# random_state used for train_test_split). GPU kernels may still introduce
# small nondeterminism.
torch.manual_seed(42)

# Initialize and train the model
model = ToxicModel()

train_model(model, train_loader, valid_loader, epochs=25)

In [None]:
# Assuming the model has already been trained
# Save only the model's state_dict (its parameters and buffers) to
# 'toxic_model.pth' in the current working directory.
torch.save(model.state_dict(), 'toxic_model.pth')
print("Model saved to toxic_model.pth")
