In [1]:
import pandas as pd
import random

# Read the labelled tweets from the Excel workbook; the two columns are
# exposed under the names 'Text' and 'Label'.
DATASET_PATH = 'tr_Detecting Multilingual Offensive Language in Social Media.xlsx'
COLUMN_NAMES = ['Text', 'Label']
df = pd.read_excel(DATASET_PATH, names=COLUMN_NAMES)

# Quick sanity check: show the first rows of the loaded frame.
print(df.head())

                                                Text  Label
0  @USER en güzel uyuyan insan ödülü jeon jungkoo...      0
1  @USER Mekanı cennet olsun, saygılar sayın avuk...      0
2  Kızlar aranızda kas yığını beylere düşenler ol...      0
3  Biraz ders çalışayım. Tembellik ve uyku düşman...      0
4  @USER Trezeguet yerine El Sharawy daha iyi olm...      0


In [2]:
import torch
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
# Import necessary libraries
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split

# Drop rows with missing values in the 'Text' column.
# dropna(inplace=True) on df['Text'] would only touch the extracted Series and
# leave df's rows in place, so the filtered frame is re-assigned instead.
# .copy() makes the result an independent frame so the column assignment
# below cannot raise a SettingWithCopyWarning.
df = df.dropna(subset=['Text']).copy()

# Convert text to string data type (non-string cells, e.g. numbers, become text)
df['Text'] = df['Text'].apply(str)

# Define preprocessing functions
def normalize(text):
    """Strip social-media noise from a tweet and lowercase it.

    Steps, in order:
      1. remove URLs (anything starting with ``http`` up to the next space),
      2. remove ``@mentions``,
      3. remove ``#hashtags``,
      4. remove every character that is neither a word character nor
         whitespace (i.e. punctuation and symbols),
      5. lowercase using Turkish casing rules.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned, lowercased string. Whitespace left behind by removed
        tokens is not collapsed.
    """
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\S+', '', text)
    text = re.sub(r'#\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # str.lower() is locale-independent: it maps 'I' to 'i' (Turkish needs the
    # dotless 'ı') and 'İ' to 'i' followed by a combining dot (U+0307).
    # Map the two Turkish capitals explicitly before lowercasing the rest.
    text = text.replace('I', 'ı').replace('İ', 'i')
    text = text.lower()
    return text

# Lazily populated cache of the Turkish stop-word list, so the NLTK corpus is
# loaded once instead of once per processed row.
_TURKISH_STOP_WORDS = None


def remove_stopwords(text):
    """Remove Turkish stop words from ``text``.

    The text is split with ``word_tokenize`` and every token that appears in
    NLTK's ``'turkish'`` stop-word list is dropped. Matching is exact, so the
    input is expected to be lowercased already.

    Args:
        text: The string to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    global _TURKISH_STOP_WORDS
    if _TURKISH_STOP_WORDS is None:
        _TURKISH_STOP_WORDS = frozenset(stopwords.words('turkish'))
    words = word_tokenize(text)
    return ' '.join(word for word in words if word not in _TURKISH_STOP_WORDS)

def stem(text):
    """Stem every token of ``text`` with NLTK's ``PorterStemmer``.

    Args:
        text: The string to stem; it is split with ``word_tokenize``.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    # NOTE(review): the Porter algorithm targets English, while the stop-word
    # step in this file uses the Turkish list — confirm this stemmer is the
    # intended choice for this data.
    porter = PorterStemmer()
    tokens = word_tokenize(text)
    return ' '.join(map(porter.stem, tokens))

def preprocess(text):
    """Run the full cleaning pipeline on one piece of text.

    The stages are applied in this order: ``normalize``, then
    ``remove_stopwords``, then ``stem``.

    Args:
        text: The raw string.

    Returns:
        The string after all three stages.
    """
    cleaned = normalize(text)
    without_stopwords = remove_stopwords(cleaned)
    return stem(without_stopwords)

# Clean every tweet with the pipeline defined above (overwrites the column).
df["Text"] = df["Text"].apply(preprocess)

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
train_df, val_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)



  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RAGHADBIRECILI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RAGHADBIRECILI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from transformers import GPT2Config
from transformers import GPT2ForSequenceClassification

# Pretrained GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Start from the stock GPT-2 configuration and copy over the tokenizer's
# padding id.
# NOTE(review): the pad token is only registered on the tokenizer in a later
# cell, so this value is presumably None at this point — confirm.
model_config = GPT2Config.from_pretrained("gpt2")
model_config.pad_token_id = tokenizer.pad_token_id

# GPT-2 body with a sequence-classification head built from that config.
model = GPT2ForSequenceClassification.from_pretrained(
    "gpt2",
    config=model_config,
)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Register a dedicated padding token on the tokenizer.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# The tokenizer's vocabulary just grew, so the model's embedding matrix must
# grow with it; otherwise the new token id has no embedding row.
model.resize_token_embeddings(len(tokenizer))

# Publish the pad id on both config objects: the standalone one created
# earlier and the one the loaded model actually reads (these are not
# guaranteed to be the same object).
model_config.pad_token_id = tokenizer.pad_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenize and encode the training text, one row at a time. Padding is not
# requested here: each call encodes a single sequence, and fixed-length
# padding is applied afterwards.
encoded_text = train_df["Text"].apply(
    lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True)
)

In [5]:
# Fixed length that every encoded sequence is padded or truncated to.
MAX_LEN = 512


def pad_sequence(sequence, pad_value=0):
    """Return a copy of ``sequence`` that is exactly ``MAX_LEN`` items long.

    Shorter sequences are right-padded with ``pad_value``; longer ones are
    truncated on the right. The input is never modified.

    Args:
        sequence: A sequence of token ids.
        pad_value: Value used for padding. Defaults to 0, which is what this
            function always used before the parameter existed.
            NOTE(review): the tokenizer's ``pad_token_id`` is presumably the
            more faithful choice — confirm before changing call sites.

    Returns:
        A new list of length ``MAX_LEN``.
    """
    # Slice first so a fresh list is built and the caller's list is untouched.
    clipped = list(sequence[:MAX_LEN])
    clipped.extend([pad_value] * (MAX_LEN - len(clipped)))
    return clipped

# Bring every encoded training row to the common length and stack the rows
# into a single tensor.
padded_train_ids = [pad_sequence(token_ids) for token_ids in encoded_text]
inputs = torch.tensor(padded_train_ids)
labels = torch.tensor(train_df["Label"].values)

# Pair inputs with labels and serve them in shuffled mini-batches of 8.
dataset = torch.utils.data.TensorDataset(inputs, labels)
dataloader = torch.utils.data.DataLoader(
    dataset,
    shuffle=True,
    batch_size=8,
)


In [6]:
# Training hyper-parameters: number of passes over the training data, the
# classification loss, and the optimizer over all model parameters.
epochs = 3
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
)

In [7]:

# Encode the validation text with the same tokenizer settings as the
# training text.
val_encoded_text = val_df["Text"].apply(
    lambda text: tokenizer.encode(text, add_special_tokens=True, truncation=True, padding=True)
)

# Pad/truncate to the common length, then wrap inputs and labels in a
# dataset served in shuffled mini-batches of 8.
padded_val_ids = [pad_sequence(token_ids) for token_ids in val_encoded_text]
val_inputs = torch.tensor(padded_val_ids)
val_labels = torch.tensor(val_df["Label"].values)
val_dataset = torch.utils.data.TensorDataset(val_inputs, val_labels)
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    shuffle=True,
    batch_size=8,
)



In [8]:

from torch.utils.data import DataLoader
from tqdm import tqdm

# Rebuild the validation loader with a larger batch and a fixed order;
# shuffling is not needed for evaluation.
batch_size = 32
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

for epoch in range(epochs):
    print("Epoch:", epoch + 1)

    # ---- Training pass ----
    model.train()
    total_loss = 0
    with tqdm(total=len(dataloader), desc=f'Epoch {epoch+1}/{epochs}', unit='batch') as pbar:
        # Batch-local names are used on purpose: unpacking into `labels`
        # would overwrite the module-level label tensor built for the dataset.
        for batch_input_ids, batch_labels in dataloader:
            optimizer.zero_grad()
            # Labels are not passed to the model: the loss is computed once,
            # by loss_fn, rather than a second time inside the forward pass.
            outputs = model(batch_input_ids)
            loss = loss_fn(outputs.logits, batch_labels)
            batch_loss = loss.item()
            total_loss += batch_loss
            loss.backward()
            optimizer.step()
            pbar.update(1)
            pbar.set_postfix({'loss': batch_loss})

    # ---- Validation pass (after the progress bar has closed) ----
    model.eval()
    total = 0
    correct = 0
    with torch.no_grad():
        for batch_input_ids, batch_labels in val_dataloader:
            outputs = model(batch_input_ids)
            # Predicted class = index of the largest logit in each row.
            predicted = outputs.logits.argmax(dim=1)
            total += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()

    val_acc = correct / total
    avg_loss = total_loss / len(dataloader)
    print("Epoch:", epoch + 1, "Training Loss:",avg_loss, "Validation Accuracy:", val_acc)
    

Epoch: 1


Epoch 1/3: 100%|██████████| 3128/3128 [23:07:03<00:00, 26.61s/batch, loss=0.486]   


Epoch: 1 Training Loss: 0.4821086918025294 Validation Accuracy: 0.8209718670076727
Epoch: 2


Epoch 2/3: 100%|██████████| 3128/3128 [22:50:21<00:00, 26.29s/batch, loss=0.591]    


Epoch: 2 Training Loss: 0.4041874845228765 Validation Accuracy: 0.8321611253196931
Epoch: 3


Epoch 3/3: 100%|██████████| 3128/3128 [21:38:37<00:00, 24.91s/batch, loss=0.107]   

Epoch: 3 Training Loss: 0.36420912670609934 Validation Accuracy: 0.8308823529411765





In [9]:
# define a function to predict whether the comment is offensive or not
def predict_offensiveness(model, tokenizer, comment):
    """Classify a single comment with ``model``.

    Args:
        model: A sequence-classification model whose output exposes
            ``.logits`` with one row per input.
        tokenizer: The tokenizer matching ``model``; it is called on the
            comment and must return ``input_ids`` and ``attention_mask``.
        comment: The text to classify.

    Returns:
        The index of the highest-scoring class as a Python ``int``. The
        callers in this notebook treat 0 as "not offensive" and any other
        value as "offensive".
    """
    # NOTE(review): the training text goes through preprocess() before
    # tokenization, while `comment` is tokenized as-is here — confirm
    # whether the same cleaning should be applied at inference time.
    model.eval()

    # Use the device the model already lives on instead of a global `device`
    # variable, so the function works regardless of cell execution order.
    model_device = next(model.parameters()).device

    # Padding is not requested: a single sequence has nothing to be padded to.
    encoded_comment = tokenizer(comment, truncation=True, max_length=512, return_tensors='pt')
    input_ids = encoded_comment['input_ids'].to(model_device)
    attention_mask = encoded_comment['attention_mask'].to(model_device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Softmax is monotonic, so the arg-max of the raw logits is the same class
    # the arg-max of the probabilities would give.
    predicted_class = outputs.logits.argmax(dim=1)
    return predicted_class.item()


In [10]:
import torch

# Pick the GPU when one is available, otherwise fall back to the CPU, and
# move the model onto that device.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
model.to(device)

# Rebuild the training loader with the larger batch size, two worker
# processes and pinned host memory.
# NOTE(review): no later cell in view iterates this loader — confirm it is
# still needed.
dataloader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=batch_size,
    pin_memory=True,
    num_workers=2,
)


In [11]:

# Example 1: classify a sample comment and report the verdict.
comment = "Allah belanı versin iş yerinde açtım bütün ofis bana bakıyor"
predicted_class = predict_offensiveness(model, tokenizer, comment)
verdict = (
    "The comment is not offensive."
    if predicted_class == 0
    else "The comment is offensive."
)
print(verdict)

The comment is offensive.


In [12]:

# Example 2: classify a second sample comment and report the verdict.
comment = "Çok kötü inşallah kimseye bişey olmamıştır."
predicted_class = predict_offensiveness(model, tokenizer, comment)
verdict = (
    "The comment is not offensive."
    if predicted_class == 0
    else "The comment is offensive."
)
print(verdict)

The comment is not offensive.
