In [185]:
import pandas as pd
import regex
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')
from transformers import AutoTokenizer, AutoModel

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import os
# import copy

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [207]:
df1 = pd.read_csv("positive.csv")
df2 = pd.read_csv("negative.csv")
df = pd.concat([df1, df2], ignore_index=True)


In [188]:

def separate_emojis(words):
    separated = []
    emoji_pattern = regex.compile(r'\p{Emoji}', flags=regex.UNICODE)

    for word in words:
        current = ""
        for char in word:
            if char.isdigit():
                current += char
            elif emoji_pattern.match(char):
                print(f"Separating emoji: {char}")
                if current:
                    separated.append(current)
                    current = ""
                separated.append(char)
            else:
                current += char
        if current:
            separated.append(current)

    return separated

def is_arabic(word):
    return any('\u0600' <= char <= '\u06FF' for char in word)
def is_english(word):
    return all('a' <= char.lower() <= 'z' for char in word)
def is_emoji(word):
    emoji_pattern = regex.compile(r'\p{Emoji}', flags=regex.UNICODE)
    return any(emoji_pattern.match(char) for char in word)


In [189]:
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")
embedder = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv02")

In [190]:
df['embedding'] = None

for i in range(len(df)):
  text = df.iloc[i]["text"]
  tokens = tokenizer(text, return_tensors="pt")
  embedding = embedder(**tokens) # equivilant to embedder(tokens['input_ids'],tokens['attention_mask'])
  embedding = embedding[0][0].detach().cpu().numpy()

  df.at[i, 'embedding'] = embedding

  print(f"{i} Element has beed embedded with shape {embedding.shape}")

0 Element has beed embedded with shape (115, 768)
1 Element has beed embedded with shape (163, 768)
2 Element has beed embedded with shape (82, 768)
3 Element has beed embedded with shape (30, 768)
4 Element has beed embedded with shape (102, 768)
5 Element has beed embedded with shape (240, 768)
6 Element has beed embedded with shape (76, 768)
7 Element has beed embedded with shape (51, 768)
8 Element has beed embedded with shape (83, 768)
9 Element has beed embedded with shape (23, 768)
10 Element has beed embedded with shape (88, 768)
11 Element has beed embedded with shape (120, 768)
12 Element has beed embedded with shape (97, 768)
13 Element has beed embedded with shape (120, 768)
14 Element has beed embedded with shape (68, 768)
15 Element has beed embedded with shape (15, 768)
16 Element has beed embedded with shape (19, 768)
17 Element has beed embedded with shape (74, 768)
18 Element has beed embedded with shape (19, 768)
19 Element has beed embedded with shape (73, 768)
20 E

In [191]:
df = df.drop("text", axis=1)
df.columns

Index(['label', 'embedding'], dtype='object')

In [192]:
class ModelDataset(Dataset):
  def __init__(self, df, transformation=None):
    self.df = df.copy()

  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    label, embedding = self.df.iloc[idx].values # first return is the label, it's conformed don't change it
    # embedding = embedding.numpy()  # I don't need this
    embedding = embedding
    embedding = torch.from_numpy(embedding)
    label = torch.tensor(label)
    return embedding, label

In [193]:
def collate_fn(batch):
    sequences = [item[0] for item in batch]
    labels = torch.tensor([item[1] for item in batch], dtype=torch.float)
    lengths = torch.tensor([seq.size(0) for seq in sequences], dtype=torch.long)

    padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded_sequences, labels, lengths

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

training_dataset = ModelDataset(train_df)
testing_dataset = ModelDataset(test_df)

train_dataloader = DataLoader(training_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(testing_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)


In [194]:
class Model(nn.Module):
  def __init__(self):
    super(Model, self).__init__()
    self.lstm = nn.LSTM(input_size = 768 , hidden_size=128, num_layers=3, dropout=0.2, batch_first=True)
    self.dropout = nn.Dropout(0.3)
    self.fc = nn.Linear(128,32)
    self.output = nn.Linear(32, 1)
    self.relu = nn.ReLU()

  def forward(self, x, lengths=None):
    if lengths is not None:
      x = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
    output, (h_n, c_n) = self.lstm(x)
    # output, _ = pad_packed_sequence(packed_output, batch_first=True)
    last_hidden = h_n[-1]
    x = self.fc(last_hidden)
    x = self.relu(x)
    x = self.dropout(x)
    return self.output(x)


In [195]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [196]:
df["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,193
1,69


In [197]:




model = Model()
model = model.to(device)

num_pos = 69
num_neg = 193

pos_weight = torch.tensor(num_neg / num_pos).to(device)

loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

opt = torch.optim.Adam(params=model.parameters(), lr=0.0001)

epochs = 8

train_losses = []
train_accs = []
test_losses = []
test_accs = []
best_model = 0
best_acc = 0
for epoch in range(epochs):
    # --- TRAINING ---
    model.train()
    train_correct = 0
    train_total = 0
    train_loss = 0

    for x, y, lengths in train_dataloader:
        x = x.float().to(device)
        y = y.float().to(device).view(-1)
        lengths = lengths.to(device)

        opt.zero_grad()
        y_hat = model(x, lengths).squeeze(-1).view(-1)
        loss = loss_fn(y_hat, y)
        loss.backward()
        opt.step()

        train_loss += loss.item()
        y_pred = (torch.sigmoid(y_hat) >= 0.5).float()
        train_correct += (y_pred == y).sum().item()
        train_total += y.size(0)

    train_acc = train_correct / train_total
    train_loss /= len(train_dataloader)

    # --- TESTING ---
    model.eval()
    test_correct = 0
    test_total = 0
    test_loss = 0

    with torch.no_grad():
        for x, y, lengths in test_dataloader:
            x = x.float().to(device)
            y = y.float().to(device).view(-1)
            lengths = lengths.to(device)

            y_hat = model(x, lengths).squeeze(-1).view(-1)
            loss = loss_fn(y_hat, y)
            test_loss += loss.item()

            y_pred = (torch.sigmoid(y_hat) >= 0.5).float()
            test_correct += (y_pred == y).sum().item()
            test_total += y.size(0)

    test_acc = test_correct / test_total
    test_loss /= len(test_dataloader)

    if test_acc > best_acc:
      best_acc = test_acc
      best_model = copy.deepcopy(model)

      print("Saved best model")

    # --- LOGGING ---
    print(f"Epoch {epoch + 1} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f}")
    train_losses.append(train_loss)
    train_accs.append(train_acc)
    test_losses.append(test_loss)
    test_accs.append(test_acc)





Saved best model
Epoch 1 | Train Loss: 1.0032 | Train Acc: 0.2584 | Test Loss: 0.9916 | Test Acc: 0.2830
Epoch 2 | Train Loss: 0.9908 | Train Acc: 0.2584 | Test Loss: 1.0076 | Test Acc: 0.2830
Epoch 3 | Train Loss: 0.9739 | Train Acc: 0.2632 | Test Loss: 0.9473 | Test Acc: 0.2830
Saved best model
Epoch 4 | Train Loss: 0.9277 | Train Acc: 0.5072 | Test Loss: 0.9378 | Test Acc: 0.8491
Saved best model
Epoch 5 | Train Loss: 0.8233 | Train Acc: 0.9091 | Test Loss: 0.7633 | Test Acc: 0.9811
Saved best model
Epoch 6 | Train Loss: 0.6381 | Train Acc: 0.9713 | Test Loss: 0.5318 | Test Acc: 1.0000
Epoch 7 | Train Loss: 0.4638 | Train Acc: 0.9904 | Test Loss: 0.3385 | Test Acc: 1.0000
Epoch 8 | Train Loss: 0.3161 | Train Acc: 1.0000 | Test Loss: 0.2232 | Test Acc: 1.0000


In [203]:
torch.save(model.state_dict(), "model4.pth")

In [204]:
def predict(text, threshold=0.75):
  tokens = tokenizer(text, return_tensors="pt")
  embedding = embedder(**tokens) # equivilant to embedder(tokens['input_ids'],tokens['attention_mask'])
  embedding = embedding[0][0].detach().cpu()
  model.eval()

  with torch.no_grad():
    y_hat = model(embedding)
    y_hat = torch.sigmoid(y_hat).item()

  result = (threshold <= y_hat)
  return f"{result} : {y_hat:4f}"



In [205]:
predict(
"""
تسوي
""")

'False : 0.382781'

In [206]:
positive_test_texts = [
    "تسوي أعذار طبية مضمونه",
    "حل واجبات ومشاريع دراسية بجودة عالية",
    "تقديم تقارير وأبحاث متكاملة لجميع التخصصات",
    "خدمات طلابية متكاملة في مشاريع التخرج",
    "عمل سكليفات طبية معتمدة من صحتي",
    "حل جميع الواجبات والاختبارات بوقت قياسي",
    "إعداد بحوث ورسائل ماجستير ودكتوراه",
    "تصميم عروض بوربوينت وتقارير دراسية",
    "كتابة وتلخيص المقالات العلمية",
    "إعداد تكاليف ومشاريع دراسية مضمونة",
    "خدمة متميزة لحل المشكلات الأكاديمية",
    "تسليم مشاريع التخرج بجودة احترافية",
    "تحضير عروض تقديمية مميزة لجميع المواد",
    "كتابة الأبحاث العلمية باحترافية عالية",
    "تقديم المساعدة في الواجبات المنزلية",
    "حل الأسايمنت والاختبارات مع ضمان الدرجة",
    "تصميم ملخصات دراسية سهلة الفهم",
    "دعم في كتابة المقالات والبحوث",
    "إعداد مشاريع التخرج في جميع التخصصات",
    "توفير سكليفات طبية رسمية ومضمونة"
]

negative_test_texts = [
    "مشاريعنا واجد ومربكة",
    "ضغط مشاريع ما يخلص",
    "المشاريع ذبحتنا والله",
    "أحد عنده حل الواجب؟",
    "المشروع تعجيزي بشكل كبير",
    "مشاريع صعبة وما نفهمها",
    "ما عندي وقت أحل الواجبات",
    "مشاريع الدراسة تأخذ كل وقتي",
    "وين ألقى حل للواجب؟",
    "مشاريع معقدة ومحبطة",
    "مشاكل في فهم المطلوب",
    "تعبت من كثرة المشاريع والواجبات",
    "ما أدري شلون أخلص المشروع",
    "مشاريع متعبة ومحبطة",
    "مشاريع صعبة جدا وما في حلول",
    "ما فهمت المطلوب في المشروع",
    "مشاريع معقدة وتحتاج مساعدة",
    "ما ألقى حد يساعدني في المشروع",
    "المشروع غير واضح ومبهم",
    "مشاريع مرهقة ومحبطة جداً"
]

for t in positive_test_texts:
  print(predict(t))

print()
print()

for t in negative_test_texts:
  print(predict(t))

True : 0.832078
True : 0.847335
True : 0.845804
True : 0.845601
True : 0.856987
True : 0.853052
True : 0.844613
True : 0.850214
True : 0.838108
True : 0.836252
True : 0.835173
True : 0.840805
True : 0.851641
True : 0.847101
True : 0.791543
True : 0.856762
True : 0.847784
True : 0.826004
True : 0.848102
True : 0.853967


False : 0.235946
False : 0.272286
False : 0.244860
False : 0.196010
False : 0.346791
False : 0.467127
False : 0.144665
False : 0.262793
False : 0.129117
False : 0.683481
False : 0.229034
False : 0.184679
False : 0.131185
False : 0.650024
False : 0.225384
False : 0.142873
False : 0.646466
False : 0.130977
False : 0.247643
False : 0.260327
