<a href="https://colab.research.google.com/github/Skander28/Models/blob/main/bilstm_SA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [76]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences


In [77]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [78]:
!ls /content/drive/MyDrive/

'carte visite badra.gdoc'
 Classroom
'Colab Notebooks'
 django
 entreprise-1-20220417T102515Z-001.zip
'Espace citoyen.pdf'
 ez-learning
 maghrebi
'model (1).pth'
 preprocessed_tweets.csv
 pre_tweets.csv
 Projet_Java_FediBoussaada_LaribiSkander_2A
 Projet_Microservices
 trainer2.pth
'Untitled presentation.gslides'


In [79]:
import pandas as pd
filtered_df = pd.read_csv('/content/drive/MyDrive/pre_tweets.csv')
filtered_df.head()

Unnamed: 0.1,Unnamed: 0,id,tweets,dialect
0,0,1009754958479151232,قليلين ادب ومنافقين اختهم او قريبتهم تعاكس تقو...,1
1,1,1009794751548313600,اليبين متقلبين بالنسبه ليا انا ميليشياوي زمان ...,1
2,2,1019989115490787200,تانيه شاب ليبي بيرتاح لبنت مختلفه ويلاحظ انها ...,1
3,3,1035479791758135168,رانيا عقليتك متخلفه اولا الانسان يلي يحتاج اهل...,1
4,4,1035481122921164800,شكلك متعقده علشان الراجل تحبيه ازوج بنت يتيمه ...,1


In [80]:
filtered_df = filtered_df.dropna()

In [81]:
# Load data
features = filtered_df.tweets.values
labels = pd.get_dummies(filtered_df['dialect']).values

In [82]:
vocab_size = 20000
max_length= 200
tokenizer = Tokenizer(num_words=vocab_size,filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=False)
tokenizer.fit_on_texts(features)
X = tokenizer.texts_to_sequences(features)

In [83]:
import nltk
from collections import Counter
nltk.download('stopwords')
from nltk.corpus import stopwords

# use the stopwords
stop_words = set(stopwords.words('arabic'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [84]:
all_words = []
for tokens in X:
    all_words.extend(tokens)

stop_words = set(stopwords.words('arabic'))
all_words = [word for word in all_words if word not in stop_words]

word_counts = Counter(all_words)
most_common_words = [word for word, count in word_counts.most_common(500)]

def remove_common_words(tokens):
  new_tokens = [token for token in tokens if token not in most_common_words]
  return new_tokens

X = remove_common_words(X)


X = pad_sequences(X, maxlen=max_length)

In [85]:
# Split data into training, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.1, random_state=42, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42, shuffle=True)

In [86]:
#from imblearn.over_sampling import RandomOverSampler
#rus=RandomOverSampler(random_state=42)
#X_res, y_res = rus.fit_resample(X_train,y_train)

In [87]:
#y_res.shape

In [88]:
# Convert data to PyTorch tensors
X_train_, y_train_ = torch.tensor(X_train), torch.tensor(y_train)
X_val, y_val = torch.tensor(X_val), torch.tensor(y_val)
X_test, y_test = torch.tensor(X_test), torch.tensor(y_test)

In [89]:
class DialectDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [90]:
# Create dataloaders for training, validation, and test sets
train_dataset = DialectDataset(X_train_, y_train_ )
val_dataset = DialectDataset(X_val, y_val)
test_dataset = DialectDataset(X_test, y_test)

In [91]:
batch_size = 64
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)


In [92]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import numpy as np

vocab_size = 20000
embedding_dim = 100
hidden_dim = 128
output_dim = 4
num_layers = 2
bidirectional = True
lr = 0.01
batch_size = 64
num_epochs = 100
patience = 5

class BiLSTMAttention(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers):
        super(BiLSTMAttention, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True, dropout=0.5, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(0.5)
        self.attention = SelfAttention(hidden_dim * 2)
        
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = self.dropout(lstm_out)
        attention_out, attention_weights = self.attention(lstm_out)
        fc_out = self.fc(attention_out)
        return fc_out, attention_weights

class SelfAttention(nn.Module):
    def __init__(self, hidden_dim):
        super(SelfAttention, self).__init__()
        self.projection = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, 1)
        )
        
    def forward(self, encoder_outputs):
        energy = self.projection(encoder_outputs)
        weights = F.softmax(energy.squeeze(-1), dim=1)
        outputs = (encoder_outputs * weights.unsqueeze(-1)).sum(dim=1)
        return outputs, weights

In [93]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTMAttention(vocab_size, embedding_dim, hidden_dim, output_dim, num_layers).to(device)
optimizer = optim.Adam(model.parameters(),lr=0.01)
criterion = nn.BCEWithLogitsLoss()
counter_nb = 0
for epoch in tqdm(range(24)):
     model.train()
     counter_nb = counter_nb + 1 
     print(counter_nb)
     running_loss = 0.0
     with torch.cuda.device(0):
       for batch in train_dataloader:
         inputs, labels = batch[0].to(device), batch[1].to(device)
         optimizer.zero_grad()
         labels = labels.float()
         outputs, _ = model(inputs)
         loss = criterion(outputs, labels)
         loss.backward()
         optimizer.step()
         running_loss += loss.item()
         preds = torch.sigmoid(outputs) > 0.5
         acc = accuracy_score(labels.cpu().detach().numpy(), preds.cpu().detach().numpy())
         #print('Train Accuracy: {:.4f}'.format(acc))
     epoch_loss = running_loss / (len(train_dataloader))

     best_val_loss = np.inf
     patience = 2
     counter = 0
     val_loss = 0.0
     model.eval()
     with torch.no_grad():
         for batch in val_dataloader:
             inputs, labels = batch[0].to(device), batch[1].to(device)
             labels = labels.float()
             outputs, _= model(inputs)
             loss = criterion(outputs, labels)
             val_loss += loss.item()
             #val_preds = torch.sigmoid(outputs) > 0.5
             #val_acc = accuracy_score(labels.cpu().detach().numpy(), val_preds.cpu().detach().numpy())
             #print('Val Accuracy: {:.4f}'.format(val_acc))
     val_loss = val_loss / (len(val_dataloader))
     # Check if the validation loss has improved
     if val_loss < best_val_loss:
                best_val_loss = val_loss
                counter = 0
     else:
         counter += 1

     # Stop the training process if the validation loss hasn't improved for `patience` epochs
     if counter >= patience:
         break
     print(" epoch loss :", epoch_loss , "| val loss :", val_loss)
     #print('Train Accuracy: {:.4f}'.format(acc))
     #print('Val Accuracy: {:.4f}'.format(val_acc))

print("Training stopped after epoch", epoch)
        

  0%|          | 0/24 [00:00<?, ?it/s]

1


  4%|▍         | 1/24 [00:58<22:26, 58.54s/it]

 epoch loss : 0.347642312326279 | val loss : 0.2396483038669651
2


  8%|▊         | 2/24 [01:58<21:46, 59.37s/it]

 epoch loss : 0.20421805802845594 | val loss : 0.21092944284810602
3


 12%|█▎        | 3/24 [02:58<20:51, 59.61s/it]

 epoch loss : 0.17226172826158132 | val loss : 0.20912825591066508
4


 17%|█▋        | 4/24 [03:56<19:39, 59.00s/it]

 epoch loss : 0.15740956026794656 | val loss : 0.20712933146837845
5


 21%|██        | 5/24 [04:54<18:34, 58.67s/it]

 epoch loss : 0.15176892210225457 | val loss : 0.21895743570136791
6


 25%|██▌       | 6/24 [05:52<17:31, 58.41s/it]

 epoch loss : 0.1480972409396769 | val loss : 0.2085564204500717
7


 29%|██▉       | 7/24 [06:50<16:31, 58.31s/it]

 epoch loss : 0.1442463169180792 | val loss : 0.21846717936032026
8


 33%|███▎      | 8/24 [07:48<15:32, 58.27s/it]

 epoch loss : 0.14703599987067767 | val loss : 0.23482504761913447
9


 38%|███▊      | 9/24 [08:46<14:31, 58.10s/it]

 epoch loss : 0.1549118260482076 | val loss : 0.24079680258353936
10


 42%|████▏     | 10/24 [09:44<13:32, 58.04s/it]

 epoch loss : 0.14838891303599777 | val loss : 0.22636417028250047
11


 46%|████▌     | 11/24 [10:42<12:33, 57.93s/it]

 epoch loss : 0.14288568380216335 | val loss : 0.2347055903626877
12


 50%|█████     | 12/24 [11:39<11:33, 57.82s/it]

 epoch loss : 0.1398275776774304 | val loss : 0.22087170903399153
13


 54%|█████▍    | 13/24 [12:37<10:35, 57.74s/it]

 epoch loss : 0.13669582878725967 | val loss : 0.2195957236498305
14


 58%|█████▊    | 14/24 [13:34<09:36, 57.68s/it]

 epoch loss : 0.13469061033308377 | val loss : 0.21958703280073924
15


 62%|██████▎   | 15/24 [14:32<08:38, 57.61s/it]

 epoch loss : 0.13261379340186175 | val loss : 0.23029729608193183
16


 67%|██████▋   | 16/24 [15:29<07:40, 57.57s/it]

 epoch loss : 0.13101867464012185 | val loss : 0.21698190315255841
17


 71%|███████   | 17/24 [16:27<06:42, 57.52s/it]

 epoch loss : 0.12994338269849426 | val loss : 0.22165698417037435
18


 75%|███████▌  | 18/24 [17:24<05:44, 57.44s/it]

 epoch loss : 0.12937737102579985 | val loss : 0.22356818717660257
19


 79%|███████▉  | 19/24 [18:21<04:46, 57.39s/it]

 epoch loss : 0.12741259782720213 | val loss : 0.22118395252279865
20


 83%|████████▎ | 20/24 [19:18<03:49, 57.35s/it]

 epoch loss : 0.12855933687057008 | val loss : 0.2303432983174486
21


 88%|████████▊ | 21/24 [20:16<02:51, 57.30s/it]

 epoch loss : 0.13002403763168718 | val loss : 0.2264270671073673
22


 92%|█████████▏| 22/24 [21:13<01:54, 57.26s/it]

 epoch loss : 0.13505918334653497 | val loss : 0.22602410751928403
23


 96%|█████████▌| 23/24 [22:10<00:57, 57.26s/it]

 epoch loss : 0.13934100390053092 | val loss : 0.23694483194391705
24


100%|██████████| 24/24 [23:07<00:00, 57.83s/it]

 epoch loss : 0.14030971712920637 | val loss : 0.2400471658888951
Training stopped after epoch 23





In [95]:
from sklearn.metrics import f1_score, precision_score, recall_score
model.eval()
y_true = []
y_pred = []
with torch.no_grad():
    for batch in test_dataloader:
        inputs, labels = batch[0].to(device), batch[1].to(device)
        labels = labels.float()
        outputs, _ = model(inputs)
        preds = torch.sigmoid(outputs) > 0.5
        y_true.extend(labels.cpu().detach().numpy())
        y_pred.extend(preds.cpu().detach().numpy())
test_acc = accuracy_score(y_true, y_pred)
test_f1 = f1_score(y_true, y_pred, average='macro')
test_pres = precision_score(y_true, y_pred, average='macro')
test_recall = recall_score(y_true, y_pred, average='macro')
print('Test Accuracy: {:.4f}'.format(test_acc))
print('Test f1 score: {:.4f}'.format(test_f1))
print('Test precision: {:.4f}'.format(test_pres))
print('Test recall: {:.4f}'.format(test_recall))

Test Accuracy: 0.7977
Test f1 score: 0.8328
Test precision: 0.8663
Test recall: 0.8037


In [101]:
# Set the model to evaluation mode
model.eval()
device = "cuda"
# Tokenize the new complaint and pad the sequence
#new_complaint = ['واش دير للعشى']
#new_complaint  = ["شوكران علا هاد "]
#new_complaint  = ["نبي نروح للحوش"]
new_complaint  = ["شبيك شتحب "]
seq = tokenizer.texts_to_sequences(new_complaint)
padded = pad_sequences(seq, maxlen=max_length)

# Convert the padded sequence to a PyTorch tensor and move it to the device (e.g., GPU) if available
padded_tensor = torch.LongTensor(padded).to(device)

# Compute the model's prediction for the padded sequence
with torch.no_grad():
    pred = model(padded_tensor)

# Move the prediction back to the CPU and convert to a numpy array
#pred = pred.cpu().numpy()

# Map the prediction to a class label using the CLASS_DICT{'DZ': 0, 'LY': 1, 'MA': 2, 'TN': 3}
CLASS_DICT = {0: "DZ",1: "LY", 2: "MA",3: "TN"}
class_label = CLASS_DICT[np.argmax(pred[0].cpu().numpy(), axis=1)[0]]

# Print the prediction and the predicted class label
print(class_label)

TN


In [102]:
torch.save(model.state_dict(), 'model3.pth')