<a href="https://colab.research.google.com/github/Omarkhattab146/ML-projects/blob/main/NLPApplication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB as multinomialNB
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

%matplotlib inline
warnings.filterwarnings('ignore')

In [4]:
nltk.download('stopwords')
nltk.download('wordnet') # for lemmitization
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
df = pd.read_csv(
    '/content/IMDB Dataset.csv',
    encoding="ISO-8859-1",
    engine="python",
    on_bad_lines="skip"
)

# ناخد عينة 10% من البيانات بشكل عشوائي وثابت
df = df.sample(frac=0.1, random_state=42).reset_index(drop=True)

# نعرض أول 5 صفوف للتأكد
df.head()

Unnamed: 0,review,sentiment
0,I really liked this Summerslam due to the look...,positive
1,Not many television shows appeal to quite as m...,positive
2,The film quickly gets to a major chase scene w...,negative
3,Jane Austen would definitely approve of this o...,positive
4,Expectations were somewhat high for me when I ...,negative


In [6]:
def clean(text):
  txt = re.sub(r'[^A-Za-z]'," ",text)
  txt = txt.lower()
  txt = txt.split()
  txt = " ".join(txt)
  return txt

# https://chatgpt.com/s/t_68d6dc8a9e248191857c6c0bd9e4f20a

In [7]:
df['review_cleaned'] = df['review'].apply(clean)

In [8]:
df.head(5)

Unnamed: 0,review,sentiment,review_cleaned
0,I really liked this Summerslam due to the look...,positive,i really liked this summerslam due to the look...
1,Not many television shows appeal to quite as m...,positive,not many television shows appeal to quite as m...
2,The film quickly gets to a major chase scene w...,negative,the film quickly gets to a major chase scene w...
3,Jane Austen would definitely approve of this o...,positive,jane austen would definitely approve of this o...
4,Expectations were somewhat high for me when I ...,negative,expectations were somewhat high for me when i ...


In [9]:
# Convert data to tokenz
df["Tokenized_review"] = df["review_cleaned"].apply(lambda x: word_tokenize(x))

In [10]:
# Remove stop words
stop_words = set(stopwords.words("english"))
df["StopWords_review"] = df["Tokenized_review"].apply(lambda x: [word for word in x if word not in stop_words])

In [11]:
df.head(5)

Unnamed: 0,review,sentiment,review_cleaned,Tokenized_review,StopWords_review
0,I really liked this Summerslam due to the look...,positive,i really liked this summerslam due to the look...,"[i, really, liked, this, summerslam, due, to, ...","[really, liked, summerslam, due, look, arena, ..."
1,Not many television shows appeal to quite as m...,positive,not many television shows appeal to quite as m...,"[not, many, television, shows, appeal, to, qui...","[many, television, shows, appeal, quite, many,..."
2,The film quickly gets to a major chase scene w...,negative,the film quickly gets to a major chase scene w...,"[the, film, quickly, gets, to, a, major, chase...","[film, quickly, gets, major, chase, scene, eve..."
3,Jane Austen would definitely approve of this o...,positive,jane austen would definitely approve of this o...,"[jane, austen, would, definitely, approve, of,...","[jane, austen, would, definitely, approve, one..."
4,Expectations were somewhat high for me when I ...,negative,expectations were somewhat high for me when i ...,"[expectations, were, somewhat, high, for, me, ...","[expectations, somewhat, high, went, see, movi..."


In [12]:
# Lemmatization
lemitizer = WordNetLemmatizer()
def lemmitization_words(text):
  lemmas = [lemitizer.lemmatize(word) for word in text]
  return lemmas

df["Lemmatized_review"] = df["StopWords_review"].apply(lemmitization_words)



In [45]:
df.head(5)

Unnamed: 0,review,sentiment,review_cleaned,Tokenized_review,StopWords_review,Lemmatized_review
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,"[one, of, the, other, reviewers, has, mentione...","[one, reviewers, mentioned, watching, oz, epis...","[one, reviewer, mentioned, watching, oz, episo..."
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production br br the filmin...,"[a, wonderful, little, production, br, br, the...","[wonderful, little, production, br, br, filmin...","[wonderful, little, production, br, br, filmin..."
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,"[i, thought, this, was, a, wonderful, way, to,...","[thought, wonderful, way, spend, time, hot, su...","[thought, wonderful, way, spend, time, hot, su..."
3,Basically there's a family where a little boy ...,negative,basically there s a family where a little boy ...,"[basically, there, s, a, family, where, a, lit...","[basically, family, little, boy, jake, thinks,...","[basically, family, little, boy, jake, think, ..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love in the time of money is a...,"[petter, mattei, s, love, in, the, time, of, m...","[petter, mattei, love, time, money, visually, ...","[petter, mattei, love, time, money, visually, ..."


https://chatgpt.com/s/t_68d6e167145881919ff5ceb4228ed117

## Text representation

In [13]:
# Join words to texts
df["Lemmatized_review_to_texts"] = df["Lemmatized_review"].apply(lambda x: " ".join(x))


In [69]:
df.head(5)

Unnamed: 0,review,sentiment,review_cleaned,Tokenized_review,StopWords_review,Lemmatized_review,Lemmatized_review_to_texts
0,I really liked this Summerslam due to the look...,positive,i really liked this summerslam due to the look...,"[i, really, liked, this, summerslam, due, to, ...","[really, liked, summerslam, due, look, arena, ...","[really, liked, summerslam, due, look, arena, ...",really liked summerslam due look arena curtain...
1,Not many television shows appeal to quite as m...,positive,not many television shows appeal to quite as m...,"[not, many, television, shows, appeal, to, qui...","[many, television, shows, appeal, quite, many,...","[many, television, show, appeal, quite, many, ...",many television show appeal quite many differe...
2,The film quickly gets to a major chase scene w...,negative,the film quickly gets to a major chase scene w...,"[the, film, quickly, gets, to, a, major, chase...","[film, quickly, gets, major, chase, scene, eve...","[film, quickly, get, major, chase, scene, ever...",film quickly get major chase scene ever increa...
3,Jane Austen would definitely approve of this o...,positive,jane austen would definitely approve of this o...,"[jane, austen, would, definitely, approve, of,...","[jane, austen, would, definitely, approve, one...","[jane, austen, would, definitely, approve, one...",jane austen would definitely approve one br br...
4,Expectations were somewhat high for me when I ...,negative,expectations were somewhat high for me when i ...,"[expectations, were, somewhat, high, for, me, ...","[expectations, somewhat, high, went, see, movi...","[expectation, somewhat, high, went, see, movie...",expectation somewhat high went see movie thoug...


### BOW representation

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
X_train,X_test,y_train,y_test=train_test_split(df['Lemmatized_review_to_texts'],df['sentiment'],test_size=0.2,random_state=42)

In [16]:
vectorizer = CountVectorizer()
X_train_vector = vectorizer.fit_transform(X_train)
X_test_vector = vectorizer.transform(X_test)

In [17]:
X_train_bow_dense=X_train_vector.toarray()
X_train_bow_dense[2]

array([0, 0, 0, ..., 0, 0, 0])

In [18]:
X_train_bow_dense.shape

(4000, 31023)

In [19]:
print(X_train_vector.sum())


490107


In [20]:
sample_dense = X_train_vector[:5].toarray()
print(sample_dense)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [21]:
feature_names=vectorizer.get_feature_names_out()
feature_names

array(['aa', 'aaaand', 'aaaggghhhhhhh', ..., 'zwart', 'zwick', 'zz'],
      dtype=object)

https://chatgpt.com/s/t_68dc1703ff0481918ce83d2fd6685ccb

In [22]:
feature_names.shape

(31023,)

In [23]:
bow_df=pd.DataFrame(X_train_bow_dense,columns=feature_names)
bow_df

Unnamed: 0,aa,aaaand,aaaggghhhhhhh,aaagh,aaawwwwnnn,aaja,aak,aakrosh,aaliyah,aamir,...,zucker,zuckerman,zucovic,zukor,zukovic,zuniga,zunz,zwart,zwick,zz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Label_encoder
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y_train=le.fit_transform(y_train)
y_test=le.transform(y_test)

In [25]:
model = multinomialNB()
model.fit(X_train_vector, y_train)

In [26]:
y_pred = model.predict(X_test_vector)

In [27]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.85      0.84       506
           1       0.84      0.81      0.82       494

    accuracy                           0.83      1000
   macro avg       0.83      0.83      0.83      1000
weighted avg       0.83      0.83      0.83      1000



In [28]:
vectorizer = CountVectorizer(
    ngram_range=(1,2),   # unigrams + bigrams
    max_features=20000,  # خُد بس أهم 20k كلمة/عبارة
    min_df=5,            # تجاهل اللي ظهر أقل من 5 مرات
    max_df=0.7,          # تجاهل اللي ظهر في 70%+ من النصوص
    stop_words='english' # شيل stop words
)
X_train_vector = vectorizer.fit_transform(X_train)
X_test_vector = vectorizer.transform(X_test)

https://chatgpt.com/s/t_68dc15cfde3c8191b855b961558f3279

In [29]:
model = multinomialNB()
model.fit(X_train_vector, y_train)

In [30]:
y_pred = model.predict(X_test_vector)

In [31]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.85      0.84       506
           1       0.85      0.83      0.84       494

    accuracy                           0.84      1000
   macro avg       0.84      0.84      0.84      1000
weighted avg       0.84      0.84      0.84      1000



### Represent with TF-IDF

In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
vectorizer = TfidfVectorizer(
    max_features=10000,   # نحدد أقصى عدد كلمات (features) للحفاظ على الميموري
    ngram_range=(1,2),    # نستخدم unigram + bigram
    stop_words='english'  # نشيل stop words الانجليزية
)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [34]:
feature_names = vectorizer.get_feature_names_out()

In [35]:
model = multinomialNB()
model.fit(X_train_tfidf, y_train)

In [36]:
y_pred = model.predict(X_test_tfidf)

In [37]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.85      0.85       506
           1       0.85      0.84      0.84       494

    accuracy                           0.84      1000
   macro avg       0.85      0.84      0.84      1000
weighted avg       0.85      0.84      0.84      1000



### Word Embedding Representation

#### From Scratch

In [38]:
X_train,X_test,y_train,y_test=train_test_split(df['Lemmatized_review'],df['sentiment'],test_size=0.2,random_state=42)

In [39]:
X_train

Unnamed: 0,Lemmatized_review
4227,"[yet, another, gay, film, ruined, asinine, pol..."
4676,"[long, run, west, end, charming, film, cast, m..."
800,"[early, oliver, stone, associate, produced, fi..."
3671,"[loathed, film, original, phantasm, wonderful,..."
4193,"[alice, florinda, bolkan, translator, living, ..."
...,...
4426,"[admit, lured, one, hype, stop, consider, sour..."
466,"[frank, capra, wonder, life, film, br, br, kee..."
3092,"[considering, basically, low, budget, cast, su..."
3772,"[bought, rocketship, x, dvd, two, pack, destin..."


In [40]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 🧠 1. بناء القاموس vocab من الـ train فقط
vocab = set([word for sentence in X_train for word in sentence])

# 2. إنشاء قاموس تحويل الكلمة → رقم
word2idx = {word: i+2 for i, word in enumerate(vocab)}  # نبدأ من 2
word2idx["<PAD>"] = 0
word2idx["<UNK>"] = 1

# 3. تحويل الجمل لأرقام (train & test)
X_train_seq = [[word2idx.get(word, word2idx["<UNK>"]) for word in sentence] for sentence in X_train]
X_test_seq  = [[word2idx.get(word, word2idx["<UNK>"]) for word in sentence] for sentence in X_test]

# 4. نعمل padding عشان كل الجمل تبقى بنفس الطول
max_len = 40  # الطول اللي اخترناه
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_padded  = pad_sequences(X_test_seq,  maxlen=max_len, padding='post', truncating='post')

# 5. نطبع شوية معلومات للتأكد
print(f"Vocab size: {len(word2idx)}")
print(f"Shape of padded X_train: {X_train_padded.shape}")
print("Example encoded sentence:", X_train_seq[0][:10])
print("Example padded sentence:", X_train_padded[0][:10])


Vocab size: 31048
Shape of padded X_train: (4000, 40)
Example encoded sentence: [23304, 5413, 19716, 12107, 26085, 21315, 10593, 8358, 20737, 301]
Example padded sentence: [23304  5413 19716 12107 26085 21315 10593  8358 20737   301]


In [41]:
import torch
import torch.nn as nn

# نحدد المعلمات
vocab_size = 81854 + 1   # عدد الكلمات + padding
embedding_dim = 100      # أبعاد الـ embedding
max_len = 40

# نحول بياناتنا إلى tensors
X_train_tensor = torch.tensor(X_train_padded, dtype=torch.long)

In [42]:
# نبدأ نعرف layer عشوائيًا (بدون pretrained)
embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

# ناخد مثال من أول جملة
sample_sentence = X_train_tensor[0]
print("Sentence indices:", sample_sentence)


# نمرر الجملة للـ embedding layer
embedded_sentence = embedding_layer(sample_sentence)

print("Shape of embedded sentence:", embedded_sentence.shape)
print("Example embedding vector:\n", embedded_sentence[0])





Sentence indices: tensor([23304,  5413, 19716, 12107, 26085, 21315, 10593,  8358, 20737,   301,
         2490, 14680, 17154, 14419, 25065,  1472, 21867, 28240,  8111,  1959,
        16957,  9413,  1686,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])
Shape of embedded sentence: torch.Size([40, 100])
Example embedding vector:
 tensor([-1.5456,  0.1072, -0.6190,  1.6024,  0.1700,  0.8140, -1.7090, -0.2015,
         0.7165,  1.0422,  1.3262,  0.6015,  0.5056, -0.5213,  0.5173,  0.3237,
        -0.7756, -1.2925,  0.9882, -0.6204, -0.8057,  0.0149, -1.2823,  0.1262,
        -0.5593,  0.4473,  1.2678,  0.1730, -1.5878, -0.0310,  1.5618, -1.1088,
        -0.5896, -0.6008, -0.1715, -1.8866, -1.8425, -0.8495, -0.6621, -0.3680,
         0.0262,  1.0513, -0.9207,  0.9702, -0.8266,  0.9466,  0.8034,  0.2701,
         0.3583, -0.7162, -0.5182, -1.5034, -1.4794,  0.4486,  1.2614,  0.0789,
        -2.2066,  0.9100,  0.549

In [43]:
import torch.optim as optim

embed_dim = 100  # حجم الـ embedding vector
num_classes = 1  # لو binary classification

class SimpleTextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(SimpleTextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)  # learn from scratch
        self.fc = nn.Linear(embed_dim, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)               # (batch_size, seq_len, embed_dim)
        mean_embedded = embedded.mean(dim=1)       # average embeddings across sequence
        out = self.fc(mean_embedded)
        return self.sigmoid(out)

# نعمل نسخة من الموديل
model = SimpleTextClassifier(vocab_size=len(word2idx)+1, embed_dim=100, num_classes=1)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [44]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)  # حولنا النصوص لأرقام 0 و 1

X_train_tensor = torch.tensor(X_train_padded, dtype=torch.long)
y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.float32).unsqueeze(1)

In [45]:


for epoch in range(100):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1} - Loss: {loss.item():.4f}")



Epoch 1 - Loss: 0.6975
Epoch 2 - Loss: 0.6963
Epoch 3 - Loss: 0.6952
Epoch 4 - Loss: 0.6940
Epoch 5 - Loss: 0.6928
Epoch 6 - Loss: 0.6917
Epoch 7 - Loss: 0.6906
Epoch 8 - Loss: 0.6894
Epoch 9 - Loss: 0.6883
Epoch 10 - Loss: 0.6872
Epoch 11 - Loss: 0.6861
Epoch 12 - Loss: 0.6850
Epoch 13 - Loss: 0.6839
Epoch 14 - Loss: 0.6828
Epoch 15 - Loss: 0.6817
Epoch 16 - Loss: 0.6806
Epoch 17 - Loss: 0.6795
Epoch 18 - Loss: 0.6784
Epoch 19 - Loss: 0.6773
Epoch 20 - Loss: 0.6762
Epoch 21 - Loss: 0.6750
Epoch 22 - Loss: 0.6739
Epoch 23 - Loss: 0.6727
Epoch 24 - Loss: 0.6716
Epoch 25 - Loss: 0.6704
Epoch 26 - Loss: 0.6692
Epoch 27 - Loss: 0.6680
Epoch 28 - Loss: 0.6667
Epoch 29 - Loss: 0.6655
Epoch 30 - Loss: 0.6642
Epoch 31 - Loss: 0.6629
Epoch 32 - Loss: 0.6616
Epoch 33 - Loss: 0.6602
Epoch 34 - Loss: 0.6588
Epoch 35 - Loss: 0.6574
Epoch 36 - Loss: 0.6560
Epoch 37 - Loss: 0.6546
Epoch 38 - Loss: 0.6531
Epoch 39 - Loss: 0.6516
Epoch 40 - Loss: 0.6500
Epoch 41 - Loss: 0.6484
Epoch 42 - Loss: 0.6468
E

In [46]:
with torch.no_grad():
    preds = model(X_train_tensor)
    preds = (preds > 0.5).float()
    accuracy = (preds == y_train_tensor).sum() / len(y_train_tensor)
    print(f"Training Accuracy: {accuracy:.4f}")


Training Accuracy: 0.8910


#### Pretrained

In [47]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip


--2025-10-05 14:46:00--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-10-05 14:46:00--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-10-05 14:46:01--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2

In [64]:
# Download vectors of glove
import numpy as np

# نحمل الـ embeddings من ملف GloVe
embeddings_index = {}
with open('/content/glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = vector

print(f"Loaded {len(embeddings_index)} word vectors.")


Loaded 400000 word vectors.


In [65]:
# Make embedding array
embedding_dim = 100
embedding_matrix = np.zeros((len(word2idx)+1, embedding_dim))

for word, i in word2idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [66]:
import torch.nn as nn
import torch.optim as optim

class GloveTextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes, embedding_matrix):
        super(GloveTextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = True
        self.fc = nn.Linear(embed_dim, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        mean_embedded = embedded.mean(dim=1)
        out = self.fc(mean_embedded)
        return self.sigmoid(out)

# إنشاء الموديل
model_glove = GloveTextClassifier(
    vocab_size=len(word2idx)+1,
    embed_dim=embedding_dim,
    num_classes=1,
    embedding_matrix=embedding_matrix
)

criterion = nn.BCELoss()
optimizer = optim.Adam(model_glove.parameters(), lr=0.001)


In [67]:
X_train_tensor = torch.tensor(X_train_padded, dtype=torch.long)
y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.float32).unsqueeze(1)

for epoch in range(100):
    optimizer.zero_grad()
    outputs = model_glove(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1} - Loss: {loss.item():.4f}")


Epoch 1 - Loss: 0.7078
Epoch 2 - Loss: 0.7051
Epoch 3 - Loss: 0.7026
Epoch 4 - Loss: 0.7002
Epoch 5 - Loss: 0.6979
Epoch 6 - Loss: 0.6957
Epoch 7 - Loss: 0.6936
Epoch 8 - Loss: 0.6916
Epoch 9 - Loss: 0.6896
Epoch 10 - Loss: 0.6878
Epoch 11 - Loss: 0.6861
Epoch 12 - Loss: 0.6844
Epoch 13 - Loss: 0.6828
Epoch 14 - Loss: 0.6813
Epoch 15 - Loss: 0.6798
Epoch 16 - Loss: 0.6785
Epoch 17 - Loss: 0.6771
Epoch 18 - Loss: 0.6758
Epoch 19 - Loss: 0.6746
Epoch 20 - Loss: 0.6734
Epoch 21 - Loss: 0.6722
Epoch 22 - Loss: 0.6710
Epoch 23 - Loss: 0.6698
Epoch 24 - Loss: 0.6687
Epoch 25 - Loss: 0.6676
Epoch 26 - Loss: 0.6664
Epoch 27 - Loss: 0.6653
Epoch 28 - Loss: 0.6641
Epoch 29 - Loss: 0.6630
Epoch 30 - Loss: 0.6618
Epoch 31 - Loss: 0.6606
Epoch 32 - Loss: 0.6594
Epoch 33 - Loss: 0.6582
Epoch 34 - Loss: 0.6570
Epoch 35 - Loss: 0.6557
Epoch 36 - Loss: 0.6544
Epoch 37 - Loss: 0.6531
Epoch 38 - Loss: 0.6518
Epoch 39 - Loss: 0.6505
Epoch 40 - Loss: 0.6491
Epoch 41 - Loss: 0.6478
Epoch 42 - Loss: 0.6464
E

In [68]:
with torch.no_grad():
    preds = model_glove(X_train_tensor)
    preds = (preds > 0.5).float()
    accuracy = (preds == y_train_tensor).sum() / len(y_train_tensor)
    print(f"Training Accuracy: {accuracy:.4f}")


Training Accuracy: 0.9068


## Models

#### Models with RNN