In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
import re

In [6]:
# Load the IMDB reviews CSV; later cells read its 'review' and 'sentiment' columns.
df = pd.read_csv('IMDB-Dataset.csv')
# Bare last expression so the notebook renders the first rows.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
# Binary target: 1 where sentiment is exactly 'positive', 0 for anything else.
df['label'] = (df['sentiment'] == 'positive').astype('int64')

In [8]:
def clean_text(text):
    """Normalise a raw review string.

    Replaces each HTML-like tag (``<...>``) with a space, then replaces every
    character that is not an ASCII letter with a space, and finally
    lower-cases the result.

    Args:
        text: The raw review string.

    Returns:
        The cleaned, lower-cased string (same length as the tag-stripped text,
        since characters are replaced by spaces rather than removed).
    """
    without_tags = re.sub("<.*?>", ' ', text)
    letters_only = re.sub(r"[^a-zA-Z]", " ", without_tags)
    return letters_only.lower()

In [9]:
# Run clean_text over every review and keep the result in a new column.
df['clean_review'] = df['review'].map(clean_text)

In [10]:
# 80/20 split, stratified on the label, with a fixed seed.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)

In [11]:
# Map each distinct training-set word to an integer id, assigned in order of
# first appearance (0, 1, 2, ...). Only the training split is scanned.
vocab = {}
for text in train_df['clean_review']:
    for word in text.split():
        # len(vocab) is evaluated before insertion, so a new word receives the
        # next unused id; a known word keeps its existing id.
        vocab.setdefault(word, len(vocab))
# Next free id, i.e. the number of distinct words seen.
idx = len(vocab)

In [12]:
# Number of distinct words in the vocabulary; encode() uses it as the vector length.
vocab_size = len(vocab)

In [13]:
# Report the vocabulary size.
print("Vocab size: ", vocab_size)

Vocab size:  90515


In [14]:
def encode(text):
    """Turn a cleaned review into a bag-of-words count vector.

    Reads the module-level ``vocab`` (word -> index) and ``vocab_size``.

    Args:
        text: Whitespace-separated text.

    Returns:
        A tensor created by ``torch.zeros(vocab_size)`` in which position
        ``vocab[w]`` holds the number of occurrences of ``w`` in ``text``.
        Words missing from ``vocab`` are ignored.
    """
    counts = torch.zeros(vocab_size)
    # Look up only the tokens that are in the vocabulary.
    known_positions = (vocab[token] for token in text.split() if token in vocab)
    for position in known_positions:
        counts[position] += 1
    return counts

In [None]:
class ReviewDataset(Dataset):
    """Dataset of (encoded review, label) pairs.

    Args:
        texts: Iterable of cleaned review strings; each is passed to
            ``encode`` once, at construction time.
        labels: Object exposing a ``.values`` array (e.g. a pandas Series)
            holding one label per text, in the same order.
    """

    def __init__(self, texts, labels):
        self.x = [encode(t) for t in texts]
        # `.values` (plural) gives the positional array of labels; a pandas
        # Series has no `.value` attribute, so the previous spelling raised
        # AttributeError on construction.
        self.y = labels.values

    def __len__(self):
        """Return the number of encoded reviews."""
        return len(self.x)

    def __getitem__(self, item):
        """Return the (encoded vector, label) pair at position ``item``."""
        return self.x[item], self.y[item]

# Build datasets from only the first five rows of each split (quick smoke test).
train_ds = ReviewDataset(
    train_df['clean_review'].iloc[:5],
    train_df['label'].iloc[:5],
)
test_ds = ReviewDataset(
    test_df['clean_review'].iloc[:5],
    test_df['label'].iloc[:5],
)

# Shuffle only the training batches; keep the test order fixed.
train_loader = DataLoader(dataset=train_ds, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_ds, batch_size=64, shuffle=False)

In [None]:
class LogisticModel(nn.Module):
    """Logistic regression: one linear unit followed by a sigmoid.

    Args:
        input_dim: Number of input features per example.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Return the sigmoid of the linear layer's output for ``x``."""
        logits = self.linear(x)
        return logits.sigmoid()

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import re

# 1) Load the data
# Uses the same file name as the earlier, already-executed load cell of this
# notebook; adjust the path if the CSV lives elsewhere.
df = pd.read_csv("IMDB-Dataset.csv")
# The 'review' column is the text; 'sentiment' is 'positive' or 'negative'.
# Shuffle the rows with a fixed seed: an unseeded shuffle here would change the
# row order on every run and defeat the random_state used by the later split.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# 2) Encode the target: 1 for 'positive', 0 for anything else
df['label'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# 3) Clean the text: strip HTML tags, blank out everything except letters and
#    apostrophes, then lower-case
def clean_text(text):
    """Normalise a raw review string.

    Each HTML-like tag (``<...>``) becomes a space, every character other than
    an ASCII letter or an apostrophe becomes a space, and the result is
    lower-cased.

    Args:
        text: The raw review string.

    Returns:
        The cleaned, lower-cased string.
    """
    without_tags = re.sub(r"<.*?>", " ", text)               # remove HTML tags
    letters_only = re.sub(r"[^a-zA-Z']", " ", without_tags)  # keep letters and '
    return letters_only.lower()

df['clean_review'] = df['review'].map(clean_text)

# 4) Split the data into train / test (80/20, stratified on the label)
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

# 5) Build the vocabulary from the training words only; ids are assigned in
#    order of first appearance
vocab = {}
for text in train_df['clean_review']:
    for word in text.split():
        # len(vocab) is evaluated before insertion, so a new word receives the
        # next unused id; a known word keeps its existing id.
        vocab.setdefault(word, len(vocab))
# Next free id, i.e. the number of distinct words seen.
idx = len(vocab)

vocab_size = len(vocab)
print("Vocab size:", vocab_size)

def encode(text):
    """Turn a cleaned review into a bag-of-words count vector.

    Reads the module-level ``vocab`` (word -> index) and ``vocab_size``.

    Args:
        text: Whitespace-separated text.

    Returns:
        A tensor created by ``torch.zeros(vocab_size)`` in which position
        ``vocab[w]`` holds the number of occurrences of ``w`` in ``text``.
        Words missing from ``vocab`` are ignored.
    """
    counts = torch.zeros(vocab_size)
    # Look up only the tokens that are in the vocabulary.
    known_positions = (vocab[token] for token in text.split() if token in vocab)
    for position in known_positions:
        counts[position] += 1
    return counts

# 6) Dataset + DataLoader
class ReviewDataset(Dataset):
    """Map-style dataset yielding (bag-of-words vector, label) pairs.

    Reviews are encoded on demand in ``__getitem__`` rather than up front.
    Encoding everything in ``__init__`` keeps one dense ``vocab_size``-long
    vector per review alive at once, which for the full corpus and a
    vocabulary of tens of thousands of words needs many gigabytes of memory.
    The items returned are the same; the raw strings are kept in ``texts``
    instead of a pre-built list of vectors.

    Args:
        texts: Iterable of cleaned review strings.
        labels: Object exposing a ``.values`` array (e.g. a pandas Series)
            holding one label per text, in the same order.
    """

    def __init__(self, texts, labels):
        # Copy into a list so lookups are positional even when `texts` is a
        # pandas Series whose index labels are not 0..n-1.
        self.texts = list(texts)
        self.y = labels.values

    def __len__(self):
        """Return the number of examples."""
        return len(self.y)

    def __getitem__(self, i):
        """Encode and return the (vector, label) pair at position ``i``."""
        return encode(self.texts[i]), self.y[i]

# One dataset per split, built from the cleaned text and the numeric label.
train_ds = ReviewDataset(texts=train_df['clean_review'], labels=train_df['label'])
test_ds = ReviewDataset(texts=test_df['clean_review'], labels=test_df['label'])

# Shuffle only the training batches; keep the test order fixed.
train_loader = DataLoader(dataset=train_ds, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_ds, batch_size=64, shuffle=False)

# 7) Build the logistic-regression model
class LogisticModel(nn.Module):
    """Logistic regression: one linear unit followed by a sigmoid.

    Args:
        input_dim: Number of input features per example.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Return the sigmoid of the linear layer's output for ``x``."""
        logits = self.linear(x)
        return logits.sigmoid()

# Seed PyTorch's global RNG so the weight initialisation and the DataLoader's
# shuffle order are the same on every run (the split already uses seed 42).
torch.manual_seed(42)

model = LogisticModel(vocab_size)
loss_fn = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# 8) Train the model
for epoch in range(5):
    model.train()
    total_loss = 0
    for X, y in train_loader:
        # BCELoss needs float targets with the same shape as the model output,
        # so add a trailing dimension to the label batch.
        y = y.float().unsqueeze(1)
        pred = model(X)
        loss = loss_fn(pred, y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # Weight the batch loss by batch size so the epoch figure below is a
        # per-example average even when the last batch is smaller.
        total_loss += loss.item() * X.size(0)
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_ds):.4f}")

# 9) Evaluate the model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for X, y in test_loader:
        pred = model(X)
        # Threshold the probabilities at 0.5 and drop the trailing dimension
        # so predictions line up with the label batch.
        predicted = (pred > 0.5).long().squeeze(1)
        correct += (predicted == y).sum().item()
        total   += y.size(0)
print("Accuracy:", correct / total)
