In [293]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/twitter-sentiment-analysis/twitter_validation.csv
/kaggle/input/twitter-sentiment-analysis/twitter_training.csv


In [294]:
print(os.listdir("/kaggle/input"))

['twitter-sentiment-analysis']


In [295]:
import torch
import torch.nn as nn
import pandas as pd
import re

from collections import Counter

In [296]:
df = pd.read_csv("/kaggle/input/twitter-sentiment-analysis/twitter_training.csv",
                 encoding='latin-1',
                 names=['id','topic','sentiment','text'])

In [297]:
df = df[['text', 'sentiment']]
df = df[df['sentiment'].isin(['Positive', 'Negative'])]
df = df.sample(30000, random_state=42).reset_index(drop=True)

In [298]:
def clean_text(text):
    import re
    stop_words = {
        'i','me','my','myself','we','our','ours','ourselves','you','your','yours','yourself',
        'yourselves','he','him','his','himself','she','her','hers','herself','it','its','itself',
        'they','them','their','theirs','themselves','what','which','who','whom','this','that',
        'these','those','am','is','are','was','were','be','been','being','have','has','had',
        'having','do','does','did','doing','a','an','the','and','but','if','or','because','as',
        'until','while','of','at','by','for','with','about','against','between','into','through',
        'during','before','after','above','below','to','from','up','down','in','out','on','off',
        'over','under','again','further','then','once','here','there','when','where','why','how',
        'all','any','both','each','few','more','most','other','some','such','no','nor','not',
        'only','own','same','so','than','too','very','s','t','can','will','just','don','should',
        'now'
    }
    
    
    text = re.sub(r"http\S+|@\w+|[^a-zA-Z\s]", " ", str(text))
    text = text.lower()
    words = [w for w in text.split() if w not in stop_words and len(w) > 1]
    return " ".join(words)


In [299]:
df['clean_text'] = df['text'].fillna('').apply(clean_text)
df['sentiment'] = df['sentiment'].map({'Negative': 0, 'Positive': 1})
df = df[['clean_text', 'sentiment']]

print(df['sentiment'].value_counts())

sentiment
0    15657
1    14343
Name: count, dtype: int64


In [300]:
def tokenize(text):
    return text.split()

counter = Counter(word for text in df['clean_text'] for word in tokenize(text))
most_common = counter.most_common(10000)
vocab = {word: i+2 for i, (word, _) in enumerate(most_common)}
vocab['<pad>'] = 0
vocab['unk'] = 1

def encode(text):
    encoded = [vocab.get(word, vocab['unk']) for word in text.lower().split()]
    vocab_size = len(vocab)
    # tokenlar vocab chegarasidan chiqmasligi uchun
    encoded = [min(tok, vocab_size - 1) for tok in encoded]
    return torch.tensor(encoded, dtype=torch.long)

In [301]:
class TwitterDataset(Dataset):
    def __init__(self, df):
        self.X = [encode(t) for t in df['clean_text']]

        self.y = torch.tensor(df['sentiment'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    Xs, ys = zip(*batch)
    Xs_padded = pad_sequence(Xs, batch_first=True, padding_value=0).long()
    ys = torch.stack(ys).float()  # ðŸ”¹ Warning yoâ€˜q
    return Xs_padded, ys


train_loader = DataLoader(TwitterDataset(df), batch_size=64, shuffle=True, collate_fn=collate_fn)

In [302]:
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128, num_layers=2, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = self.embedding(x)
        _, (h, _) = self.lstm(x)
        out = self.fc(h[-1])
        return out  # sigmoid yoâ€˜q!

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SentimentLSTM(len(vocab)).to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [303]:
for epoch in range(10):
    model.train()
    total_loss = 0
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()
        y_pred = model(X).squeeze(1)
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")

Epoch 1, Loss: 0.6349
Epoch 2, Loss: 0.4694
Epoch 3, Loss: 0.3639
Epoch 4, Loss: 0.2901
Epoch 5, Loss: 0.2498
Epoch 6, Loss: 0.1985
Epoch 7, Loss: 0.1732
Epoch 8, Loss: 0.1416
Epoch 9, Loss: 0.1279
Epoch 10, Loss: 0.1245


In [304]:
def predict(text):
    model.eval()
    with torch.no_grad():
        tokens = [vocab.get(w, 1) for w in text.lower().split()]
        X = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(device)
        output = model(X)
        prob = torch.sigmoid(output).item()
        label = "positive ðŸ˜€" if prob >= 0.5 else "negative ðŸ˜ž"
        print(f"Probability: {prob:.2f}, Label: {label}")

In [305]:
predict("i really love it")
predict("it is ok")
predict("it is wonderful")
predict("i do not like because it is awful")
predict("it is bad")

Probability: 0.94, Label: positive ðŸ˜€
Probability: 0.55, Label: positive ðŸ˜€
Probability: 0.59, Label: positive ðŸ˜€
Probability: 0.21, Label: negative ðŸ˜ž
Probability: 0.43, Label: negative ðŸ˜ž
