In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
# The alias must be `pd`: the rest of the notebook calls pd.read_csv / pd.DataFrame.
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [6]:
import pandas as pd

# Read the competition CSVs: train.csv carries the `target` column, test.csv does not.
train_path = "/kaggle/input/nlp-getting-started/train.csv"
test_path = "/kaggle/input/nlp-getting-started/test.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [7]:
import torch

In [8]:
# Quick sanity check: column names of both frames and their (rows, columns) shapes.
print(
    train_df.columns,
    f"{train_df.shape} {test_df.shape}",
    test_df.columns,
    sep="\n",
)

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')
(7613, 5) (3263, 4)
Index(['id', 'keyword', 'location', 'text'], dtype='object')


In [9]:
import re
def clean_text(text):
    """Normalise a raw tweet string.

    Removes URLs, drops every character that is not an ASCII letter, a digit
    or whitespace, and lower-cases what remains.

    Args:
        text: The raw tweet text (a ``str``).

    Returns:
        The cleaned, lower-cased string. Whitespace is not collapsed, so a
        removed URL or symbol may leave consecutive spaces behind.
    """
    # Optional "s" so that plain http:// links are stripped as well as https://.
    text = re.sub(r"https?://\S+", "", text)
    # The character class has to be closed with "]"; an unterminated class
    # makes re.sub raise re.error on every call.
    text = re.sub(r"[^A-Za-z0-9\s]", "", text)
    return text.lower()
    

In [10]:
from collections import Counter

def tokenize(text):
    """Split ``text`` into lower-cased, whitespace-separated tokens.

    Returns a list of strings; an empty or all-whitespace input gives ``[]``.
    """
    lowered = text.lower()
    tokens = lowered.split()
    return tokens

def build_vocab(texts, min_freq=2):
    """Build a token -> integer id mapping from an iterable of texts.

    Args:
        texts: Iterable of strings; each one is split with ``tokenize``.
        min_freq: Minimum number of occurrences (across all texts) a token
            needs in order to receive its own id.

    Returns:
        A dict that always contains ``"<pad>": 0`` and ``"<unk>": 1``; every
        sufficiently frequent token follows, numbered in first-seen order.
    """
    frequencies = Counter(token for text in texts for token in tokenize(text))
    frequent_words = [word for word, count in frequencies.items() if count >= min_freq]

    vocab = {"<pad>": 0, "<unk>": 1}
    for word in frequent_words:
        # The next free id is simply the current size of the mapping.
        vocab[word] = len(vocab)
    return vocab


In [11]:
def encode(text, vocab, max_len=32):
    """Convert ``text`` into a fixed-length list of vocabulary ids.

    Args:
        text: String to encode; it is split with ``tokenize``.
        vocab: Mapping from token to id. Tokens missing from it are mapped to
            ``vocab["<unk>"]``; padding uses ``vocab["<pad>"]``.
        max_len: Length of the returned list.

    Returns:
        A list of ids: truncated to ``max_len`` when the text is too long,
        right-padded with the pad id when it is too short.
    """
    ids = [vocab.get(tok, vocab["<unk>"]) for tok in tokenize(text)]
    shortfall = max_len - len(ids)
    if shortfall <= 0:
        # Already long enough: keep only the leading max_len ids.
        return ids[:max_len]
    return ids + [vocab["<pad>"]] * shortfall


In [12]:
from torch.utils.data import Dataset

class TweetDataset(Dataset):
    """Dataset that serves (token ids, label) pairs from a labelled DataFrame.

    The frame must provide a ``"text"`` and a ``"target"`` column. Texts are
    encoded lazily, each time an item is requested.
    """

    def __init__(self, df, vocab):
        """Copy the text and target columns into plain lists and keep the vocab."""
        self.vocab = vocab
        self.texts = df["text"].tolist()
        self.labels = df["target"].tolist()

    def __len__(self):
        """Number of examples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(x, y)``: the encoded text tensor and its float32 label."""
        token_ids = encode(self.texts[idx], self.vocab)
        features = torch.tensor(token_ids)
        target = torch.tensor(self.labels[idx], dtype=torch.float32)
        return features, target


In [13]:
import torch.nn as nn

class TweetClassifier(nn.Module):
    """Embedding -> single-layer LSTM -> linear -> sigmoid binary classifier."""

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128):
        """Create the layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embed_dim: Size of each token embedding.
            hidden_dim: Size of the LSTM hidden state.
        """
        super().__init__()
        # Index 0 is the padding id, so its embedding stays a zero vector.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Map a batch of token-id sequences to probabilities.

        Returns the sigmoid of the linear layer applied to the LSTM's final
        hidden state, one value per sequence with a trailing dimension of 1.
        """
        embedded = self.embedding(x)
        _, (final_hidden, _) = self.lstm(embedded)
        # final_hidden carries a leading layer dimension of size 1; drop it.
        logits = self.fc(final_hidden.squeeze(0))
        return torch.sigmoid(logits)


In [14]:
def train_model(model, dataloader, optimizer, criterion, device):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module called on each batch of inputs; its output is expected
            to carry a trailing dimension of size 1 (one score per sample).
        dataloader: Sized iterable yielding ``(inputs, labels)`` tensor pairs.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        device: Device the inputs and labels are moved to before the forward pass.

    Returns:
        The sum of the batch losses divided by ``len(dataloader)``.
    """
    model.train()
    total_loss = 0
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        # Drop only the trailing size-1 dimension. A bare squeeze() would also
        # remove the batch dimension of a single-sample batch, leaving a 0-d
        # tensor whose shape no longer matches the (1,) labels.
        outputs = model(inputs).squeeze(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)


In [15]:
def evaluate_model(model, dataloader, device):
    """Compute classification accuracy at a 0.5 decision threshold.

    Args:
        model: Module called on each batch of inputs; its output is expected
            to carry a trailing dimension of size 1 (one probability per sample).
        dataloader: Iterable yielding ``(inputs, labels)`` tensor pairs.
        device: Device the inputs and labels are moved to before the forward pass.

    Returns:
        The fraction of samples whose thresholded prediction equals the label.
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            # Remove only the trailing size-1 dimension so the batch dimension
            # survives for single-sample batches and the comparison below is
            # element-wise rather than relying on broadcasting.
            outputs = model(inputs).squeeze(-1)
            preds = (outputs > 0.5).float()
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total


In [16]:
from torch.utils.data import DataLoader, random_split

# Configuration for this run
SEED = 42
TRAIN_FRACTION = 0.8
BATCH_SIZE = 64
LEARNING_RATE = 1e-3
NUM_EPOCHS = 5

# Seed the global RNG so weight initialisation and batch shuffling are
# repeatable across "Restart & Run All".
torch.manual_seed(SEED)

# Build vocab and dataset
# NOTE: the vocabulary is built from every row of train_df, including the rows
# that end up in the validation split below.
vocab = build_vocab(train_df["text"])
dataset = TweetDataset(train_df, vocab)

# Split and load
train_size = int(TRAIN_FRACTION * len(dataset))
val_size = len(dataset) - train_size
# A dedicated, seeded generator keeps the train/validation split fixed.
train_ds, val_ds = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SEED),
)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)

# Initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TweetClassifier(vocab_size=len(vocab)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCELoss()

# Train
for epoch in range(NUM_EPOCHS):
    loss = train_model(model, train_loader, optimizer, criterion, device)
    acc = evaluate_model(model, val_loader, device)
    print(f"Epoch {epoch+1}: Loss={loss:.4f}, Accuracy={acc:.4f}")


Epoch 1: Loss=0.6839, Accuracy=0.5601
Epoch 2: Loss=0.6775, Accuracy=0.5588
Epoch 3: Loss=0.6543, Accuracy=0.6362
Epoch 4: Loss=0.5951, Accuracy=0.6605
Epoch 5: Loss=0.5219, Accuracy=0.7039


In [17]:
# Re-read the test set so this cell does not depend on the earlier load.
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# Turn every test tweet into a fixed-length list of vocabulary ids.
test_encoded = []
for tweet in test_df["text"]:
    test_encoded.append(encode(tweet, vocab))

# Stack the id lists into one tensor and serve it in batches of 64 rows.
test_tensor = torch.tensor(test_encoded)
test_loader = DataLoader(test_tensor, batch_size=64)


In [18]:
# Switch to inference mode and collect one 0/1 prediction per test tweet.
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_loader:
        batch = batch.to(device)
        # Drop only the trailing size-1 dimension. A bare squeeze() turns a
        # final batch of one sample into a 0-d array, which extend() cannot
        # iterate over.
        outputs = model(batch).squeeze(-1)
        # Threshold the probabilities at 0.5 to get integer class labels.
        preds = (outputs > 0.5).int().cpu().numpy()
        predictions.extend(preds)


In [19]:
# Pair each test id with its predicted label and write the file without the index column.
submission_columns = {
    "id": test_df["id"],
    "target": predictions,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)
