In [87]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

import torch
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(42)

from torch.utils.data import Dataset, DataLoader

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [88]:
train_val_df_raw = pd.read_csv("train.csv")
test_df_raw = pd.read_csv("test.csv")

In [89]:
train_df_raw, val_df_raw = train_test_split(train_val_df_raw, test_size=0.1, random_state=42)

In [90]:
train_df_raw.head(2)

Unnamed: 0,id,keyword,location,text,target
4620,6568,injury,"Plano, Texas",'McFadden Reportedly to Test Hamstring Thursda...,0
2858,4107,drought,Nigeria,w--=-=-=-[ NEMA warns Nigerians to prepare for...,1


In [91]:
test_df_raw.head(2)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."


In [92]:
val_df_raw.head(2)

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0


In [93]:
train_df = train_df_raw.drop(["id", "keyword", "location"], axis=1)
val_df = val_df_raw.drop(["id", "keyword", "location"], axis=1)
test_df = test_df_raw.drop(["id", "keyword", "location"], axis=1)

test_df["target"] = 0  # hacky way for custom dataset class :)

In [94]:
tokenizer = RegexpTokenizer('\w+')
sw = stopwords.words("english")
ps = PorterStemmer()

In [95]:
def clean_text(text):
    tokens = tokenizer.tokenize(text.lower())
    clean_words = [w for w in tokens if w not in sw]
    text = " ".join([ps.stem(w) for w in clean_words])
    return text

In [96]:
train_df.text = train_df.text.apply(clean_text)
val_df.text = val_df.text.apply(clean_text)
test_df.text = test_df.text.apply(clean_text)

In [97]:
cv = CountVectorizer()
cv.fit_transform(train_df.text)

<6851x17201 sparse matrix of type '<class 'numpy.int64'>'
	with 73173 stored elements in Compressed Sparse Row format>

In [98]:
train_df.shape, test_df.shape, val_df.shape

((6851, 2), (3263, 2), (762, 2))

In [99]:
class CustomDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        X = cv.transform([self.df.text.iloc[idx]]).toarray()[0]
        X = torch.tensor(X, dtype=torch.float32)
        y = self.df.target.iloc[idx]
        return X, y

In [100]:
training_data = CustomDataset(train_df)
test_data = CustomDataset(test_df)
val_data = CustomDataset(val_df)

In [101]:
train_dataloader = DataLoader(training_data, batch_size=512, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=512, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=512, shuffle=True)

In [102]:
for x, y in train_dataloader:
    print(x.shape)
    print(y.shape)
    break

torch.Size([512, 17201])
torch.Size([512])


In [103]:
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


In [104]:
in_features = cv.get_feature_names_out().shape[0]
print(in_features)

17201


In [107]:
class RNNModel(nn.Module):
    def __init__(self, hidden_dim, layer_dim):
        super(RNNModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.rnn = nn.RNN(in_features, hidden_dim, layer_dim, batch_first=True, nonlinearity='relu')
        self.fc1 = nn.Linear(hidden_dim, 64)
        self.fc2 = nn.Linear(64, 2)

    def forward(self, x):
        h0 = torch.zeros((self.layer_dim, self.hidden_dim), device=device)
        out, hn = self.rnn(x, h0)
        out = F.relu(self.fc1(out))
        out = self.fc2(out)
        return out

hidden_dim = 128
layer_dim = 1
model = RNNModel(hidden_dim, layer_dim).to(device)
print(model)

RNNModel(
  (rnn): RNN(17201, 128, batch_first=True)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=2, bias=True)
)


In [81]:
# class NeuralNet(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.fc1 = nn.Linear(in_features, 10)
#         self.bn1 = nn.BatchNorm1d(10)
#         self.fc2 = nn.Linear(10, 2)

#     def forward(self, x):
#         x = F.relu(self.bn1(self.fc1(x)))
#         x = self.fc2(x)
#         return x

# model = NeuralNet().to(device)
# print(model)

In [108]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)

In [109]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [110]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [112]:
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(val_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 0.329795  [  512/ 6851]
Test Error: 
 Accuracy: 77.7%, Avg loss: 0.482509 

Epoch 2
-------------------------------
loss: 0.285260  [  512/ 6851]
Test Error: 
 Accuracy: 75.7%, Avg loss: 0.480915 

Epoch 3
-------------------------------
loss: 0.237643  [  512/ 6851]
Test Error: 
 Accuracy: 78.2%, Avg loss: 0.476086 

Epoch 4
-------------------------------
loss: 0.176351  [  512/ 6851]
Test Error: 
 Accuracy: 77.8%, Avg loss: 0.513321 

Epoch 5
-------------------------------
loss: 0.174923  [  512/ 6851]
Test Error: 
 Accuracy: 78.3%, Avg loss: 0.525511 

Done!


In [113]:
predicated_ans = []

In [114]:
for X, y in test_dataloader:
    X = X.to(device)
    ans_np = torch.argmax(model(X), dim=1).cpu().numpy()
    for n in ans_np:
        predicated_ans.append(n)

In [115]:
ans_df = pd.DataFrame({"id": test_df_raw.id, "target": predicated_ans})
ans_df.to_csv("outputs/ans12.csv", index=False)