In [29]:
import torch
import torch.nn as nn
torch.manual_seed(42)

import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from unidecode import unidecode

In [30]:
# Load the labelled data (split into train/validation further down) and the
# unlabelled test set from the local data/ directory.
train_val = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [31]:
# Peek at the first two labelled rows to check the column layout.
train_val.head(2)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1


In [32]:
# Peek at the first two test rows to check the column layout.
test.head(2)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."


In [33]:
# Transliterate each tweet's text to ASCII with unidecode and pull the result
# out of the DataFrame as a NumPy array.
# NOTE(review): unidecode expects str input; a missing (NaN) text value would
# presumably raise here -- confirm the text column has no nulls.
X_train_val = train_val.text.apply(unidecode).to_numpy()
X_test = test.text.apply(unidecode).to_numpy()

In [34]:
# Labels from the "target" column, in the same row order as X_train_val
# (both are taken from train_val without reordering).
y_train_val = train_val.target.to_numpy()

In [35]:
# Hold out 10% of the labelled rows for validation; the fixed random_state
# makes the split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.1, random_state=42)

In [36]:
# Bag-of-words vectorizer with scikit-learn's default settings.
cv = CountVectorizer()

In [37]:
# Learn the vocabulary from the training split only, then encode the validation
# and test text with that same vocabulary. `.toarray()` turns each sparse count
# matrix into a dense NumPy array.
# NOTE(review): the X_* names are rebound from raw text to count matrices, so
# this cell cannot simply be re-run on its own; re-run the split cell first.
X_train = cv.fit_transform(X_train).toarray()
X_val = cv.transform(X_val).toarray()
X_test = cv.transform(X_test).toarray()

In [38]:
# Sanity check: all three feature matrices must share the same number of
# columns, and each label vector must match its feature matrix in row count.
print(X_train.shape, X_val.shape, X_test.shape)

print(y_train.shape, y_val.shape)

(6851, 20335) (762, 20335) (3263, 20335)
(6851,) (762,)


In [39]:
# Convert the dense count matrices to float32 tensors once, up front.
# CountVectorizer produces integer counts, while the model's weights are
# float32; creating float tensors here turns the later `.float()` calls into
# no-ops instead of re-casting the full feature matrix on every forward pass.
X_train = torch.tensor(X_train, dtype=torch.float32)
X_val = torch.tensor(X_val, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)

# nn.CrossEntropyLoss expects class-index targets of dtype long (int64);
# request it explicitly rather than relying on the NumPy array's dtype.
y_train = torch.tensor(y_train, dtype=torch.long)
y_val = torch.tensor(y_val, dtype=torch.long)

In [40]:
# Model dimensions.
input_size = X_train.shape[1]  # one input feature per column of the count matrix
output_size = 2  # one logit per class; presumably target is binary (0/1) -- verify
hidden_size = 100  # width of the RNN hidden state

In [41]:
class RNN(nn.Module):
    """Single-layer ``nn.RNN`` followed by a linear head that emits logits.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the recurrent hidden state.
        output_size: Number of logits produced by the linear head.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden=None):
        """Run the RNN over ``x`` and project every step's output to logits.

        Args:
            x: Input accepted by ``nn.RNN`` with ``batch_first=True``: either
                ``(batch, seq, input_size)`` or an unbatched
                ``(seq, input_size)`` tensor. A 2-D tensor is therefore read
                as ONE sequence whose time steps are its rows.
            hidden: Initial hidden state. ``None`` (the default) lets
                ``nn.RNN`` start from zeros.

        Returns:
            A ``(logits, hidden)`` tuple: the linear head applied to every
            step of the RNN output, and the final hidden state.
        """
        out, hidden = self.rnn(x, hidden)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        """Return an all-zero ``(batch_size, hidden_size)`` hidden state.

        The tensor is created with the dtype and device of the model's own
        parameters, so it stays compatible after ``model.to(...)`` or
        ``model.double()``.

        NOTE(review): the result is 2-D. For this single-layer RNN that only
        matches the unbatched hidden-state shape ``(num_layers, hidden_size)``
        when ``batch_size == 1``; batched input would need a 3-D
        ``(1, batch, hidden_size)`` state instead.
        """
        ref = self.fc.weight  # any parameter works as a dtype/device reference
        return torch.zeros(
            batch_size, self.hidden_size, dtype=ref.dtype, device=ref.device
        )

# Build the classifier from the dimensions chosen above and print its layers.
model = RNN(input_size, hidden_size, output_size)
print(model)

RNN(
  (rnn): RNN(20335, 100, batch_first=True)
  (fc): Linear(in_features=100, out_features=2, bias=True)
)


In [42]:
# Cross-entropy on the raw logits (CrossEntropyLoss applies log-softmax itself,
# so the model must not add a softmax), optimised with Adam at lr=0.01.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [43]:
torch.manual_seed(42)

epochs = 5

# Cast the features once, outside the loop: they do not change between epochs,
# so calling `.float()` on the full matrix every iteration is repeated work.
X_train_float = X_train.float()

model.train()  # make the training mode explicit

for epoch in range(epochs):
    # Full-batch training: one forward/backward pass over every row per epoch.
    # NOTE(review): X_train is 2-D, so nn.RNN reads it as ONE unbatched
    # sequence whose time steps are the individual rows; each row's logits
    # therefore depend on the rows before it. If rows are meant to be
    # independent samples, feed (N, 1, input_size) instead -- confirm intent.
    hidden = model.init_hidden(1)
    out, hidden = model(X_train_float, hidden)
    loss = loss_fn(out, y_train)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"epoch: {epoch+1}, loss={loss.item():.4f}")

# Drop the extra reference so the float copy can be freed after training.
del X_train_float

epoch: 1, loss=0.6942
epoch: 2, loss=0.6159
epoch: 3, loss=0.4452
epoch: 4, loss=0.3424
epoch: 5, loss=0.2323


In [44]:
torch.manual_seed(42)

# Inference only: switch to eval mode and disable autograd so the forward pass
# over the validation rows does not record (and keep alive) a backward graph.
model.eval()
hidden = model.init_hidden(1)
with torch.no_grad():
    val_out, hidden = model(X_val.float(), hidden)

In [45]:
# Predicted class = index of the largest logit along the class dimension.
val_ans = val_out.argmax(dim=1)

In [46]:
# Share of validation rows whose predicted class matches the label, as a percentage.
val_acc = accuracy_score(y_val, val_ans)*100

print(f"val acc is: {val_acc:.2f}%")

val acc is: 76.90%


In [None]:
# Inference only: switch to eval mode and disable autograd so the forward pass
# over the test rows does not record (and keep alive) a backward graph.
model.eval()
hidden = model.init_hidden(1)
with torch.no_grad():
    test_out, hidden = model(X_test.float(), hidden)

In [None]:
# Predicted class = index of the largest logit along the class dimension.
test_ans = test_out.argmax(dim=1)

In [None]:
# Build the submission file: one row per test id with its predicted class.
# The predictions are converted to a plain NumPy array first. Handing pandas a
# torch tensor directly can, depending on the pandas version, produce an object
# column of 0-d tensors that is written to the CSV as "tensor(1)" rather than 1.
# NOTE: to_csv does not create directories; "outputs/" must already exist.
ans_df = pd.DataFrame({"id": test.id, "target": test_ans.detach().cpu().numpy()})
ans_df.to_csv("outputs/ans4.csv", index=False)