In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        file_path = os.path.join(root, file_name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import torch
import torchvision
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from torch.nn import *
from torch.optim import *
from torchvision.models import *
from sklearn.model_selection import *
from sklearn.metrics import f1_score,accuracy_score,precision_score
import wandb
import nltk
from nltk.stem.porter import *
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn import svm
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import warnings
import os
# Notebook-wide configuration: warning filter, project label, RNG seeds,
# the shared stemmer instance and the compute device.
warnings.filterwarnings("ignore")
PROJECT_NAME = "Natural-Language-Processing-with-Disaster-Tweets"
SEED = 55
# Seed every RNG the notebook draws from: NumPy drives the DataFrame shuffle,
# while torch drives the initial weights of the network. Seeding only NumPy
# leaves the trained model different on every run.
np.random.seed(SEED)
torch.manual_seed(SEED)
stemmer = PorterStemmer()
device = "cpu"
os.environ["CUDA_LAUNCH_BLOCKING"]="1"
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.list_physical_devices("GPU") else "NOT AVAILABLE")

In [3]:
class Pytorch_Data_Loader:
    """Turn the tweet CSV files into binary bag-of-words tensors.

    The vocabulary is the sorted set of stemmed tokens found in the training
    text; each tweet becomes a vector with 1.0 at the position of every
    vocabulary entry it contains.
    """

    # Locations used when no DataFrame is passed to the constructor.
    TRAIN_CSV = "/kaggle/input/nlp-getting-started/train.csv"
    TEST_CSV = "/kaggle/input/nlp-getting-started/test.csv"

    def __init__(
        self,
        data: pd.DataFrame = None,
        test: pd.DataFrame = None,
        all_words: list = None,
        tags: list = None,
    ):
        """Store the (shuffled) training frame and the test frame.

        Args:
            data: training frame with "text" and "target" columns; read from
                TRAIN_CSV when omitted.
            test: test frame with "id" and "text" columns; read from TEST_CSV
                when omitted.
            all_words: optional initial vocabulary entries.
            tags: optional initial label values.
        """
        # The CSV files are read here rather than in the signature, so merely
        # defining the class performs no file I/O.
        if data is None:
            data = pd.read_csv(self.TRAIN_CSV)
        if test is None:
            test = pd.read_csv(self.TEST_CSV)
        self.data = data.sample(frac=1)
        self.test = test
        self.X = self.data["text"]
        self.y = self.data["target"]
        # Fresh lists per instance: a mutable default (or the caller's list)
        # would otherwise be shared and grown by create_all_words().
        self.all_words = list(all_words) if all_words is not None else []
        self.tags = list(tags) if tags is not None else []

    def tokenize(self, sentence):
        """Lower-case ``sentence`` and split it with ``nltk.word_tokenize``."""
        return nltk.word_tokenize(sentence.lower())

    def stem(self, word):
        """Return the stem of the lower-cased ``word`` (module-level ``stemmer``)."""
        return stemmer.stem(word.lower())

    def words_to_int(self, words, all_words):
        """Encode ``words`` as a 0/1 vector aligned with ``all_words``.

        Args:
            words: a raw sentence (str) or an iterable of tokens.
            all_words: the vocabulary; position ``i`` of the result is 1.0
                when ``all_words[i]`` equals the stem of one of ``words``.

        Returns:
            A float NumPy array of length ``len(all_words)``.
        """
        # Iterating a str yields single characters, so a raw sentence has to
        # be tokenized first or almost nothing matches the vocabulary.
        if isinstance(words, str):
            words = self.tokenize(words)
        # A set makes each vocabulary lookup O(1) instead of a list scan.
        present = {self.stem(word) for word in words}
        encoded = np.zeros(len(all_words))
        for position, vocab_word in enumerate(all_words):
            if vocab_word in present:
                encoded[position] = 1.0
        return encoded

    def create_all_words(self):
        """Build the vocabulary and label list from the training frame.

        Returns:
            ``(all_words, tags)``: both sorted and de-duplicated.
        """
        stems = list(self.all_words)
        labels = list(self.tags)
        # Read from self.data (not self.X / self.y) because create() replaces
        # those attributes with arrays; this keeps repeated calls working.
        for text, label in tqdm(zip(self.data["text"], self.data["target"])):
            stems.extend(self.stem(token) for token in self.tokenize(text))
            labels.append(label)
        self.all_words = sorted(set(stems))
        self.tags = sorted(set(labels))
        return self.all_words, self.tags

    def create(self, test_size=0.0625, shuffle=True):
        """Vectorise the training data and split it into train / validation.

        Args:
            test_size: fraction handed to ``train_test_split``.
            shuffle: whether ``train_test_split`` shuffles before splitting.

        Returns:
            ``(new_X, new_y, X, y, X_train, X_test, y_train, y_test,
            all_words)`` where ``new_X`` / ``new_y`` are lists, ``X`` / ``y``
            are NumPy arrays and the four split parts are torch tensors moved
            to the module-level ``device``.
        """
        self.create_all_words()
        self.new_X = []
        self.new_y = []
        for text, label in tqdm(zip(self.data["text"], self.data["target"])):
            self.new_X.append(self.words_to_int(text, self.all_words))
            self.new_y.append(self.tags.index(label))
        self.X = np.array(self.new_X)
        self.y = np.array(self.new_y)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=test_size, shuffle=shuffle
        )
        self.X_train = torch.from_numpy(self.X_train).to(device)
        self.y_train = torch.from_numpy(self.y_train).to(device)
        self.X_test = torch.from_numpy(self.X_test).to(device)
        self.y_test = torch.from_numpy(self.y_test).to(device)
        return (
            self.new_X,
            self.new_y,
            self.X,
            self.y,
            self.X_train,
            self.X_test,
            self.y_train,
            self.y_test,
            self.all_words
        )

    def create_test(self):
        """Vectorise the test frame's "text" column into a CPU tensor."""
        new_test = []
        for text in tqdm(self.test["text"]):
            new_test.append(self.words_to_int(text, self.all_words))
        new_test = torch.from_numpy(np.array(new_test)).to("cpu")
        return new_test

    def create_submission(self, model):
        """Predict a class for every test row.

        The model is moved to the CPU. Inference runs in eval mode without
        gradient tracking, and the previous train/eval mode is restored
        afterwards.

        Returns:
            DataFrame with columns "id" and "target" (arg-max class index).
        """
        model.to("cpu")
        was_training = model.training
        model.eval()
        try:
            # No autograd graph is needed for prediction.
            with torch.no_grad():
                preds = model(self.create_test().float())
        finally:
            model.train(was_training)
        submission = {"id": [], "target": []}
        for pred, row_id in tqdm(zip(preds, self.test["id"])):
            submission["id"].append(row_id)
            submission["target"].append(int(torch.argmax(pred)))
        submission = pd.DataFrame(submission)
        return submission

In [4]:

def accuracy(model, X, y):
    """Return the percentage of rows of ``X`` whose arg-max prediction equals ``y``.

    Args:
        model: callable mapping a float tensor to per-row class scores.
        X: input tensor; converted with ``.float()`` before the forward pass.
        y: iterable of target class indices, paired row by row with the
            predictions.

    Returns:
        Accuracy in percent, rounded to one decimal place; ``0.0`` when there
        is nothing to score.
    """
    # Evaluation only: do not build an autograd graph for the forward pass.
    with torch.no_grad():
        preds = model(X.float())
    correct = 0
    total = 0
    for pred, target in zip(preds, y):
        if int(torch.argmax(pred)) == int(target):
            correct += 1
        total += 1
    if total == 0:
        return 0.0
    # Scale first, round last: rounding the fraction and then multiplying by
    # 100 re-introduces float noise such as 28.999999999999996.
    return round(100 * correct / total, 1)
def g_loss(model,X,y,criterion):
    """Evaluate ``criterion`` on the model's output for ``X`` and return it as a Python number.

    Predictions are cast to float and targets to long before being handed to
    ``criterion``.
    """
    outputs = model(X.float()).float()
    targets = y.long()
    return criterion(outputs, targets).item()

In [5]:
class Model(Module):
    """Five-layer fully connected network with ReLU between the layers.

    Layer widths are input_size -> h -> 2h -> 4h -> 2h -> num_classes, where
    ``h`` is ``hidden_size``. The last layer's output is returned without an
    activation.
    """

    def __init__(self, input_size, hidden_size=512, num_classes=2):
        super().__init__()
        doubled = hidden_size * 2
        quadrupled = hidden_size * 4
        self.l1 = Linear(input_size, hidden_size)
        self.l2 = Linear(hidden_size, doubled)
        self.l3 = Linear(doubled, quadrupled)
        self.l4 = Linear(quadrupled, doubled)
        self.l5 = Linear(doubled, num_classes)
        self.relu = ReLU()

    def forward(self, x):
        """Run ``x`` through l1..l4 (each followed by ReLU) and then l5."""
        hidden = x
        for layer in (self.l1, self.l2, self.l3, self.l4):
            hidden = self.relu(layer(hidden))
        return self.l5(hidden)


class Pytorch_Modelling:
    """Mini-batch training loop for the bag-of-words classifier."""

    def train(
        self,
        X_train,
        X_test,
        y_train,
        y_test,
        all_words,
        model=Model,
        criterion=CrossEntropyLoss(),
        optimizer=Adam,
        epochs=100,
        batch_size=32,
        name="BaseLine",
        lr=0.001,
    ):
        """Train a network on ``X_train`` / ``y_train`` and return it.

        Args:
            X_train, y_train: training tensors, sliced along the first axis
                into batches of ``batch_size`` rows in their given order.
            X_test, y_test: accepted for interface compatibility; not used
                because experiment logging is disabled.
            all_words: vocabulary; its length is the network's input size.
            model: either a factory called as ``model(input_size=...)`` or an
                already constructed ``Module`` instance. Any other value
                falls back to ``Model``.
            criterion: loss called as ``criterion(preds, targets)``.
            optimizer: optimizer class called as
                ``optimizer(parameters, lr=lr)``.
            epochs: number of passes over the training data.
            batch_size: rows per optimisation step.
            name: run label; currently unused.
            lr: learning rate handed to the optimizer.

        Returns:
            The trained network, on the module-level ``device``.

        Errors raised inside a training step propagate to the caller instead
        of being printed and skipped, so a broken run cannot silently return
        an untrained model.
        """
        input_size = len(all_words)
        if isinstance(model, Module):
            # Must be checked before callable(): Module instances are callable too.
            net = model
        elif callable(model):
            net = model(input_size=input_size)
        else:
            # Older call sites pass the run name in this positional slot;
            # treat any non-callable value as "use the default network".
            net = Model(input_size=input_size)
        net = net.to(device)
        optimizer = optimizer(net.parameters(), lr=lr)
        net.train()
        for _ in tqdm(range(epochs)):
            for start in range(0, len(X_train), batch_size):
                X_batch = X_train[start : start + batch_size]
                y_batch = y_train[start : start + batch_size]
                preds = net(X_batch.float())
                loss = criterion(preds.float(), y_batch.long())
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        return net

In [6]:
# Build the features, train the baseline network and write the submission file.
sdl = Pytorch_Data_Loader()
new_X,new_y,X,y,X_train,X_test,y_train,y_test,all_words = sdl.create()
sm = Pytorch_Modelling()
# The run label must be passed by keyword: the sixth positional parameter of
# train() is `model`, so a positional "name" would be bound to the wrong argument.
model = sm.train(X_train,X_test,y_train,y_test,all_words, name="name")
submission = sdl.create_submission(model)
submission.to_csv("./Pytorch-1.csv", index=False)