In [1]:
import wandb
import nltk
from nltk.stem.porter import *
from torch.nn import *
from torch.optim import *
import numpy as np
import pandas as pd
import torch,torchvision
import random
from tqdm import *
from torch.utils.data import Dataset,DataLoader
stemmer = PorterStemmer()
PROJECT_NAME = 'kickstarter-NLP-V2'
device = 'cuda'

In [2]:
def tokenize(sentence):
    return nltk.word_tokenize(sentence)

In [3]:
tokenize('$1000')

['$', '1000']

In [4]:
def stem(word):
    return stemmer.stem(word.lower())

In [5]:
stem('organic')

'organ'

In [6]:
def bag_of_words(tokenized_words,all_words):
    tokenized_words = [stem(w) for w in tokenized_words]
    bag = np.zeros(len(all_words))
    for idx,w in enumerate(all_words):
        if w in tokenized_words:
            bag[idx] = 1.0
    return bag

In [7]:
bag_of_words(['hi'],['hi','how','hi'])

array([1., 0., 1.])

In [8]:
data = pd.read_csv('./data.csv').dropna()[:5000]

In [9]:
data = data.sample(frac=1)

In [10]:
data

Unnamed: 0.1,Unnamed: 0,blurb,state
4904,4905,Beautiful illustrations tell the story of two ...,failed
4865,4866,The book captures people's thoughts and feelin...,failed
4525,4526,Anicend is an emerging Gaming Tournament organ...,failed
4139,4140,THE QUEST TO SAVE HIP HOP is an old school bea...,failed
106,107,"With help from a leading shark expert, we're u...",failed
...,...,...,...
1957,1958,Super Monster Jam is a game all about heart st...,failed
982,983,I've set a ridiculously ambitious goal of a po...,failed
3681,3682,"Time passes, people move. Like a river's flow,...",failed
746,747,"A sniper, torn unjustly from his post. A fugit...",failed


In [11]:
X = data['blurb']

In [12]:
y = data['state']

In [13]:
words = []
idx = 0
data = []
labels = {}
labels_r = {}

In [14]:
for label in y:
    if label not in list(labels.keys()):
        idx += 1
        labels[label] = idx
        labels_r[idx] = label

In [15]:
labels

{'failed': 1, 'successful': 2}

In [16]:
for X_batch,y_batch in tqdm(zip(X,y)):
    X_batch = tokenize(X_batch)
    new_X = []
    for Xb in X_batch:
        new_X.append(stem(Xb))
    words.extend(new_X)
    data.append([new_X,
                np.eye(labels[y_batch]+1,len(labels))[labels[y_batch]]])

5000it [00:01, 2634.95it/s]


In [17]:
words = sorted(set(words))

In [18]:
np.random.shuffle(data)

In [19]:
X = []
y = []

In [20]:
for sentence,tag in tqdm(data):
    X.append(bag_of_words(sentence,words))
    y.append(tag)

100%|██████████████████████████████████████| 5000/5000 [00:09<00:00, 515.50it/s]


In [21]:
from sklearn.model_selection import *
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.125,shuffle=False)
X_train = torch.from_numpy(np.array(X_train)).to(device).float()
y_train = torch.from_numpy(np.array(y_train)).to(device).float()
X_test = torch.from_numpy(np.array(X_test)).to(device).float()
y_test = torch.from_numpy(np.array(y_test)).to(device).float()

In [22]:
def get_loss(model,X,y,criterion):
    preds = model(X)
    loss = criterion(preds,y)
    return loss.item()

In [23]:
def get_accuracy(model,X,y):
    preds = model(X)
    correct = 0
    total = 0
    for pred,yb in zip(preds,y):
        pred = int(torch.argmax(pred))
        yb = int(torch.argmax(yb))
        if pred == yb:
            correct += 1
        total += 1
    acc = round(correct/total,3)*100
    return acc

In [24]:
class Model(Module):
    def __init__(self,hidden):
        super().__init__()
        self.iters = 10
        self.activation = ReLU()
        self.linear1 = Linear(len(words),hidden)
        self.linear2 = Linear(hidden,hidden)
        self.output = Linear(hidden,len(labels))
    
    def forward(self,X):
        preds = self.linear1(X)
        for _ in range(self.iters):
            preds = self.activation(self.linear2(preds))
        preds = self.output(preds)
        return preds

In [25]:
model = Model(256).to(device)
criterion = MSELoss()
optimizer = Adam(model.parameters(),lr=0.001)
epochs = 100
batch_size = 32

In [26]:
wandb.init(project=PROJECT_NAME,name='baseline')
for _ in tqdm(range(epochs)):
    for i in range(0,len(X_train),batch_size):
        X_batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]
        preds = model(X_batch)
        loss = criterion(preds,y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    model.eval()
    torch.cuda.empty_cache()
    wandb.log({'Loss':(get_loss(model,X_train,y_train,criterion)+get_loss(model,X_batch,y_batch,criterion)/2)})
    torch.cuda.empty_cache()
    wandb.log({'Val Loss':get_loss(model,X_test,y_test,criterion)})
    torch.cuda.empty_cache()
    wandb.log({'Acc':(get_accuracy(model,X_train,y_train)+get_accuracy(model,X_batch,y_batch))/2})
    torch.cuda.empty_cache()
    wandb.log({'Val Acc':get_accuracy(model,X_test,y_test)})
    torch.cuda.empty_cache()
    model.train()
wandb.finish()
torch.cuda.empty_cache()

[34m[1mwandb[0m: Currently logged in as: [33mranuga-d[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.3 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


100%|█████████████████████████████████████████| 100/100 [00:48<00:00,  2.07it/s]


VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Loss,0.00051
_runtime,54.0
_timestamp,1633182497.0
_step,399.0
Val Loss,0.05442
Acc,91.25
Val Acc,87.4


0,1
Loss,█▂▁▁▃▂▁▁▁▁▁▁▁▁▁▁▁▁▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_runtime,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
_timestamp,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
Val Loss,▁█▆▄▂▂▄▄▄▄▄▄▄▄▄▄▃▃▃▆▅▅▅▅▅▅▅▅▅▅▆▆▇▅▆▆▆▇▅▆
Acc,▄▃▂▁▁▁▄▃▁▁▇▇▆▆▆▆▆▄▁▁▄▃▃▂▃▃▅▅██▁▂█▇▁▃▇█▆▃
Val Acc,█▅▅▂▂▂▄▄▃▂▆▇▇▇▇▇▆▅▂▂▄▄▂▃▃▄▄▅▇▇▂▂▇▅▂▃▆▅▆▁


In [27]:
torch.save(model,'model.pt')
torch.save(model,'model.pth')
torch.save(model.state_dict(),'model-sd|.pt')
torch.save(model.state_dict(),'model-sd.pth')
torch.save(X,'X.pt')
torch.save(X,'X.pth')
torch.save(y,'y.pt')
torch.save(y,'y.pth')

In [28]:
torch.save(words,'words.pt')
torch.save(words,'words.pth')
torch.save(data,'data.pt')
torch.save(data,'data.pth')
torch.save(labels,'labels.pt')
torch.save(labels,'labels.pth')

In [29]:
torch.save(idx,'idx.pt')
torch.save(idx,'idx.pth')

In [30]:
torch.save(X_train,'X_train.pt')
torch.save(X_test,'X_test.pth')
torch.save(y_train,'y_train.pt')
torch.save(y_test,'y_test.pth')