# Mounting and Loading

In [1]:
# Mount Google Drive at /gdrive (Colab only); the dataset and the model
# checkpoint read by later cells live under /gdrive/MyDrive.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
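# One-time setup, kept for reference and left commented out (presumably already
# run once): unzip the dataset archives on Drive, delete the zips, then return
# to /content.
# %cd /gdrive/MyDrive/Projects/Stylumia
# !unzip stumbleupon.zip
# !rm stumbleupon.zip
# !unzip /gdrive/MyDrive/Projects/Stylumia/raw_content.zip
# !rm /gdrive/MyDrive/Projects/Stylumia/raw_content.zip
# %cd /content/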

In [None]:
# Pin torch to 1.6.0.
# NOTE(review): the install log below reports that the preinstalled
# torchvision 0.8.2+cu101 requires torch 1.7.1, so this pin leaves torchvision
# mismatched -- confirm that is acceptable (torchvision is not imported here).
!pip install torch==1.6.0 

Collecting torch==1.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/5d/5e/35140615fc1f925023f489e71086a9ecc188053d263d3594237281284d82/torch-1.6.0-cp37-cp37m-manylinux1_x86_64.whl (748.8MB)
[K     |████████████████████████████████| 748.8MB 19kB/s 
[31mERROR: torchvision 0.8.2+cu101 has requirement torch==1.7.1, but you'll have torch 1.6.0 which is incompatible.[0m
Installing collected packages: torch
  Found existing installation: torch 1.7.1+cu101
    Uninstalling torch-1.7.1+cu101:
      Successfully uninstalled torch-1.7.1+cu101
Successfully installed torch-1.6.0


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import random
import pandas as pd
import nltk
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
stop = stopwords.words('english')

import torch
from torchtext import data
import torch.nn as nn
import torch.optim as optim

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


# Data

In [3]:
# Load the tab-separated train/test files, keeping only the columns used
# downstream: the page id, its text and (train only) the target label.
df = pd.read_csv(
    '/gdrive/MyDrive/Projects/Stylumia/train.tsv',
    usecols=['urlid', 'boilerplate', 'label'],
    sep='\t',
)
test = pd.read_csv(
    '/gdrive/MyDrive/Projects/Stylumia/test.tsv',
    usecols=['urlid', 'boilerplate'],
    sep='\t',
)

In [4]:
# (rows, columns) of the train frame, then of the test frame, one per line
print(df.shape, test.shape, sep='\n')

(7395, 3)
(3171, 2)


In [113]:
# Preview the first five test rows.
# NOTE: this cell's execution count (113) is later than the cleaning cell's (90),
# so the stored output shows already-cleaned text, not the raw file contents.
test.head()

Unnamed: 0,urlid,boilerplate
0,5865,homemade enchilada sauce lynn kitchen adventur...
1,782,lolpics stun grenade ar funny picture lolpicss...
2,6962,treadmill treadmill stair climber treadmill xc...
3,7640,father tactic used assad crush revolt threaten...
4,3589,stem turn lemon lime juicy atomizer gadget lab...


# Cleaning Text

In [6]:
# Removing title, url and {} brackets, then lower-casing the text.
#
# The cleaned Series is assigned back to the column rather than calling
# `frame['boilerplate'].replace(..., inplace=True)`: an in-place method on a
# selected column is chained assignment, which pandas warns about and which
# leaves the parent frame unchanged under Copy-on-Write.
for _frame in (df, test):
    _text = _frame['boilerplate']
    # Patterns are applied one after another, in the same order as before.
    for _pattern in (r'"title":', r'"url":', r'{|}'):
        _text = _text.replace(to_replace=_pattern, value="", regex=True)
    _frame['boilerplate'] = _text.str.lower()

In [7]:
# Inspect the train frame after the markup stripping / lower-casing cell
df.head()

Unnamed: 0,urlid,boilerplate,label
0,4042,"""ibm sees holographic calls air breathing batt...",0
1,8471,"""the fully electronic futuristic starting gun ...",1
2,1164,"""fruits that fight the flu fruits that fight t...",1
3,6684,"""10 foolproof tips for better sleep "",""body"":""...",1
4,9006,"""the 50 coolest jerseys you didn t know existe...",0


In [90]:
def clean(df,col):
  """Normalise the raw text in ``df[col]`` and return ``df``.

  Each value goes through three steps, in order:
    1. ``reg``: replace the ``"body"`` key and commas with spaces, lower-case,
       then delete @mentions, ``rt @user`` markers, URLs (through the end of
       the line), #hashtags, punctuation and digits, and collapse runs of
       spaces/tabs into one space.
    2. ``stopWordRemoval``: drop words found in the module-level ``stop`` list.
    3. ``lemmatize_text``: lemmatize every whitespace-separated token with
       NLTK's WordNet lemmatizer.

  The column is overwritten on the frame that was passed in (the caller's
  object is modified) and that same frame is returned.

  Args:
    df: DataFrame holding the text column.
    col: name of the column to clean.

  Returns:
    ``df`` with ``df[col]`` replaced by the cleaned strings. Missing values
    (NaN/None) become empty strings instead of raising.
  """
  # Compiled once instead of once per row. `[0-9]+` replaces the original
  # `[0-9]*|` tail: those alternatives only added zero-width matches, which
  # substitute nothing, so the output is unchanged.
  noise = re.compile(r'@\w+|rt\s@\w+|https?:\/\/.*[\r]*|#\w+|[^\w\s]|[0-9]+')
  spaces = re.compile('[ \t]+')
  # Set membership is O(1); scanning the stop-word list for every token was
  # the slow part of this cell.
  stop_words = set(stop)

  # Regex Processing
  def reg(text):
      text = str(text)
      # Commas become spaces *before* punctuation is deleted so that
      # comma-separated words are not glued together.
      text = text.replace('"body"', ' ').replace(',', ' ')
      text = noise.sub('', text.lower().strip())
      return spaces.sub(' ', text)

  # Removing StopWords
  def stopWordRemoval(x):
      return ' '.join(word for word in str(x).split() if word not in stop_words)

  # Lemmatizing text
  w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
  lemmatizer = nltk.stem.WordNetLemmatizer()
  def lemmatize_text(text):
      return ' '.join(lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text))

  # Coerce up front: the original called `text.replace` before `str(text)`,
  # which raised AttributeError on a NaN cell.
  texts = df[col].fillna('').astype(str)
  df[col] = texts.apply(lambda t: lemmatize_text(stopWordRemoval(reg(t))))
  return df

# Cleaning, Removing Stop Words, Tokenizing and Lemmatizing text
# NOTE: `clean` writes the cleaned text back into the frame it is given, so the
# raw text in `df` and `test` is overwritten here; re-read the TSVs to get it back.
df = clean(df,'boilerplate')
test = clean(test,'boilerplate')
df.head(10)

KeyboardInterrupt: ignored

In [96]:
# Show the row at position 99 of the train frame as a Series
df.iloc[99]

urlid                                                       6153
boilerplate    worst athlete sport history worst athlete spor...
label                                                          1
Name: 99, dtype: object

In [10]:
# Rows at positions 1..10 of the test frame (positional slice, end exclusive)
test.iloc[1:11]

Unnamed: 0,urlid,boilerplate
1,782,lolpics stun grenade ar funny picture lolpicss...
2,6962,treadmill treadmill stair climber treadmill xc...
3,7640,father tactic used assad crush revolt threaten...
4,3589,stem turn lemon lime juicy atomizer gadget lab...
5,6719,denmark aim improve health fat tax fat tax den...
6,3905,loud snoring related silent stroke snoring unp...
7,9841,sweet potato ravioli lemon sage brown butter s...
8,7447,offbeat funny news design art graffiti funny h...
9,4776,fell asleep watching dvd fell asleep watching ...
10,2879,marthastewart lighter sesame noodle


In [11]:
# Left disabled: `test.urlid` is still read later, when the submission rows are built.
# df.drop(columns='urlid',inplace=True)
# test.drop(columns='urlid',inplace=True)

# Tensorizing Data

In [12]:
# Splitting into Train and Validation set (80/20, fixed seed)
# NOTE(review): the text vocabulary is built from `train`, and the checkpoint
# loaded below is indexed through that vocabulary, so changing the seed or the
# ratio presumably invalidates the checkpoint -- confirm against the training run.
train, val  = train_test_split(df,random_state = 42,test_size = 0.2)
print(train.shape,val.shape)

(5916, 3) (1479, 3)


In [13]:
# TEXT: spaCy-tokenised text field; include_lengths=True is set because
# Model.forward takes the sequence lengths as its second argument.
TEXT = data.Field(tokenize = 'spacy', include_lengths = True)
# LABEL: label field whose values are stored as float tensors.
LABEL = data.LabelField(dtype = torch.float)

In [14]:
# Creating Dataset class 
class DataFrameDataset(data.Dataset):
    """torchtext dataset built from a DataFrame with a ``boilerplate`` text
    column and, unless ``is_test`` is set, a ``label`` column."""

    def __init__(self, df, fields, is_test=False, **kwargs):
        """Turn every row of ``df`` into an Example of ``[text, label]``.

        For test frames the label is ``None`` and the ``label`` column is
        never read. Extra keyword arguments go to ``data.Dataset``.
        """
        texts = df['boilerplate']
        labels = [None] * len(df) if is_test else df['label']
        examples = [
            data.Example.fromlist([text, label], fields)
            for text, label in zip(texts, labels)
        ]

        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        """Sort examples by the length of their ``text`` attribute."""
        return len(ex.text)

    @classmethod
    def splits(cls, fields, train_df, val_df=None, test_df=None, **kwargs):
        """Build one dataset per frame that is not ``None``.

        Returns a tuple in train, val, test order with missing splits left
        out, so its length equals the number of frames supplied.
        """
        # Only the test split passes the positional is_test=True flag.
        specs = (
            (train_df, ()),
            (val_df, ()),
            (test_df, (True,)),
        )
        return tuple(
            cls(frame.copy(), fields, *extra, **kwargs)
            for frame, extra in specs
            if frame is not None
        )

In [15]:
# Creating torch Datasets
# The field order must match the [text, label] order DataFrameDataset uses
# when it builds each Example.
fields = [('text',TEXT),('label',LABEL)]
train_ds, val_ds, test_ds = DataFrameDataset.splits(fields, train_df=train, val_df=val, test_df=test)

In [16]:
# Look at the first test example (index 0): its text is a list of tokens
print(test_ds[0].text)

# Check the type of a single train example
print(type(train_ds[15]))

['homemade', 'enchilada', 'sauce', 'lynn', 'kitchen', 'adventure', 'usually', 'buy', 'enchilada', 'sauce', 'yes', 'knew', 'making', 'never', 'found', 'recipe', 'really', 'happy', 'tried', 'several', 'good', 'stuck', 'canned', 'stuff', 'get', 'grocery', 'store', 'recently', 'talking', 'friend', 'mine', 'lived', 'mexico', 'year', 'know', 'mexican', 'cooking', 'asked', 'made', 'enchilada', 'sauce', 'told', 'basic', 'gave', 'exact', 'recipe', 'decided', 'give', 'try', 'recipe', 'really', 'good', 'best', 'enchilada', 'sauce', 'made', 'great', 'flavor', 'think', 'even', 'better', 'canned', 'sauce', 'husband', 'thought', 'could', 'spicier', 'like', 'enchilada', 'spicy', 'always', 'add', 'chili', 'powder', 'chilies', 'like', 'really', 'spicy', 'kid', 'thought', 'really', 'good', 'like', 'change', 'two', 'thing', 'called', 'green', 'onion', 'used', 'regular', 'onion', 'thought', 'worked', 'great', 'probably', 'continue', 'make', 'way', 'also', 'pureed', 'everything', 'blender', 'wanted', 'smoot

In [17]:
# Loading Glove embeddings and building vocab
# The vocabulary is built from the training split only, capped at MAX_VOCAB_SIZE.
# NOTE(review): the model below is created with 50002 embedding rows --
# presumably MAX_VOCAB_SIZE plus the <unk> and <pad> specials; confirm.
# NOTE(review): nothing in this notebook copies TEXT.vocab.vectors into the
# model; the embedding weights come from the loaded checkpoint.
MAX_VOCAB_SIZE = 50000

TEXT.build_vocab(train_ds, 
                 vectors = 'glove.42B.300d',
                 max_size = MAX_VOCAB_SIZE,
                 unk_init = torch.Tensor.zero_)

.vector_cache/glove.42B.300d.zip: 1.88GB [05:54, 5.30MB/s]                            
100%|█████████▉| 1916531/1917494 [03:30<00:00, 9931.41it/s]

In [18]:
# Build the label vocabulary from the training labels.
# NOTE(review): predict() returns the model's rounded output directly as the
# class; verify via LABEL.vocab.stoi that index 0/1 really corresponds to the
# original label 0/1 before trusting the submission.
LABEL.build_vocab(train_ds)

In [121]:
# Index of the padding token in the text vocabulary (the stored output is 1)
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] 
PAD_IDX

1

# Model

## Architecture 

In [25]:
# Creating a pytorch BiLSTM Model
class Model(nn.Module):
    """(Bi)LSTM text classifier: embedding -> LSTM -> linear head -> sigmoid.

    Args:
        num_emb: number of rows in the embedding table (vocabulary size).
        emb_dim: embedding dimension.
        pad_idx: index of the padding token (its embedding receives no gradient).
        hi_s: LSTM hidden size.
        out_s: number of output units (1 for binary classification).
        num_la: number of stacked LSTM layers.
        bidir: whether the LSTM is bidirectional.
        drop: dropout probability, used between LSTM layers and on the
            concatenated final hidden state of the bidirectional path.

    Submodule names and registration order are unchanged, so existing
    checkpoints still load with ``load_state_dict``.
    """

    def __init__(self, num_emb, emb_dim, pad_idx, hi_s,out_s, num_la, bidir, drop):
        super().__init__()
        self.bidir = bidir

        self.emb = nn.Embedding(num_embeddings=num_emb,
                                embedding_dim=emb_dim,
                                padding_idx=pad_idx)
        self.lstm = nn.LSTM(input_size=emb_dim,
                            hidden_size=hi_s,
                            num_layers=num_la,
                            bidirectional=bidir,
                            dropout=drop)
        if bidir:
            # Projects the concatenated forward/backward states back to hi_s.
            self.fc_bi = nn.Linear(2*hi_s, hi_s)

        self.fc_class = nn.Linear(hi_s, out_s)
        self.dropout = nn.Dropout(drop)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, text_len):
        """Return per-example probabilities.

        Args:
            text: LongTensor of token indices, shape (seq_len, batch).
            text_len: true length of every sequence in the batch (tensor or
                list), sorted in decreasing order as pack_padded_sequence
                requires by default.

        Returns:
            Tensor of sigmoid outputs with shape (batch,) when ``out_s == 1``,
            otherwise (batch, out_s).
        """
        embed = self.emb(text)
        # pack_padded_sequence wants integer lengths on the CPU, whatever
        # device or dtype the caller used.
        lengths = torch.as_tensor(text_len, dtype=torch.int64).cpu()
        pack = nn.utils.rnn.pack_padded_sequence(embed, lengths)
        _, (hid, _) = self.lstm(pack)

        if self.bidir:
            # Last layer's final forward (-2) and backward (-1) hidden states.
            hid = self.dropout(torch.cat((hid[-2], hid[-1]), dim=1))
            hid = self.fc_bi(hid)
        else:
            # Keep only the last layer's state; without this the layer axis
            # leaked into the output as (num_layers, batch).
            hid = hid[-1]

        out = self.fc_class(hid)
        # Squeeze only the feature axis so a batch of one stays 1-D.
        out = out.squeeze(-1)
        return self.sigmoid(out)

## Inference

In [26]:
# Hyperparameters
# These describe the architecture of the saved checkpoint; load_state_dict
# raises on any shape mismatch, so they must match the values used in training.
INPUT_DIM = 50002
EMBED_DIM = 300
HID_DIM = 128
OUT_DIM = 1
N_LAYERS = 4
BIDIR = True
DROP = 0.3
PAD_IDX = 1

# Initialising Model
net = Model(num_emb = INPUT_DIM,
            emb_dim = EMBED_DIM,
            pad_idx = PAD_IDX,
            out_s = OUT_DIM,
            hi_s = HID_DIM,
            num_la = N_LAYERS,
            bidir = BIDIR,
            drop = DROP)
# map_location='cpu' lets a checkpoint saved from a GPU session load on a
# CPU-only runtime; the model is moved to the selected device afterwards.
net.load_state_dict(torch.load("/gdrive/MyDrive/Projects/Stylumia/bestModel.pt",
                               map_location='cpu'))
print(net)

Model(
  (emb): Embedding(50002, 300, padding_idx=1)
  (lstm): LSTM(300, 128, num_layers=4, dropout=0.3, bidirectional=True)
  (fc_bi): Linear(in_features=256, out_features=128, bias=True)
  (fc_class): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (sigmoid): Sigmoid()
)


In [27]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [42]:
# Move the model's parameters to the selected device (the cell output is the module repr)
net.to(device)

Model(
  (emb): Embedding(50002, 300, padding_idx=1)
  (lstm): LSTM(300, 128, num_layers=4, dropout=0.3, bidirectional=True)
  (fc_bi): Linear(in_features=256, out_features=128, bias=True)
  (fc_class): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (sigmoid): Sigmoid()
)

In [105]:
#inference 
import spacy
nlp = spacy.load('en')

def predict(model, sentence):
    """Classify one cleaned text and return the predicted class index (0 or 1).

    Args:
        model: a ``Model`` instance; it is switched to eval mode here.
        sentence: the text to classify, as a string.

    Returns:
        ``int``: the model's probability rounded to 0 or 1.

    NOTE(review): the value is the raw class index the model was trained on;
    check ``LABEL.vocab.stoi`` to confirm it equals the original label.
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]  # tokenize the sentence
    if not tokenized:
        # pack_padded_sequence rejects zero-length sequences, so an empty
        # text is scored as a single unknown token instead of crashing.
        tokenized = [TEXT.unk_token]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]          # convert to integer sequence
    # Shape (seq_len, 1): the LSTM is not batch_first, so batch is the 2nd axis.
    tensor = torch.LongTensor(indexed).unsqueeze(1).to(device)
    # Lengths are integers and stay on the CPU for pack_padded_sequence.
    length_tensor = torch.LongTensor([len(indexed)])
    model.eval()
    with torch.no_grad():                                      # inference only: no autograd graph
        prediction = model(tensor, length_tensor)
    # Model.forward already ends with a sigmoid. Applying torch.sigmoid again
    # squeezed every score into (0.5, 0.73), so any non-zero score rounded to 1.
    return int(torch.round(prediction).item())

In [111]:
# Predict every test row, pairing each urlid with its predicted class.
# The two columns are iterated together instead of using `test.boilerplate[i]`
# for i in range(len(test)): that was a label lookup, which only works while
# the frame's index is exactly 0..n-1.
result = [
    [_urlid, predict(net, _text)]
    for _urlid, _text in zip(test.urlid, test.boilerplate)
]

In [118]:
# Collect the [urlid, prediction] pairs into the submission frame
submit = pd.DataFrame(result,columns=['urlid','predicted'])

In [120]:
# Write the submission to Drive.
# NOTE(review): to_csv is called without index=False, so the frame's integer
# index is written as an extra unnamed first column -- confirm that matches the
# expected submission format.
submit.to_csv('/gdrive/MyDrive/Projects/Stylumia/submit.csv')