In [15]:
import numpy as np
import pandas as pd

from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn import pipeline, metrics
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

## Load Data

In [16]:
sample_path = "./data/sample_submission.csv"
train_path = "./data/train.csv"
test_path = "./data/test.csv"

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_sample = pd.read_csv(sample_path)

print(f"Train length : {len(df_train)}")
print(f"Test length : {len(df_test)}")

Train length : 7613
Test length : 3263


In [17]:
df_train.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [18]:
df_train.location.unique()

array([nan, 'Birmingham', 'Est. September 2012 - Bristol', ...,
       'Vancouver, Canada', 'London ', 'Lincoln'], dtype=object)

In [19]:
from sklearn.model_selection import train_test_split

df_train, df_val = train_test_split(df_train, test_size=0.2)
len(df_train), len(df_val)

(6090, 1523)

In [20]:
df_train.target.unique()

array([1, 0])

In [21]:
y_train, x_train = list(df_train.target), list(df_train.text)
y_val, x_val = list(df_val.target), list(df_val.text)

## TFIDF and CountVectorizer

In [22]:
def LogLoss(target, pred, epsilon=1e-5):
    """
    Use sklearn.metrics.log_loss built in function instead
    """
    clip = np.clip(pred, epsilon, 1-epsilon)
    N, M = pred.shape
    y_onehot = np.zeros((N, M))
    for i, val in enumerate(target):
        y_onehot[i, val] = 1
    logLoss = -1/N * np.sum(y_onehot*np.log(clip))
    return logLoss

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                     analyzer='word', token_pattern=r'\w{1,}',
                     ngram_range=(1,3), use_idf=1, smooth_idf=1,
                     sublinear_tf=1, stop_words='english')

tfv.fit(x_train + x_val) # semi-supervised learning
xtrain_tfv = tfv.transform(x_train)
xval_tfv = tfv.transform(x_val)
xtrain_tfv.shape, xval_tfv.shape

((6090, 9229), (1523, 9229))

In [24]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_tfv, y_train)
pred_tfv = clf.predict_proba(xval_tfv)
print(f"LogLoss = {LogLoss(y_val, pred_tfv)}")

LogLoss = 0.49150263459876953


In [25]:
ctv = CountVectorizer(analyzer="word", token_pattern=r'\w{1,}',
                      ngram_range=(1,3), stop_words="english")
ctv.fit(x_train+x_val)
xtrain_ctv = ctv.transform(x_train)
xval_ctv = ctv.transform(x_val)

clf = LogisticRegression(C=1.0)
clf.fit(xtrain_ctv, y_train)
pred_ctv = clf.predict_proba(xval_ctv)
print(f"LogLoss = {LogLoss(y_val, pred_ctv)}")

LogLoss = 0.46888304010401244


## XGBoost 

In [26]:
import xgboost as xgb
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, 
                        colsample_bytree=0.8, subsample=0.8,
                        nthread=10, learning_rate=0.1)
clf.fit(xtrain_tfv, y_train)
pred_xg = clf.predict_proba(xval_tfv)
print(f"LogLoss = {metrics.log_loss(y_val, pred_xg)}")

LogLoss = 0.5015601728955994


## Grid Search with SVD and Naive Bayes

In [27]:
mll_scorer = metrics.make_scorer(metrics.log_loss, greater_is_better=False, needs_proba=True)

In [29]:
svd = TruncatedSVD()
scl = StandardScaler()
lr_model = LogisticRegression()

clf = pipeline.Pipeline([
        ('svd', svd),
        ('slc', scl),
        ('lr', lr_model),
    ])

param_grid = {
    'svd__n_components' : [120, 180],
    'lr__C' : [0.1, 1.0, 10],
}

model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer, verbose=10, n_jobs=-1, refit=True, cv=2)
model.fit(xtrain_tfv, y_train)

print(f"Best score: {-model.best_score_}")
print("Best parameters set: ")
best_params = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print(f"{param_name} : {best_params[param_name]}")

Fitting 2 folds for each of 6 candidates, totalling 12 fits
Best score: 0.5101428326321005
Best parameters set: 
lr__C : 0.1
svd__n_components : 180


In [30]:
nb_model = MultinomialNB()

clf = pipeline.Pipeline([
        ('nb', nb_model),
    ])

param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer, verbose=10, n_jobs=-1, refit=True, cv=2)
model.fit(xtrain_tfv, y_train)

print(f"Best score: {-model.best_score_}")
print("Best parameters set: ")
best_params = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print(f"{param_name} : {best_params[param_name]}")

Fitting 2 folds for each of 6 candidates, totalling 12 fits
Best score: 0.4915541852700148
Best parameters set: 
nb__alpha : 1


## BERT embeddings

In [31]:
import torch
from transformers import BertTokenizer, BertModel

import logging
import matplotlib.pyplot as plt

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [32]:
tokenizer.encode_plus

<bound method PreTrainedTokenizerBase.encode_plus of PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})>

In [33]:
%%capture
model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [34]:
def process(text): 
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]
    token_embeddings = torch.stack(hidden_states, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    token_embeddings = token_embeddings.permute(1,0,2)
    token_vecs = hidden_states[-2][0]
    sentence_embedding = torch.mean(token_vecs, dim=0)
    return sentence_embedding

In [101]:
%%time
import torch
from tqdm import tqdm
batch_size = 16
train_dataloader = torch.utils.data.DataLoader(x_train, batch_size=batch_size, shuffle=False, num_workers=8)
val_dataloader = torch.utils.data.DataLoader(x_val, batch_size=batch_size, shuffle=False, num_workers=8)
N, M = len(x_train), len(x_val)

def get_bert_embedding(dataloader, size):
    x_bert = []
    for batch_texts in tqdm(dataloader):
        batch_texts = np.array(batch_texts)
        x_bert.extend(list(map(process, batch_texts)))
    x_bert = torch.cat(x_bert, axis=0)
    x_bert = x_bert.reshape(size, -1)
    return x_bert

xtrain_bert = get_bert_embedding(train_dataloader, N)
ytrain_tensor = torch.tensor(y_train, dtype=torch.int64)

xval_bert = get_bert_embedding(val_dataloader, M)
yval_tensor = torch.tensor(y_val, dtype=torch.int64)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 381/381 [05:45<00:00,  1.10it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 96/96 [01:27<00:00,  1.10it/s]

CPU times: user 28min 42s, sys: 2.22 s, total: 28min 44s
Wall time: 7min 12s





In [116]:
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


learning_rate = 0.01
batch_size = 512
num_epoch = 2


if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

class ClumsyDataset(Dataset):
    def __init__(self, text, target):
        self.text = text
        self.target = target
        
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, idx):
        return self.text[idx], self.target[idx]

class Clumsy(nn.Module):
    def __init__(self):
        super(Clumsy, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(768, 300),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            nn.Linear(300, 300),
            nn.ReLU(),
            nn.Dropout(0.3),
            
            nn.Linear(300, 128),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(128, 2),
        )
    def forward(self, x):
        return self.model(x)
    
clumsyModel = Clumsy().to(device)
clumsyTrainDataset = ClumsyDataset(xtrain_bert, ytrain_tensor)
clumsyValDataset = ClumsyDataset(xval_bert, yval_tensor)
train_loader = DataLoader(clumsyTrainDataset)
val_loader = DataLoader(clumsyValDataset)

optimizer = torch.optim.Adam(clumsyModel.parameters(), lr=learning_rate, weight_decay=0.9)
criterion = nn.CrossEntropyLoss()

In [117]:
clumsyModel.train()
for epoch in range(num_epoch):
    for batch_idx, (inp, target) in enumerate(train_loader):
        inp, target = inp.to(device), target.to(device)
        optimizer.zero_grad()
        out = clumsyModel(inp)
        loss = criterion(out, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 1000 == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(inp), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))



In [105]:
clumsyModel.eval()
test_loss = 0
with torch.no_grad():
    for batch_idx, (inp, target) in enumerate(val_loader):
        inp, target = inp.to(device), target.to(device)
        out = clumsyModel(inp)
        loss = criterion(out, target)
        test_loss += loss.item()
test_loss /= len(val_loader)        
print('\nTest set: Average loss: {:.4f}'.format(
        test_loss))


Test set: Average loss: 0.7013


In [None]:
# from joblib import Parallel, delayed
# import multiprocessing
# import time

# num_cores = multiprocessing.cpu_count()
# def process(i):
#     return i ** 2

In [None]:
# %%time    
# results = Parallel(n_jobs=3, verbose=5)(delayed(process)(i) for i in range(num))

In [None]:
# %%timeit
# num = 100000
# results = [process(i) for i in range(num)]

In [None]:
# %%timeit
# import numpy as np
# arr = np.array(range(num))
# results = list(map(process, arr))