In [98]:
import numpy as np
import pandas as pd

from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn import pipeline, metrics
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

In [58]:
# Locations of the competition CSV files, relative to the notebook.
train_path = "./data/train.csv"
test_path = "./data/test.csv"
sample_path = "./data/sample_submission.csv"

# Read the three files into DataFrames in one pass.
df_train, df_test, df_sample = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path, sample_path)
)

# Report the row count of the train and test frames.
n_train_rows, n_test_rows = len(df_train), len(df_test)
print(f"Train length : {n_train_rows}")
print(f"Test length : {n_test_rows}")

Train length : 7613
Test length : 3263


In [59]:
# Preview the first 10 rows of the training frame.
df_train.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [60]:
# Distinct values found in the `location` column (NaN included).
df_train["location"].unique()

array([nan, 'Birmingham', 'Est. September 2012 - Bristol', ...,
       'Vancouver, Canada', 'London ', 'Lincoln'], dtype=object)

In [61]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation.
# `random_state` pins the shuffle so the split -- and every validation score
# computed from it further down -- is the same on each Restart & Run All.
# NOTE: this rebinds `df_train` to the 80% portion, so running this cell a
# second time without re-running the loading cell shrinks it again.
df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=42)
len(df_train), len(df_val)

(6090, 1523)

In [62]:
# Distinct label values present in the training portion.
df_train["target"].unique()

array([1, 0])

In [63]:
# Pull the tweet text (inputs) and the labels out of each frame as plain lists.
x_train = df_train["text"].tolist()
y_train = df_train["target"].tolist()
x_val = df_val["text"].tolist()
y_val = df_val["target"].tolist()

## TF-IDF and CountVectorizer

In [71]:
def LogLoss(target, pred, epsilon=1e-5):
    """
    Mean multi-class logarithmic loss (cross-entropy) of predicted probabilities.

    Prefer the built-in ``sklearn.metrics.log_loss``; this helper is a small,
    transparent reference implementation.

    Parameters
    ----------
    target : array-like of int, shape (N,)
        True class index of every sample. Each value is used as a column
        index into ``pred``.
    pred : array-like of float, shape (N, M) or (N,)
        Predicted probabilities, one row per sample and one column per class.
        A 1-D input is treated as the probability of class 1 in a binary
        problem and expanded to the two columns ``[1 - p, p]``.
    epsilon : float, default 1e-5
        Probabilities are clipped to ``[epsilon, 1 - epsilon]`` before the
        logarithm is taken so that a probability of exactly 0 cannot produce
        an infinite loss.

    Returns
    -------
    numpy.float64
        ``-1/N * sum_i log(pred[i, target[i]])`` computed on the clipped values.

    Raises
    ------
    ValueError
        If ``pred`` is neither 1-D nor 2-D, if it holds no samples, or if
        ``target`` and ``pred`` do not have the same number of samples.
    IndexError
        If a value in ``target`` is not a valid column index of ``pred``.
    """
    probs = np.asarray(pred, dtype=float)
    if probs.ndim == 1:
        # Binary shortcut: a vector of P(class 1) becomes an (N, 2) matrix.
        probs = np.column_stack((1.0 - probs, probs))
    elif probs.ndim != 2:
        raise ValueError(f"pred must be 1-D or 2-D, got {probs.ndim}-D")

    labels = np.asarray(target, dtype=np.intp).ravel()
    n_samples = probs.shape[0]
    if n_samples == 0:
        raise ValueError("LogLoss is undefined for an empty prediction array")
    if labels.shape[0] != n_samples:
        # Without this check a shorter `target` would silently yield a wrong mean.
        raise ValueError(
            f"target has {labels.shape[0]} samples but pred has {n_samples}"
        )

    clipped = np.clip(probs, epsilon, 1 - epsilon)
    # Select the probability assigned to the true class of every row; this is
    # equivalent to multiplying by a one-hot matrix and summing, without
    # materialising that matrix.
    true_class_probs = clipped[np.arange(n_samples), labels]
    return -np.mean(np.log(true_class_probs))

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# TF-IDF over word 1- to 3-grams, keeping only terms that occur in at least
# 3 documents. The on/off switches are passed as real booleans: recent
# scikit-learn releases validate parameter types and reject the integer `1`
# that was used here before.
tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                      analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 3), use_idf=True, smooth_idf=True,
                      sublinear_tf=True, stop_words='english')

# The vocabulary and IDF weights are fitted on train + validation text.
# No labels are involved, but the validation tweets still influence the
# features, so validation scores may be slightly optimistic.
tfv.fit(x_train + x_val)
xtrain_tfv = tfv.transform(x_train)
xval_tfv = tfv.transform(x_val)
xtrain_tfv.shape, xval_tfv.shape

((6090, 9229), (1523, 9229))

In [66]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression on the TF-IDF features, scored by log loss
# on the validation split.
clf = LogisticRegression(C=1.0).fit(xtrain_tfv, y_train)
pred_tfv = clf.predict_proba(xval_tfv)
print(f"LogLoss = {LogLoss(y_val, pred_tfv)}")

LogLoss = 0.5041657186736527


In [67]:
# Raw term counts over word 1- to 3-grams, fitted on train + validation text.
ctv = CountVectorizer(
    analyzer="word",
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words="english",
).fit(x_train + x_val)
xtrain_ctv, xval_ctv = ctv.transform(x_train), ctv.transform(x_val)

# Same logistic-regression baseline as before, now on the count features.
clf = LogisticRegression(C=1.0).fit(xtrain_ctv, y_train)
pred_ctv = clf.predict_proba(xval_ctv)
print(f"LogLoss = {LogLoss(y_val, pred_ctv)}")

LogLoss = 0.4827469320349906


## XGBoost 

In [90]:
import xgboost as xgb

# Gradient-boosted trees on the TF-IDF features, scored with scikit-learn's
# built-in log loss on the validation split.
clf = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=7,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=10,
)
clf.fit(xtrain_tfv, y_train)
pred_xg = clf.predict_proba(xval_tfv)
print(f"LogLoss = {metrics.log_loss(y_val, pred_xg)}")

LogLoss = 0.4996628081202825


## Grid Search with SVD and Naive Bayes

In [100]:
# Scorer handed to GridSearchCV below. Scorers follow a "greater is better"
# convention, so this one returns the *negated* log loss computed on
# predict_proba output; the cells below flip the sign of best_score_ again
# before printing it.
# The built-in "neg_log_loss" scorer is the same scorer that
# make_scorer(metrics.log_loss, greater_is_better=False, needs_proba=True)
# used to build, but it does not depend on the `needs_proba` keyword, which
# newer scikit-learn releases deprecated and later removed.
mll_scorer = metrics.get_scorer("neg_log_loss")

In [101]:
# Pipeline: TF-IDF matrix -> truncated SVD -> standardisation -> logistic regression.
# TruncatedSVD's default solver is randomized, so the seed is pinned to make
# the cross-validation scores reproducible between runs.
svd = TruncatedSVD(random_state=42)
scl = StandardScaler()
lr_model = LogisticRegression()

clf = pipeline.Pipeline([
        ('svd', svd),
        ('slc', scl),
        ('lr', lr_model),
    ])

# Grid keys use the "<step name>__<parameter>" convention of Pipeline.
param_grid = {
    'svd__n_components' : [120, 180],
    'lr__C' : [0.1, 1.0, 10],
}

# 2-fold CV over the 6 combinations; refit=True retrains the best pipeline
# on all of xtrain_tfv afterwards.
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer, verbose=10, n_jobs=-1, refit=True, cv=2)
model.fit(xtrain_tfv, y_train)

# best_score_ holds the negated log loss, hence the sign flip.
print(f"Best score: {-model.best_score_}")
print("Best parameters set: ")
best_params = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print(f"{param_name} : {best_params[param_name]}")

Fitting 2 folds for each of 6 candidates, totalling 12 fits
Best score: 0.5079954924102028
Best parameters set: 
lr__C : 1.0
svd__n_components : 180
[CV 2/2; 1/6] START nb__alpha=0.001.............................................
[CV 2/2; 1/6] END .............nb__alpha=0.001;, score=-1.196 total time=   0.0s
[CV 1/2; 5/6] START nb__alpha=10................................................
[CV 1/2; 5/6] END ................nb__alpha=10;, score=-0.609 total time=   0.0s
[CV 1/2; 6/6] START nb__alpha=100...............................................
[CV 1/2; 6/6] END ...............nb__alpha=100;, score=-0.672 total time=   0.0s
[CV 2/2; 1/6] START nb__alpha=0.001.............................................
[CV 2/2; 1/6] END .............nb__alpha=0.001;, score=-1.196 total time=   0.0s
[CV 2/2; 5/6] START nb__alpha=10................................................
[CV 2/2; 5/6] END ................nb__alpha=10;, score=-0.611 total time=   0.0s
[CV 2/2; 3/6] START nb__alpha=0.1........

In [97]:
# Multinomial Naive Bayes on the TF-IDF features, tuning only the
# smoothing strength `alpha`.
nb_model = MultinomialNB()
clf = pipeline.Pipeline([('nb', nb_model)])

param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

model = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring=mll_scorer,
    cv=2,
    refit=True,
    n_jobs=-1,
    verbose=10,
)
model.fit(xtrain_tfv, y_train)

# best_score_ holds the negated log loss, hence the sign flip.
print(f"Best score: {-model.best_score_}")
print("Best parameters set: ")
best_params = model.best_estimator_.get_params()
for tuned_name in sorted(param_grid):
    print(f"{tuned_name} : {best_params[tuned_name]}")

Fitting 2 folds for each of 6 candidates, totalling 12 fits
Best score: 0.49063318537754314
Best parameters set: 
nb__alpha : 1


## Word Embeddings

In [103]:
# Record the installed transformers version alongside the results below.
import transformers
print(transformers.__version__)

4.25.1


In [104]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the model loaded further down.
model_checkpoint = "bert-base-cased"
# Load the tokenizer that belongs to the checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [124]:
from datasets import load_dataset
import warnings
# NOTE(review): this silences every warning category from here on, which can
# also hide useful deprecation notices -- consider narrowing the filter.
warnings.filterwarnings("ignore")

# Load the raw CSV files with the Hugging Face `datasets` CSV loader.
# NOTE(review): train_path points at the full train.csv, not at the 80/20
# split made earlier with train_test_split, so rows of df_val are part of
# train_dataset -- confirm this is intended before comparing scores.
train_dataset = load_dataset("csv", data_files=train_path)
test_dataset = load_dataset("csv", data_files=test_path)

Using custom data configuration default-80461f5a0f47ecd9
Found cached dataset csv (/home/hainam/.cache/huggingface/datasets/csv/default-80461f5a0f47ecd9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-2842d9be0ed29a04
Found cached dataset csv (/home/hainam/.cache/huggingface/datasets/csv/default-2842d9be0ed29a04/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

In [126]:
# Peek at the first 5 records of the loaded "train" split (returned column-wise).
train_dataset["train"][:5]

{'id': [1, 4, 5, 6, 7],
 'keyword': [None, None, None, None, None],
 'location': [None, None, None, None, None],
 'text': ['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
  'Forest fire near La Ronge Sask. Canada',
  "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
  '13,000 people receive #wildfires evacuation orders in California ',
  'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school '],
 'target': [1, 1, 1, 1, 1]}

In [134]:
def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples, with truncation enabled.

    `examples` is indexed with the "text" key and the result is passed
    straight to the module-level `tokenizer`; whatever the tokenizer returns
    is handed back to the caller unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


# Apply the tokenization to the training dataset in batches.
encoded_train_dataset = train_dataset.map(preprocess_function, batched=True)

  0%|          | 0/8 [00:00<?, ?ba/s]

In [143]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Load the checkpoint with a 2-label sequence-classification head.
# Note: this rebinds `model`, which held the GridSearchCV object before.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [None]:
metric_name = "accuracy"
model_name = model_checkpoint.split("/")[-1]
batch_size = 16
# `task` is only used to build the name passed as the first argument of
# TrainingArguments. It was referenced without being defined anywhere in the
# notebook, which made this cell fail with a NameError, so it is defined here.
task = "disaster-tweets"


args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    # Evaluate and checkpoint on the same schedule (once per epoch).
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # NOTE(review): selecting the best model by "accuracy" presumably needs the
    # Trainer to be given a compute_metrics function that reports it -- confirm.
    metric_for_best_model=metric_name,
    # NOTE(review): pushing to the Hub presumably requires being logged in -- confirm.
    push_to_hub=True,
)