In [146]:
import numpy as np
import pandas as pd

from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn import pipeline, metrics
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [124]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [125]:
sample_path = "./data/sample_submission.csv"
train_path = "./data/train.csv"
test_path = "./data/test.csv"

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_sample = pd.read_csv(sample_path)

print(f"Train length : {len(df_train)}")
print(f"Test length : {len(df_test)}")

Train length : 7613
Test length : 3263


In [126]:
df_train.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [127]:
df_train.location.unique()

array([nan, 'Birmingham', 'Est. September 2012 - Bristol', ...,
       'Vancouver, Canada', 'London ', 'Lincoln'], dtype=object)

In [128]:
from sklearn.model_selection import train_test_split

df_train, df_val = train_test_split(df_train, test_size=0.2)
len(df_train), len(df_val)

(6090, 1523)

In [129]:
df_train.target.unique()

array([1, 0])

In [130]:
y_train, x_train = list(df_train.target), list(df_train.text)
y_val, x_val = list(df_val.target), list(df_val.text)

## TFIDF and CountVectorizer

In [131]:
def LogLoss(target, pred, epsilon=1e-5):
    """
    Use sklearn.metrics.log_loss built in function instead
    """
    clip = np.clip(pred, epsilon, 1-epsilon)
    N, M = pred.shape
    y_onehot = np.zeros((N, M))
    for i, val in enumerate(target):
        y_onehot[i, val] = 1
    logLoss = -1/N * np.sum(y_onehot*np.log(clip))
    return logLoss

In [132]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                     analyzer='word', token_pattern=r'\w{1,}',
                     ngram_range=(1,3), use_idf=1, smooth_idf=1,
                     sublinear_tf=1, stop_words='english')

tfv.fit(x_train + x_val) # semi-supervised learning
xtrain_tfv = tfv.transform(x_train)
xval_tfv = tfv.transform(x_val)
xtrain_tfv.shape, xval_tfv.shape

((6090, 9229), (1523, 9229))

In [133]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_tfv, y_train)
pred_tfv = clf.predict_proba(xval_tfv)
print(f"LogLoss = {LogLoss(y_val, pred_tfv)}")

LogLoss = 0.4941405696220364


In [134]:
ctv = CountVectorizer(analyzer="word", token_pattern=r'\w{1,}',
                      ngram_range=(1,3), stop_words="english")
ctv.fit(x_train+x_val)
xtrain_ctv = ctv.transform(x_train)
xval_ctv = ctv.transform(x_val)

clf = LogisticRegression(C=1.0)
clf.fit(xtrain_ctv, y_train)
pred_ctv = clf.predict_proba(xval_ctv)
print(f"LogLoss = {LogLoss(y_val, pred_ctv)}")

LogLoss = 0.4630015513684077


## XGBoost 

In [135]:
import xgboost as xgb
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, 
                        colsample_bytree=0.8, subsample=0.8,
                        nthread=10, learning_rate=0.1)
clf.fit(xtrain_tfv, y_train)
pred_xg = clf.predict_proba(xval_tfv)
print(f"LogLoss = {metrics.log_loss(y_val, pred_xg)}")

LogLoss = 0.48975354981090746


## Grid Search with SVD and Naive Bayes

In [136]:
mll_scorer = metrics.make_scorer(metrics.log_loss, greater_is_better=False, needs_proba=True)

In [137]:
svd = TruncatedSVD()
scl = StandardScaler()
lr_model = LogisticRegression()

clf = pipeline.Pipeline([
        ('svd', svd),
        ('slc', scl),
        ('lr', lr_model),
    ])

param_grid = {
    'svd__n_components' : [120, 180],
    'lr__C' : [0.1, 1.0, 10],
}

model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer, verbose=10, n_jobs=-1, refit=True, cv=2)
model.fit(xtrain_tfv, y_train)

print(f"Best score: {-model.best_score_}")
print("Best parameters set: ")
best_params = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print(f"{param_name} : {best_params[param_name]}")

Fitting 2 folds for each of 6 candidates, totalling 12 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already 

In [138]:
nb_model = MultinomialNB()

clf = pipeline.Pipeline([
        ('nb', nb_model),
    ])

param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer, verbose=10, n_jobs=-1, refit=True, cv=2)
model.fit(xtrain_tfv, y_train)

print(f"Best score: {-model.best_score_}")
print("Best parameters set: ")
best_params = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print(f"{param_name} : {best_params[param_name]}")

Fitting 2 folds for each of 6 candidates, totalling 12 fits
Best score: 0.49018914761693233
Best parameters set: 
nb__alpha : 1


## Try to fine tuned BERT ---> failed

In [139]:
import transformers
print(transformers.__version__)

4.25.1


In [168]:
from datasets import load_dataset
import warnings
warnings.filterwarnings("ignore")

train_dataset = load_dataset("csv", data_files=train_path)
test_dataset = load_dataset("csv", data_files=test_path)

Using custom data configuration default-80461f5a0f47ecd9
Found cached dataset csv (/home/hainam/.cache/huggingface/datasets/csv/default-80461f5a0f47ecd9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-2842d9be0ed29a04
Found cached dataset csv (/home/hainam/.cache/huggingface/datasets/csv/default-2842d9be0ed29a04/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

In [169]:
train_dataset["train"] = train_dataset["train"].remove_columns(['id', 'keyword', 'location'])
train_dataset["train"] = train_dataset["train"].rename_column("target", "label")

train_val_dataset = train_dataset['train'].train_test_split(test_size=0.2)

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)                                                           

In [None]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

encoded_dataset = train_val_dataset.map(preprocess_function, batched=True)
encoded_dataset

In [176]:
from datasets import load_metric
f1_metric = load_metric("f1")
metric_name = "f1"

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

model_name = model_checkpoint.split("/")[-1]
batch_size = 16

args = TrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    label_names=["label"],
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=True,
)

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
