In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch

# Use the CUDA device when PyTorch reports one as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Label column of train_df that the classifier in this notebook is trained on.
TARGET_COLUMN = "harsh"

# Load Dataset

In [None]:
import pandas as pd

# Read the training CSV from the attached Kaggle dataset and report its (rows, columns).
train_df = pd.read_csv('/kaggle/input/harshdetection/train.csv')
print('Train Shape: {}'.format(train_df.shape))

In [None]:
# Preview the first five rows of the training frame.
train_df.head(5)

# Imbalanced Data

In [None]:
# Print the class distribution of every column except the 'id' and 'text' columns.
for col in train_df.columns:
    if col not in ('id', 'text'):
        print(f'__{col}__', train_df[col].value_counts(), sep='\n')

In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

def over_sample(label, permission=0):
    """Return the text column and one label column, optionally class-rebalanced.

    Args:
        label: Name of the column in ``train_df`` to use as the target.
        permission: Resampling mode. ``1`` applies ``RandomOverSampler``,
            ``2`` applies ``RandomUnderSampler``; any other value (including
            the default ``0``) returns the data untouched.

    Returns:
        Tuple ``(X_sampled, y_sampled)``: the texts and the matching labels.
    """
    X_sampled = train_df['text']
    y_sampled = train_df[label]

    # Only build the sampler that was actually requested.
    if permission == 1:
        sampler = RandomOverSampler(random_state=0)
    elif permission == 2:
        sampler = RandomUnderSampler(random_state=0)
    else:
        return X_sampled, y_sampled

    # The imblearn samplers validate X as 2-D (n_samples, n_features); a bare
    # Series is rejected, so resample a one-column frame and unwrap it after.
    X_frame, y_sampled = sampler.fit_resample(train_df[['text']], y_sampled)
    return X_frame['text'], y_sampled

In [None]:
# Per-label lookup tables: label column name -> texts / labels returned by over_sample.
train_X_dict, train_y_dict = {}, {}

for col in train_df.columns:
    # 'id' and 'text' are not label columns, so they get no entry.
    if col not in ('id', 'text'):
        train_X_dict[col], train_y_dict[col] = over_sample(col)


# Torch Dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the target column's samples for validation (fixed seed for repeatability).
X_train, X_cv, y_train, y_cv = train_test_split(
    train_X_dict[TARGET_COLUMN],
    train_y_dict[TARGET_COLUMN],
    test_size=0.2,
    random_state=42,
)

# Column-oriented mappings with 'text' and 'label' keys for each split.
train_dataset = dict(text=X_train, label=y_train)
cv_dataset = dict(text=X_cv, label=y_cv)

In [None]:
from datasets import Dataset
from datasets import DatasetDict

# Wrap both splits into one DatasetDict keyed by split name ('train' / 'cv').
dataset = DatasetDict({
    split_name: Dataset.from_dict(split_columns)
    for split_name, split_columns in (('train', train_dataset), ('cv', cv_dataset))
})

# BERT Tokenizer

In [None]:
# Checkpoint name passed to both the tokenizer and the classifier's from_pretrained calls.
BERT_MODEL = "bert-base-uncased"

In [None]:
from transformers import AutoTokenizer
# Tokenizer matching the checkpoint named by BERT_MODEL.
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)


def preprocess(dataset):
    """Tokenize the ``"text"`` field of ``dataset`` with truncation enabled.

    Args:
        dataset: Mapping-like object exposing a ``"text"`` entry (here, the
            batches that ``Dataset.map`` passes in).

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = dataset["text"]
    return tokenizer(texts, truncation=True)

In [None]:
# Apply preprocess to every split, handing it batches of rows rather than single rows.
tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
)

# Data Collator

In [None]:
from transformers import DataCollatorWithPadding

# Collator built around the same tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

# Eval Metrics

In [None]:
!pip install evaluate

In [None]:
import evaluate

# Accuracy metric object loaded through the evaluate library.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; predictions are reduced to
            class ids by taking the argmax along axis 1.

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# BERT Classifier

In [None]:
# Class-id <-> class-name mappings handed to the model config.
# (Fixes the misspelled `TAREGT_COLUMN`, which raised NameError.)
id2label = {0: f"NOT_{TARGET_COLUMN}", 1: f"{TARGET_COLUMN}"}
# Derive the inverse mapping from id2label so the two can never disagree.
label2id = {label_name: class_id for class_id, label_name in id2label.items()}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the BERT_MODEL checkpoint with a two-class sequence-classification head
# and attach the label mappings defined above.
model = AutoModelForSequenceClassification.from_pretrained(
    BERT_MODEL,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Explicit device placement is left disabled:
# model.to(device)

In [None]:
# Hyper-parameters for the fine-tuning run; output goes under a per-target directory.
# (The comma after `output_dir=...` was missing, which made this cell a SyntaxError.)
training_args = TrainingArguments(
    output_dir=f'/kaggle/working/BERT-BASE-{TARGET_COLUMN}',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    # Evaluate and save once per epoch, and reload the best checkpoint when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Gather everything the Trainer needs: model, arguments, tokenized splits,
# tokenizer, padding collator and the accuracy metric callback.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["cv"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Start fine-tuning; as the last expression, the result is displayed under the cell.
trainer.train()

In [None]:
# text = 'Sometimes, growth comes from embracing the uncomfortable edges of feedback.'

# from transformers import pipeline

# classifier = pipeline("sentiment-analysis", model=f"/kaggle/working/BERT-BASE-{TARGET_COLUMN}")
# classifier(text)

In [None]:
# !zip -r 'bert-for-harsh.zip' '/kaggle/working/BERT-BASE-harsh/checkpoint-8936'