In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
tqdm.pandas()

train_df = pd.read_csv('/kaggle/input/amazon-pet-product-reviews-classification/train.csv')
test_df = pd.read_csv('/kaggle/input/amazon-pet-product-reviews-classification/test.csv')
val_df = pd.read_csv('/kaggle/input/amazon-pet-product-reviews-classification/valid.csv')

unlabeled = pd.read_csv('/kaggle/input/amazon-pet-product-reviews-classification/unlabeled.csv')
sample_submission = pd.read_csv('/kaggle/input/amazon-pet-product-reviews-classification/sample_submission.csv')

In [None]:
train_df.shape, test_df.shape, val_df.shape, unlabeled.shape

In [None]:
train_df['label'].value_counts()

In [None]:
val_df['label'].value_counts()

In [None]:
sample_submission.head()

In [None]:
# Give every distinct training label an integer id, numbered in sorted label
# order, and keep the inverse mapping for decoding predicted ids.
sorted_labels = sorted(train_df['label'].unique())
ids_to_labels = dict(enumerate(sorted_labels))
labels_to_ids = {label: idx for idx, label in ids_to_labels.items()}

labels_to_ids, ids_to_labels

In [None]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression

import re

In [None]:
def generate_features(df_, return_vocab=False):
    """Clean, tokenize, remove stopwords from and stem the ``text`` column.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame with a ``text`` column. The frame is copied; the caller's
        object is left untouched.
    return_vocab : bool, default False
        When true, additionally return the set of distinct stemmed tokens
        found in ``df_`` and print its size.

    Returns
    -------
    pandas.DataFrame or tuple of (pandas.DataFrame, set)
        A copy of ``df_`` with two extra columns: ``prepared_text`` (list of
        stemmed tokens for each row) and ``joined_prepared_text`` (the same
        tokens joined with single spaces). With ``return_vocab=True`` the
        vocabulary set is returned as a second element.
    """
    df = df_.copy()
    # A set makes the per-token stopword lookup constant-time.
    stops = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # Missing texts become empty strings so the regex below never sees a
    # non-string value.
    texts = df['text'].fillna('').astype(str)

    # removing special characters (everything that is not an ASCII letter)
    df['prepared_text'] = texts.progress_apply(lambda text: re.sub('[^A-Za-z]', ' ', text))
    # transform text to lowercase
    df['prepared_text'] = df['prepared_text'].str.lower()
    # tokenize the texts
    df['prepared_text'] = df['prepared_text'].progress_apply(lambda text: word_tokenize(text))
    # removing stopwords
    df['prepared_text'] = df['prepared_text'].progress_apply(lambda words: [word for word in words if word not in stops])
    # stemming
    df['prepared_text'] = df['prepared_text'].progress_apply(lambda words: [stemmer.stem(word) for word in words])

    # join prepared_text to use as corpus
    df['joined_prepared_text'] = df['prepared_text'].progress_apply(lambda words: " ".join(words))

    if return_vocab:
        # Build the vocabulary from the frame processed here (not from a
        # global frame); the comprehension also works when the frame is empty.
        vocabulary = {word for words in df['prepared_text'] for word in words}
        print(f"There are {len(vocabulary)} words in vocabulary")

        return df, vocabulary

    return df

In [None]:
train = generate_features(train_df)

In [None]:
val = generate_features(val_df)

In [None]:
test = generate_features(test_df)

In [None]:
corpus = train['joined_prepared_text'].values
corpus[:2]

In [None]:
val_corpus = val['joined_prepared_text'].values
val_corpus[:2]

In [None]:
test_corpus = test['joined_prepared_text'].values
test_corpus[:2]

In [None]:
vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(corpus)
X.shape

In [None]:
y = train_df['label'].map(labels_to_ids).values

In [None]:
clf = LogisticRegression(random_state=42, max_iter=200).fit(X, y)

In [None]:
y_pred = clf.predict(X)

In [None]:
labels = list(ids_to_labels.values())

labels

In [None]:
print(classification_report(y, y_pred, target_names=labels))

In [None]:
X_val = vectorizer.transform(val_corpus)
X_val.shape

In [None]:
y_true = val['label'].map(labels_to_ids).values
y_pred = clf.predict(X_val)

In [None]:
print(classification_report(y_true, y_pred, target_names=labels))

In [None]:
test.head()

In [None]:
X_test = vectorizer.transform(test_corpus)
X_test.shape

In [None]:
y_pred = clf.predict(X_test)

In [None]:
sample_submission['label'] = y_pred

In [None]:
sample_submission['label'] = sample_submission['label'].map(ids_to_labels)

In [None]:
sample_submission['label'].value_counts()

In [None]:
sample_submission.to_csv('baseline_submission.csv', index=None, header=True)

In [None]:
from transformers import (AutoModelForSequenceClassification, AdamW, 
                          Trainer, TrainingArguments, PreTrainedTokenizerFast,
                          EarlyStoppingCallback, AutoTokenizer)

import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader

from sklearn.metrics import classification_report

In [None]:
train_df = train_df.sample(1000, random_state=42)

In [None]:
val_df = val_df.sample(500, random_state=42)

In [None]:
train_df['label'].value_counts()

In [None]:
val_df['label'].value_counts()

In [None]:
X_train = train_df['text'].tolist()
X_val = val_df['text'].tolist()

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [None]:
train_tokenized_batch = tokenizer(X_train, truncation=True, max_length=256, padding=True, return_tensors='pt')
val_tokenized_batch = tokenizer(X_val, truncation=True, max_length=256, padding=True, return_tensors='pt')

In [None]:
train_tokenized_batch['labels'] = train_df['label'].map(labels_to_ids).tolist()
val_tokenized_batch['labels'] = val_df['label'].map(labels_to_ids).tolist()

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over an already tokenized batch that carries labels."""

    # Keys copied from the tokenized batch into every returned example.
    _FIELDS = ('input_ids', 'attention_mask', 'labels')

    def __init__(self, data):
        # `data` must support data[key][idx] for every key in _FIELDS.
        self.tokenized_batch = data

    def __len__(self):
        # One example per entry of `input_ids`.
        return len(self.tokenized_batch['input_ids'])

    def __getitem__(self, idx):
        """Return example `idx` as a dict with the keys listed in `_FIELDS`."""
        batch = self.tokenized_batch
        return {field: batch[field][idx] for field in self._FIELDS}

In [None]:
train_dataset = Dataset(train_tokenized_batch)
val_dataset = Dataset(val_tokenized_batch)

In [None]:
len(train_dataset), len(val_dataset)

In [None]:
dl_train = DataLoader(train_dataset, batch_size=8, shuffle=False)
dl_val = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=len(ids_to_labels.values()))
_ = model.cuda()

In [None]:
optimizer = AdamW(model.parameters(), lr=5e-6)

In [None]:
epochs = 5
for _ in tqdm(range(epochs), desc="Epoch"):
    # from_pretrained() hands the model back in evaluation mode, so training
    # mode (dropout active) has to be switched on explicitly for every epoch.
    model.train()
    tr_loss, nb_tr_steps = 0, 0

    for batch in dl_train:
        output = model(
            input_ids=batch['input_ids'].cuda(),
            attention_mask=batch['attention_mask'].cuda(),
            labels=batch['labels'].cuda(),
        )

        loss = output.loss
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        nb_tr_steps += 1
        tr_loss += loss.item()

    # max(..., 1) avoids a ZeroDivisionError when the loader yields no batch.
    print(f"Train Loss: {tr_loss / max(nb_tr_steps, 1)}")

    # Validation loss is measured with dropout disabled and without gradients.
    model.eval()
    eval_loss, nb_eval_steps = 0, 0
    with torch.no_grad():
        for batch in dl_val:
            output = model(
                input_ids=batch['input_ids'].cuda(),
                attention_mask=batch['attention_mask'].cuda(),
                labels=batch['labels'].cuda(),
            )

            eval_loss += output.loss.item()
            nb_eval_steps += 1

    print(f"Eval loss: {eval_loss / max(nb_eval_steps, 1)}")

In [None]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=20,              # total # of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    prediction_loss_only=True,
    logging_dir='./logs',            # directory for storing logs
    seed=42,
    fp16=True,
    save_total_limit=1,
    gradient_accumulation_steps=2,
    load_best_model_at_end=True,
    learning_rate=5e-6, 
)

In [None]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

In [None]:
trainer.train()

In [None]:
best_model = trainer.model

In [None]:
dl_val = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [None]:
predictions = []
true_labels = []
for batch in tqdm(dl_val):
    
    with torch.no_grad():
        output = best_model(input_ids=batch['input_ids'].cuda(), attention_mask=batch['attention_mask'].cuda())
    
    logits = output.logits  
    val_batch_preds = torch.argmax(output.logits, axis=1).cpu().numpy()
    predictions.extend(val_batch_preds)
    true_labels.extend(batch['labels'])

In [None]:
print(classification_report(true_labels, predictions, target_names=labels))

In [None]:
trainer.evaluate()

In [None]:
X_test = test_df['text'].tolist()

In [None]:
test_tokenized_batch = tokenizer(X_test, return_token_type_ids=False, truncation=True, max_length=512, padding=True, return_tensors='pt')

In [None]:
class PredictDataset(torch.utils.data.Dataset):
    """Map-style dataset over an already tokenized batch without labels."""

    # Keys copied from the tokenized batch into every returned example.
    _FIELDS = ('input_ids', 'attention_mask')

    def __init__(self, data):
        # `data` must support data[key][idx] for every key in _FIELDS.
        self.tokenized_batch = data

    def __len__(self):
        # One example per entry of `input_ids`.
        return len(self.tokenized_batch['input_ids'])

    def __getitem__(self, idx):
        """Return example `idx` as a dict with the keys listed in `_FIELDS`."""
        batch = self.tokenized_batch
        return {field: batch[field][idx] for field in self._FIELDS}

In [None]:
predict_dataset = PredictDataset(test_tokenized_batch)

In [None]:
dl = DataLoader(predict_dataset, batch_size=64, shuffle=False)

In [None]:
predictions = []
for batch in tqdm(dl):
    
    with torch.no_grad():
        output = best_model(input_ids=batch['input_ids'].cuda(), attention_mask=batch['attention_mask'].cuda())
    
    logits = output.logits  
    val_batch_preds = torch.argmax(output.logits, axis=1).cpu().numpy()
    predictions.extend(val_batch_preds)

In [None]:
sample_submission['label'] = predictions

In [None]:
sample_submission['label'] = sample_submission['label'].map(ids_to_labels)

In [None]:
sample_submission['label'].value_counts()

In [None]:
sample_submission.to_csv('roberta_submission.csv', index=None, header=True)