In [None]:
import pandas as pd
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, BertForSequenceClassification, BertTokenizerFast
import torch
from torch.utils.data import Dataset
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
from transformers import pipeline

In [None]:
train_path = '../dataset/train.csv'

In [None]:
train = pd.read_csv(train_path, delimiter=',',escapechar='\\',header=0,on_bad_lines='skip', encoding='utf-8')

In [None]:
threshold = 10
value_counts = train['target'].value_counts()
train = train[train['target'].isin(value_counts[value_counts >= threshold].index)]

In [None]:
train_1 = train.groupby('target').apply(lambda x: x.sample(16000)).reset_index(drop=True)

In [None]:
train_1 = train_1.sample(frac=1)

In [None]:
import re

In [None]:
def remove_patterns(text):
    """Strip URLs, markdown links, @mentions and punctuation from ``text``.

    Args:
        text: The raw text. ``None`` and NaN (what pandas yields for an empty
            CSV cell) are treated as empty text; any other non-string value is
            converted with ``str`` before cleaning.

    Returns:
        The cleaned string with leading/trailing whitespace removed. Whitespace
        inside the text is left untouched, so removed tokens may leave double
        spaces behind.
    """
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Order matters: URLs go first, then markdown links, then mentions, and
    # finally every remaining character that is neither a word char nor space.
    for pattern in (
        r'http[s]?://\S+',
        r'\[.*?\]\(.*?\)',
        r'@\w+',
        r'[^\w\s]',
    ):
        text = re.sub(pattern, '', text)
    return text.strip()

In [None]:
train_1['text'] = train_1['text'].apply(remove_patterns)

In [None]:
train_1.head()

In [None]:
train_1.shape

In [None]:
lables = train['target'].unique().tolist()
labeles = [s.strip() for s in lables]

In [None]:
NUM_LABELS = len(lables)

id2labels={id:label for id, label in enumerate(lables)}
labels2id={label:id for id, label in enumerate(lables)}

In [None]:
id2labels

In [None]:
train_1['labels']= train_1.target.map(lambda x: labels2id[x.strip()])

In [None]:
# Both the tokenizer and the model weights start from the same checkpoint.
base_checkpoint = "textattack/bert-base-uncased-yelp-polarity"
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
model = BertForSequenceClassification.from_pretrained(
    base_checkpoint,
    num_labels=NUM_LABELS,
    # Lets loading proceed when checkpoint weights do not match the requested
    # shape (here: a classification head sized for NUM_LABELS classes).
    ignore_mismatched_sizes=True,
)

In [None]:
# Transformers reads and saves label names from the model config
# (`id2label` / `label2id`); `predict` further down also reads
# `model.config.id2label`. Setting them here makes the saved model carry
# the real class names.
model.config.id2label = id2labels
model.config.label2id = labels2id
# Original ad-hoc attributes, kept in case other cells still read them.
model.id2labels = id2labels
model.labels2id = labels2id

In [None]:
import torch
gpu = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(gpu)

In [None]:
torch.cuda.set_per_process_memory_fraction(0.8)

In [None]:
model.to(gpu)

In [None]:
train_1.shape

In [None]:
# Shuffle every row (frac=1 returns all rows in random order).
# No random_state is passed, so the row order differs from run to run.
# NOTE(review): the second shuffle looks redundant after the first — confirm
# it is not intentional before removing it.
train_1 = train_1.sample(frac=1)
train_1 = train_1.sample(frac=1)

In [None]:
# Training split: from the first 300000 shuffled rows, take 10000 rows per class.
# NOTE(review): x.sample(10000) needs at least 10000 rows of every class inside
# this slice — verify against the class counts.
train_3 = train_1[:300000].groupby('target').apply(lambda x: x.sample(10000)).reset_index(drop=True)

In [None]:
X_train = pd.DataFrame(train_3[['text']])
Y_train = pd.DataFrame(train_3['labels'])

In [None]:
# Validation split: from rows 300000-349999, take 200 rows per class.
train_2 = train_1[300000:350000].groupby('target').apply(lambda x: x.sample(200)).reset_index(drop=True)

In [None]:
X_valid = pd.DataFrame(train_2[['text']])
Y_valid = pd.DataFrame(train_2['labels'])

In [None]:
# Test split: every row from position 390000 onwards, without per-class sampling.
# NOTE(review): rows 350000-389999 are not used by any of the three splits —
# confirm this gap is intended.
X_test = pd.DataFrame(train_1['text'][390000:])
Y_test = pd.DataFrame(train_1['labels'][390000:])

In [None]:
# Convert the text frames to plain Python lists of strings for the tokenizer.
X_train = X_train['text'].tolist()
X_valid = X_valid['text'].tolist()
X_test = X_test['text'].tolist()

In [None]:
# Convert the label frames to plain Python lists of integer class ids.
Y_train = Y_train['labels'].tolist()
Y_valid = Y_valid['labels'].tolist()
Y_test = Y_test['labels'].tolist()

In [None]:
# Same tokenizer settings for all three splits: truncate over-long texts and
# pad each split's texts to a common length.
encode_options = {"truncation": True, "padding": True}
train_encodings = tokenizer(X_train, **encode_options)
val_encodings = tokenizer(X_valid, **encode_options)
test_encodings = tokenizer(X_test, **encode_options)

In [None]:
class DataLoader(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Despite its name this is a ``torch.utils.data.Dataset`` subclass, not a
    batch loader: indexing returns one example as a dict of tensors.
    """

    def __init__(self, encoding, labels):
        # `encoding` maps a field name to a per-example sequence;
        # `labels` holds one label per example.
        self.encoding = encoding
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label list."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as tensors, with its label under 'labels'."""
        item = {}
        for key, val in self.encoding.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item


# Wrap each split's encodings and labels in a dataset the Trainer can index.
train_dataloader, val_dataloader, test_dataloader = (
    DataLoader(encodings, labels)
    for encodings, labels in (
        (train_encodings, Y_train),
        (val_encodings, Y_valid),
        (test_encodings, Y_test),
    )
)

In [None]:
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        Dict with the keys 'Accuracy', 'F1', 'Precision' and 'Recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Tuple layout is (precision, recall, f1, support).
    scores = precision_recall_fscore_support(y_true, y_pred, average='macro')
    metrics = {'Accuracy': accuracy_score(y_true, y_pred)}
    metrics['F1'] = scores[2]
    metrics['Precision'] = scores[0]
    metrics['Recall'] = scores[1]
    return metrics


In [None]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    # NOTE(review): both paths are absolute (filesystem root) — confirm this
    # is intended rather than './results' and './logs'.
    output_dir='/results',
    logging_dir='/logs',

    # What to run.
    do_train=True,
    do_eval=True,

    # Optimisation schedule.
    num_train_epochs=2,
    learning_rate=0.00006,
    weight_decay=0.0001,
    warmup_steps=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,

    # Log and evaluate every 500 steps; checkpoints are step-based as well,
    # and the best checkpoint is reloaded once training finishes.
    logging_strategy='steps',
    logging_steps=500,
    evaluation_strategy='steps',
    eval_steps=500,
    save_strategy='steps',
    load_best_model_at_end=True,
)

In [None]:
trainer = Trainer(
    model = model,

    args=training_args,
    train_dataset=train_dataloader,
    eval_dataset=val_dataloader,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [None]:
# Evaluate the trained model on each split and show the first five metric
# columns, one row per split.
eval_splits = {"train": train_dataloader, "val": val_dataloader, "test": test_dataloader}
q = [trainer.evaluate(eval_dataset=split) for split in eval_splits.values()]
pd.DataFrame(q, index=list(eval_splits)).iloc[:, :5]

In [None]:
def predict(text):
    """Classify ``text`` with the notebook-level ``model`` and ``tokenizer``.

    Args:
        text: The text to classify. Pass a single string: the predicted index
            is the argmax over the whole probability tensor, so it is only
            meaningful when exactly one example is given.

    Returns:
        Tuple ``(probs, pred_label_idx, pred_label)``: the softmax
        probabilities, the index of the largest probability as a 0-dim
        tensor, and the name ``model.config.id2label`` gives for that index.
    """
    texts = tokenizer(text, padding=True,truncation=True, max_length=512, return_tensors="pt")
    # The tokenizer returns CPU tensors; move them to the model's device so
    # the forward pass does not fail once the model has been moved to the GPU.
    texts = texts.to(model.device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**texts)

    # outputs[0] holds the logits; softmax over the class axis.
    probs = outputs[0].softmax(1)
    pred_label_idx = probs.argmax()

    pred_label = model.config.id2label[pred_label_idx.item()]

    return probs, pred_label_idx, pred_label

In [None]:
# Persist the fine-tuned weights and the tokenizer side by side so both can
# be reloaded from the same directory.
model_path = "model/1model"
trainer.save_model(output_dir=model_path)
tokenizer.save_pretrained(save_directory=model_path)

## Predicting

In [None]:
# Cap this process at 90% of the GPU's memory. The call initialises CUDA and
# raises on machines without it, so it is skipped when CUDA is unavailable.
if torch.cuda.is_available():
    torch.cuda.set_per_process_memory_fraction(0.9)

In [None]:
# Reload the fine-tuned model and tokenizer from disk.
model_path = "model/1model"

model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizerFast.from_pretrained(model_path)
# The "feature-extraction" pipeline returns the model's raw outputs for each
# text instead of decoded labels.
nlp = pipeline(
    "feature-extraction",
    model=model,
    tokenizer=tokenizer,
    device=gpu,
)

In [None]:
test_path = '../dataset/test.csv'

In [None]:
test = pd.read_csv(test_path)

In [None]:
test['text'] = test['text'].apply(remove_patterns)

In [None]:
# Drop the 'Word Count' column. Re-assigning (instead of inplace=True) with
# errors='ignore' keeps this cell safe to re-run: a second run would otherwise
# raise KeyError because the column is already gone.
test = test.drop(columns=['Word Count'], errors='ignore')

In [None]:
A = pd.DataFrame(test['text'])
B = pd.DataFrame(test['Index'])

In [None]:
A = A['text'].tolist()

In [None]:
# Run the pipeline over a copy of the full list of test texts.
values = nlp(list(A), truncation=True, padding=True)

In [None]:
import torch

# Stack the pipeline outputs into one tensor so each example can be indexed.
values_tensor = torch.tensor(values)

# For every input text, find the position of the largest value in its output
# (.item() turns the 0-dim tensor into a Python int) and map it to a class name.
pred = [
    id2labels[values_tensor[row].argmax().item()]
    for row in range(len(A))
]

In [None]:
pred_df = pd.DataFrame(pred, columns=["target"])

In [None]:
pred_df['target'] = pred_df['target'].apply(str)

In [None]:
pred_df.shape

In [None]:
pred_df.head()

In [None]:
pred_df['target'].value_counts()

In [None]:
B.shape

In [None]:
pred_df['Index'] = B['Index']

In [None]:
pred_df.to_csv('submission.csv', index=False)