In [1]:
import ast
import os
import re
import shutil

import emoji
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import wandb
from sklearn.metrics import balanced_accuracy_score
from torch.utils.data import Dataset
from torch.utils.data import WeightedRandomSampler
from transformers import AutoTokenizer, BertForSequenceClassification

In [2]:
## Load the training dataset and preview its first rows
train_csv_path = '/kaggle/input/custom/train_data.csv'
df = pd.read_csv(train_csv_path)
df.head(55)

Unnamed: 0,Text,Final Labels,Id
0,@CorinaTomescu05 Încă nu ...dar am trecut prin...,non-offensive,0
1,@emosaphicbitch sau rosu ca mine,non-offensive,1
2,@DanaMinodora Ce frumoasa ești.. Arăți foarte ...,non-offensive,2
3,Din fericire în extaz!Ai dus covoareleeee?? Pu...,offensive,3
4,cand aveam vreo 5 ani credeam ca romana e sing...,non-offensive,4
5,@ALETTAOCEANXXXX Ce prințesă frumoasă și sexxx...,direct,5
6,@sandman_II @i0n1ca Ionica din clipa când te-a...,direct,6
7,"Coronavirus România, 29 iunie 2021. 294 de dec...",non-offensive,7
8,@boobingheluvr te rog eu mai ieși din casa,offensive,8
9,@alexbadea9 Corpul meu e pe modul toamnă. Mă i...,non-offensive,9


In [3]:
def text_without_emojis():
    """Return the ``Text`` column of the global ``df`` with all emojis removed.

    Every emoji reported by ``emoji.emoji_list`` is deleted from its line.

    Returns:
        list[str]: one cleaned string per row of ``df``, in row order.
    """
    text_noemoji = []
    # Iterate over the values directly so the function does not depend on the
    # DataFrame having a default 0..n-1 index.
    for line in df['Text']:
        for emoji_dict in emoji.emoji_list(line):
            emoji_str = emoji_dict.get('emoji')
            if emoji_str:
                # Plain string replacement: the emoji must be matched literally.
                # Used as a regex pattern, keycap emojis starting with '*' or '#'
                # would be misread as regex syntax.
                line = line.replace(emoji_str, '')
        text_noemoji.append(line)
    return text_noemoji

In [4]:
def text_without_mentions():
    """Return the ``Text`` column of the global ``df`` with @mentions stripped.

    A mention is an ``@`` followed by one or more non-whitespace characters.
    Surrounding whitespace is left untouched.

    Returns:
        list[str]: one cleaned string per row of ``df``, in row order.
    """
    mention_pattern = re.compile(r'@\S+')
    cleaned = []
    for tweet in df['Text']:
        cleaned.append(mention_pattern.sub('', tweet))
    return cleaned

In [5]:
def weird_characters():
    """Collect every character in ``df['Text']`` outside the accepted alphabet.

    Accepted characters are ASCII letters and digits, the Romanian diacritics
    listed in the pattern, whitespace, underscore and hyphen.

    Returns:
        set[str]: the distinct single characters that are not accepted.
    """
    accepted = re.compile(r'[a-zA-Z0-9șțîâăȘȚÂĂÎ\s_-]+')
    chars = set()
    for line in df['Text']:
        for character in line:
            if accepted.search(character) is None:
                chars.add(character)
    return chars

In [6]:
def check_with_dictionary(file_name):
    """Apply a substring replacement map to every line of ``df['Text']``.

    The file must contain a Python dict literal mapping substrings to their
    replacements; it is parsed with ``ast.literal_eval``.

    For each entry the replacement is repeated until the key no longer occurs in
    the line, so occurrences created by an earlier replacement are handled too.
    Entries that cannot reach such a fixed point are treated specially:

    * an empty key, or a key equal to its replacement, is skipped;
    * a key contained in its own replacement (e.g. ``'x' -> 'xx'``) is applied in
      a single pass, because repeating it would never terminate.

    Args:
        file_name: path of the UTF-8 text file holding the dict literal.

    Returns:
        list[str]: one cleaned string per row of ``df``, in row order.
    """
    # Read and parse first so the file is closed before the processing starts.
    with open(file_name, "r", encoding='UTF-8') as data:
        dictionary = ast.literal_eval(data.read())

    text_cleaned = []
    for line in df['Text']:
        for key, replacement in dictionary.items():
            if not key or key == replacement:
                continue
            if key in replacement:
                # Repeating this replacement would loop forever.
                line = line.replace(key, replacement)
                continue
            while key in line:
                line = line.replace(key, replacement)
        text_cleaned.append(line)
    return text_cleaned

In [7]:
def clean_non_ascii():
    """Drop tokens that are neither ASCII nor matched by the keep pattern.

    Each line of ``df['Text']`` is tokenized with ``word_tokenize``; the kept
    tokens are joined back with single spaces.

    Returns:
        list[str]: one cleaned string per row of ``df``, in row order.
    """
    # NOTE(review): `word_tokenize` is not imported anywhere in the visible part
    # of the notebook (presumably nltk.tokenize.word_tokenize) - confirm before
    # enabling this step.
    # NOTE(review): inside a character class `/S` matches a literal '/' and 'S';
    # `\S` may have been intended - confirm.
    keep_pattern = re.compile(r'[/SșțîâăȘȚÂĂÎ/S]')

    def is_kept(token):
        return token.isascii() or keep_pattern.search(token)

    return [
        " ".join(filter(is_kept, word_tokenize(line)))
        for line in df['Text']
    ]

In [8]:
def clean_non_alphanumerical():
    """Remove every character reported by ``weird_characters()`` from the text.

    ``weird_characters()`` builds its set by iterating over single characters,
    so deleting its elements amounts to a per-character filter.

    Returns:
        list[str]: one cleaned string per row of ``df``, in row order.
    """
    unwanted = weird_characters()
    return [
        ''.join(symbol for symbol in line if symbol not in unwanted)
        for line in df['Text']
    ]

In [9]:
def lowercase_text():
    """Return ``df['Text']`` lowercased, with underscores turned into spaces.

    ``str.lower`` is Unicode-aware and lowercases the Romanian diacritics as
    well, so every line goes through the same conversion. The earlier version
    special-cased lines containing an uppercase diacritic and left all their
    other capital letters untouched.

    Returns:
        list[str]: one converted string per row of ``df``, in row order.
    """
    return [line.replace('_', ' ').lower() for line in df['Text']]

In [10]:
# Apply the cleaning steps in order. Every helper reads the module-level `df`,
# so each assignment below feeds its result into the next step.
df['Text'] = text_without_emojis()
df['Text'] = text_without_mentions()
# Optional cleaning steps, currently disabled:
# df['Text'] = check_with_dictionary('character map/full_replace.txt')
# df['Text'] = clean_non_ascii()
# df['Text'] = clean_non_alphanumerical()
# df['Text'] = lowercase_text()

# Persist the cleaned frame without writing the DataFrame index as a column.
df.to_csv('/kaggle/working/train_data_noemoji_nomentions.csv', index=False)

In [11]:
## Dataset check: preview the first rows of the cleaned frame
# Alternative inputs (previously saved cleaned files), currently disabled:
#df = pd.read_csv('dataset/train_data_noemoji.csv')
#df = pd.read_csv('dataset/train_data_noemoji_nomentions.csv')
df.head(10)

Unnamed: 0,Text,Final Labels,Id
0,Încă nu ...dar am trecut prin...28.... să fii...,non-offensive,0
1,sau rosu ca mine,non-offensive,1
2,Ce frumoasa ești.. Arăți foarte bine frumoaso,non-offensive,2
3,Din fericire în extaz!Ai dus covoareleeee?? Pu...,offensive,3
4,cand aveam vreo 5 ani credeam ca romana e sing...,non-offensive,4
5,Ce prințesă frumoasă și sexxxxyyy ești,direct,5
6,Ionica din clipa când te-am văzut pentru pri...,direct,6
7,"Coronavirus România, 29 iunie 2021. 294 de dec...",non-offensive,7
8,te rog eu mai ieși din casa,offensive,8
9,Corpul meu e pe modul toamnă. Mă ia somnul de...,non-offensive,9


In [12]:
class SexismDetectionModel(nn.Module):
    """Thin wrapper around a pretrained ``BertForSequenceClassification``.

    Args:
        model_name: name or path handed to ``from_pretrained``.
        num_classes: number of output labels of the classification head.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        # return_dict=False makes the wrapped model return a plain tuple.
        self.bert_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_classes,
                                                                        return_dict=False)

    def forward(self, x, attention_mask=None):
        """Run the wrapped model on a batch of token ids.

        Args:
            x: tensor of input token ids.
            attention_mask: optional mask forwarded to the wrapped model. When
                omitted, a mask is derived from ``x`` that is 0 on positions
                equal to the configured pad token id and 1 elsewhere, so the
                padding added by the dataset is not attended to.

        Returns:
            Whatever the wrapped model returns (a tuple, given ``return_dict=False``).
        """
        if attention_mask is None:
            pad_token_id = getattr(self.bert_model.config, 'pad_token_id', None)
            if pad_token_id is not None:
                attention_mask = (x != pad_token_id).long()
        return self.bert_model(x, attention_mask=attention_mask)

In [13]:
class SexismDataset(Dataset):
    """Dataset of tokenized texts (and, for training data, label ids) read from a CSV.

    The CSV must have a ``Text`` column and, unless ``test`` is true, a
    ``Final Labels`` column whose values are keys of ``label_mapping``.

    Args:
        model_name: name or path handed to ``AutoTokenizer.from_pretrained``.
        file: path of the CSV file.
        test: when true, no labels are read and items are token ids only.
        max_length: length every text is padded/truncated to (default 64).
    """

    def __init__(self, model_name,  file, test=False, max_length=64):
        data = pd.read_csv(file)
        # Empty CSV cells are read back as NaN (a float), which the tokenizer
        # cannot encode; treat them as empty strings instead.
        self.text = data["Text"].fillna("").astype(str).tolist()
        self.label_mapping = {
            'direct': 0,
            'descriptive': 1,
            'reporting': 2,
            'offensive': 3,
            'non-offensive': 4
        }
        self.padding = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.test = test
        # Not read anywhere in the visible code; kept so existing attribute
        # access keeps working.
        self.maximum = 0
        self.count = 0
        if not test:
            self.labels = data["Final Labels"].tolist()

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize one text.

        Returns:
            ``(input_ids, label_id)`` for training data, or ``input_ids`` alone
            when the dataset was built with ``test=True``. ``input_ids`` is the
            encoder output flattened to one dimension.
        """
        input_ids = self.tokenizer.encode(self.text[item], add_special_tokens=True, return_tensors="pt",
                                          max_length=self.padding,
                                          padding='max_length',
                                          truncation=True).view(-1)
        if not self.test:
            return input_ids, self.label_mapping[self.labels[item]]
        return input_ids

In [14]:
def save_ckp(state, is_best, checkpoint_dir, best_model_dir, num=None):
    """Write a checkpoint and optionally copy it as the best model.

    Args:
        state: object serialized with ``torch.save``.
        is_best: when true, the written file is also copied to ``best_model_dir``.
        checkpoint_dir: directory receiving ``checkpoint{num}.pt``.
        best_model_dir: directory receiving ``best_model{num}.pt``.
        num: optional suffix appended to both file names; omitted when ``None``.
    """
    suffix = '' if num is None else f'{num}'
    f_path = checkpoint_dir + '/checkpoint' + suffix + '.pt'
    torch.save(state, f_path)
    if not is_best:
        return
    best_fpath = best_model_dir + '/best_model' + suffix + '.pt'
    shutil.copyfile(f_path, best_fpath)


def load_ckp(checkpoint_fpath, model, optimizer, map_location='cpu'):
    """Restore model and optimizer state from a checkpoint file.

    The checkpoint must be a dict with the keys ``state_dict``, ``optimizer``,
    ``epoch`` and ``best_acc``.

    Args:
        checkpoint_fpath: path of the checkpoint file.
        model: module whose ``load_state_dict`` receives ``state_dict``.
        optimizer: optimizer whose ``load_state_dict`` receives ``optimizer``.
        map_location: forwarded to ``torch.load``. Defaults to ``'cpu'`` so a
            checkpoint written on a GPU machine can still be read on a host
            without CUDA; ``load_state_dict`` then copies the values into the
            existing parameters.

    Returns:
        tuple: ``(model, optimizer, epoch, best_acc)``.
    """
    checkpoint = torch.load(checkpoint_fpath, map_location=map_location)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return model, optimizer, checkpoint['epoch'], checkpoint['best_acc']


def get_weighted_sampler(dataset):
    """Build a sampler that draws every class with equal total probability.

    Each item is weighted by the inverse of its class frequency and sampled with
    replacement; one epoch draws ``len(dataset)`` items.

    Args:
        dataset: indexable object with ``__len__`` whose items are
            ``(input, label)`` pairs with hashable labels.

    Returns:
        WeightedRandomSampler: sampler over the indices of ``dataset``.
    """
    # Read every item exactly once: fetching an item may be expensive
    # (SexismDataset tokenizes on access).
    labels = [dataset[index][1] for index in range(len(dataset))]

    class_counts = {}
    for label in labels:
        class_counts[label] = class_counts.get(label, 0) + 1

    weights = [1 / class_counts[label] for label in labels]
    oversample_sampler = WeightedRandomSampler(weights, len(weights), replacement=True)

    return oversample_sampler


class ModelTrainer:
    """Train/validate a classifier with mixed precision, wandb logging and checkpoints.

    The dataset is split once, with a seeded generator, into disjoint training
    and validation subsets. A checkpoint is written to ``checkpoint/`` after
    every epoch and copied to ``best_model/`` whenever the validation balanced
    accuracy improves.

    Args:
        device: device the model and the batches are moved to.
        model: module to train; called as ``model(inputs)``.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer over the parameters of ``model``.
        dataset: indexable dataset of ``(inputs, label)`` items.
        validation_split: fraction of the dataset held out for validation.
        batch_size: batch size of both loaders.
        shuffle_dataset: shuffle the training data each epoch (ignored when
            ``weighted_sampler`` is true, since the sampler already randomizes).
        random_seed: seed of the train/validation split, so the same split is
            obtained again when resuming.
        resume_from_checkpoint: restore state from ``checkpoint/checkpoint.pt``.
        project_name: wandb project name.
        architecture: free-text value logged to the wandb config.
        num_epochs: epoch index at which training stops.
        initial_lr: value logged to the wandb config (not applied to the optimizer).
        num_classes: stored on the instance; not used by this class.
        weighted_sampler: oversample with ``get_weighted_sampler`` when true.
    """

    def __init__(self, device, model, criterion, optimizer, dataset, validation_split=.2,
                 batch_size=32, shuffle_dataset=True,
                 random_seed=42, resume_from_checkpoint=False, project_name="Training", architecture="Unknown",
                 num_epochs=25, initial_lr=None, num_classes=None, weighted_sampler=False):

        os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
        os.environ['TORCH_USE_CUDA_DSA'] = '1'

        # Move the model before restoring a checkpoint: the optimizer state must be
        # loaded while the parameters already live on the training device,
        # otherwise the restored state ends up on a different device than the
        # parameters it belongs to.
        model.to(device)

        if resume_from_checkpoint:
            self.model, self.optimizer, self.start_epoch, self.best_acc = load_ckp(
                'checkpoint/checkpoint.pt',
                model, optimizer)
        else:
            self.model = model
            self.optimizer = optimizer
            self.start_epoch = 0
            self.best_acc = 0.0

        self.criterion = criterion

        self.dataset = dataset
        self.shuffle_dataset = shuffle_dataset
        self.random_seed = random_seed
        self.dataset_size = len(dataset)
        self.validation_split = validation_split
        self.device = device
        self.batch_size = batch_size
        self.lr_scheduler = None
        self.epochs = num_epochs
        self.nr_batch_report = 5
        self.num_classes = num_classes

        # Disjoint, reproducible split: validation items are never trained on.
        test_size = int(self.validation_split * len(dataset))
        train_size = len(dataset) - test_size
        split_generator = torch.Generator().manual_seed(random_seed)
        train_dataset, test_dataset = torch.utils.data.random_split(
            dataset, [train_size, test_size], generator=split_generator)

        if not weighted_sampler:
            train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
                                                       shuffle=shuffle_dataset,
                                                       num_workers=0, drop_last=True)
        else:
            weighted_sampler = get_weighted_sampler(train_dataset)
            train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
                                                       sampler=weighted_sampler,
                                                       num_workers=0, drop_last=True)

        # Keep the last partial batch so every validation item is scored.
        test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size,
                                                  num_workers=0, drop_last=False)

        self.dataloaders = {'train': train_loader,
                            'val': test_loader}

        self.dataset_sizes = {"train": train_size, "val": test_size}

        torch.backends.cudnn.benchmark = True

        torch.backends.cuda.matmul.allow_tf32 = True

        # The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
        torch.backends.cudnn.allow_tf32 = True

        wandb.init(
            # set the wandb project where this run will be logged
            project=project_name,

            # track hyper parameters and run metadata
            config={
                "architecture": architecture,
                "epochs": num_epochs,
                "nr_batch_report": self.nr_batch_report,
                "initial_lr": initial_lr if initial_lr else "Unknown",
                "batch_size": self.batch_size
            }
        )

        wandb.log({"train_size": train_size, "test_size": test_size})

    def set_lr_scheduler(self, scheduler):
        """Register a learning-rate scheduler stepped during training."""
        self.lr_scheduler = scheduler

    def train_model(self):
        """Run the remaining epochs and return the trained model.

        Each epoch runs a training phase followed by a validation phase, logs
        balanced accuracy for both, and writes one checkpoint.

        Returns:
            The model passed to the constructor, after training.
        """
        scaler = torch.cuda.amp.GradScaler()

        for epoch in range(self.start_epoch, self.epochs):
            wandb.log({
                "epoch": epoch
            })
            is_best = False
            for phase in ['train', 'val']:
                torch.cuda.empty_cache()
                is_train = phase == 'train'
                self.model.train(is_train)  # train(False) is equivalent to eval()

                running_loss = 0.0
                seen_samples = 0
                y_true = []
                y_pred = []

                for batch_idx, (inputs, labels) in enumerate(self.dataloaders[phase]):
                    inputs = inputs.to(self.device)
                    labels = torch.as_tensor(labels).to(self.device)

                    # zero the parameter gradients
                    self.optimizer.zero_grad()

                    # forward
                    with torch.cuda.amp.autocast():
                        with torch.set_grad_enabled(is_train):
                            outputs = self.model(inputs)
                        if isinstance(outputs, tuple):
                            outputs = outputs[0]
                        loss = self.criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if is_train:
                        scaler.scale(loss).backward()
                        scaler.step(self.optimizer)
                        scaler.update()

                        if batch_idx % self.nr_batch_report == 0:
                            wandb.log({"loss": loss.item()})
                            # NOTE(review): the scheduler is stepped only on reporting
                            # batches, i.e. once every `nr_batch_report` optimizer
                            # steps - confirm this coupling is intended.
                            if self.lr_scheduler:
                                wandb.log({"lr": self.lr_scheduler.get_last_lr()[-1]})
                                self.lr_scheduler.step()

                    # statistics
                    running_loss += loss.item() * inputs.size(0)
                    seen_samples += inputs.size(0)
                    _, preds = torch.max(outputs, 1)
                    y_pred.extend(preds.tolist())
                    y_true.extend(labels.tolist())

                if not y_true:
                    # The loader produced no batch (e.g. empty validation split).
                    continue

                # Plain float so the value stored in the checkpoint is not a numpy scalar.
                epoch_acc = float(balanced_accuracy_score(y_true, y_pred))
                if is_train:
                    # Average over the samples actually processed: the training
                    # loader drops its last partial batch.
                    epoch_loss = running_loss / seen_samples
                    wandb.log({
                        "epoch_acc_train": epoch_acc,
                        "epoch_loss_train": epoch_loss,
                    })
                else:
                    wandb.log({
                        "epoch_acc_val": epoch_acc,
                    })
                    if epoch_acc > self.best_acc:
                        is_best = True
                        self.best_acc = epoch_acc
                        wandb.log({
                            "best_acc_val": self.best_acc
                        })

            # One checkpoint per completed epoch, built after best_acc is updated so
            # the stored value matches the stored weights.
            checkpoint = {
                'epoch': epoch + 1,
                'state_dict': self.model.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_acc': self.best_acc
            }
            save_ckp(checkpoint, is_best, "checkpoint", 'best_model')
        return self.model

In [15]:
class ModelTrainer2:
    """Train several independent classifiers on one shared train/validation split.

    The constructor mirrors ``ModelTrainer``: it splits the dataset once, with a
    seeded generator, into disjoint training and validation subsets and builds
    the loaders. ``train_models`` then trains fresh models one after another and
    writes ``checkpoint/checkpoint{i}.pt`` and ``best_model/best_model{i}.pt``
    for each of them.

    Args:
        device: device the models and the batches are moved to.
        model: model stored on the instance (``train_models`` builds its own models).
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer stored on the instance (``train_models`` builds its own).
        dataset: indexable dataset of ``(inputs, label)`` items.
        validation_split: fraction of the dataset held out for validation.
        batch_size: batch size of both loaders.
        shuffle_dataset: shuffle the training data each epoch (ignored when
            ``weighted_sampler`` is true).
        random_seed: seed of the train/validation split.
        resume_from_checkpoint: restore state from ``checkpoint/checkpoint.pt``.
        project_name: wandb project name.
        architecture: free-text value logged to the wandb config.
        num_epochs: stored on the instance and logged to wandb.
        initial_lr: value logged to the wandb config (not applied to any optimizer).
        num_classes: number of labels of the models built by ``train_models``
            (5 when left as ``None``).
        weighted_sampler: oversample with ``get_weighted_sampler`` when true.
    """

    def __init__(self, device, model, criterion, optimizer, dataset, validation_split=.2,
                 batch_size=32, shuffle_dataset=True,
                 random_seed=42, resume_from_checkpoint=False, project_name="Training", architecture="Unknown",
                 num_epochs=25, initial_lr=None, num_classes=None, weighted_sampler=False):

        os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
        os.environ['TORCH_USE_CUDA_DSA'] = '1'

        # Move the model before restoring a checkpoint so the optimizer state is
        # loaded while the parameters already live on the target device.
        model.to(device)

        if resume_from_checkpoint:
            self.model, self.optimizer, self.start_epoch, self.best_acc = load_ckp(
                'checkpoint/checkpoint.pt',
                model, optimizer)
        else:
            self.model = model
            self.optimizer = optimizer
            self.start_epoch = 0
            self.best_acc = 0.0

        self.criterion = criterion

        self.dataset = dataset
        self.shuffle_dataset = shuffle_dataset
        self.random_seed = random_seed
        self.dataset_size = len(dataset)
        self.validation_split = validation_split
        self.device = device
        self.batch_size = batch_size
        self.lr_scheduler = None
        self.epochs = num_epochs
        self.nr_batch_report = 5
        self.num_classes = num_classes

        # Disjoint, reproducible split: validation items are never trained on.
        test_size = int(self.validation_split * len(dataset))
        train_size = len(dataset) - test_size
        split_generator = torch.Generator().manual_seed(random_seed)
        train_dataset, test_dataset = torch.utils.data.random_split(
            dataset, [train_size, test_size], generator=split_generator)

        if not weighted_sampler:
            train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
                                                       shuffle=shuffle_dataset,
                                                       num_workers=0, drop_last=True)
        else:
            self.weighted_sampler = get_weighted_sampler(train_dataset)
            train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
                                                       sampler=self.weighted_sampler,
                                                       num_workers=0, drop_last=True)

        # Keep the last partial batch so every validation item is scored.
        test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size,
                                                  num_workers=0, drop_last=False)

        self.dataloaders = {'train': train_loader,
                            'val': test_loader}

        self.dataset_sizes = {"train": train_size, "val": test_size}

        torch.backends.cudnn.benchmark = True

        torch.backends.cuda.matmul.allow_tf32 = True

        # The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
        torch.backends.cudnn.allow_tf32 = True

        wandb.init(
            # set the wandb project where this run will be logged
            project=project_name,

            # track hyper parameters and run metadata
            config={
                "architecture": architecture,
                "epochs": num_epochs,
                "nr_batch_report": self.nr_batch_report,
                "initial_lr": initial_lr if initial_lr else "Unknown",
                "batch_size": self.batch_size
            }
        )

        wandb.log({"train_size": train_size, "test_size": test_size})

    def set_lr_scheduler(self, scheduler):
        """Register a learning-rate scheduler stepped during training."""
        self.lr_scheduler = scheduler

    def train_models(self, num_models=10, epochs_per_model=2,
                     model_name="dumitrescustefan/bert-base-romanian-cased-v1", lr=0.00001):
        """Train ``num_models`` fresh models, each for ``epochs_per_model`` epochs.

        Every model gets its own Adam optimizer and its own best-accuracy
        tracking, so ``best_model{i}.pt`` always holds the best epoch of model
        ``i``. ``self.best_acc`` keeps the best validation accuracy seen across
        all models.

        Args:
            num_models: number of models to train.
            epochs_per_model: epochs run for each model.
            model_name: pretrained model name handed to ``SexismDetectionModel``.
            lr: learning rate of each model's Adam optimizer.
        """
        num_classes = self.num_classes if self.num_classes is not None else 5
        for i in range(num_models):
            scaler = torch.cuda.amp.GradScaler()

            model = SexismDetectionModel(model_name, num_classes=num_classes).to(self.device)
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
            model_best_acc = 0.0
            for epoch in range(epochs_per_model):
                wandb.log({
                    "epoch": epoch
                })
                is_best = False
                for phase in ['train', 'val']:
                    torch.cuda.empty_cache()
                    is_train = phase == 'train'
                    model.train(is_train)  # train(False) is equivalent to eval()

                    running_loss = 0.0
                    seen_samples = 0
                    y_true = []
                    y_pred = []

                    for batch_idx, (inputs, labels) in enumerate(self.dataloaders[phase]):
                        inputs = inputs.to(self.device)
                        labels = torch.as_tensor(labels).to(self.device)

                        # zero the parameter gradients
                        optimizer.zero_grad()

                        # forward
                        with torch.cuda.amp.autocast():
                            with torch.set_grad_enabled(is_train):
                                outputs = model(inputs)
                            if isinstance(outputs, tuple):
                                outputs = outputs[0]
                            loss = self.criterion(outputs, labels)

                        # backward + optimize only if in training phase
                        if is_train:
                            scaler.scale(loss).backward()
                            scaler.step(optimizer)
                            scaler.update()

                            if batch_idx % self.nr_batch_report == 0:
                                wandb.log({"loss": loss.item()})
                                # NOTE(review): the scheduler registered through
                                # set_lr_scheduler is created outside this method, so it
                                # is presumably not bound to the per-model optimizer
                                # built above - confirm before relying on it here.
                                if self.lr_scheduler:
                                    wandb.log({"lr": self.lr_scheduler.get_last_lr()[-1]})
                                    self.lr_scheduler.step()

                        # statistics
                        running_loss += loss.item() * inputs.size(0)
                        seen_samples += inputs.size(0)
                        _, preds = torch.max(outputs, 1)
                        y_pred.extend(preds.tolist())
                        y_true.extend(labels.tolist())

                    if not y_true:
                        # The loader produced no batch (e.g. empty validation split).
                        continue

                    # Plain float so the checkpointed value is not a numpy scalar.
                    epoch_acc = float(balanced_accuracy_score(y_true, y_pred))
                    if is_train:
                        # Average over the samples actually processed: the training
                        # loader drops its last partial batch.
                        epoch_loss = running_loss / seen_samples
                        wandb.log({
                            "epoch_acc_train": epoch_acc,
                            "epoch_loss_train": epoch_loss,
                        })
                    else:
                        wandb.log({
                            "epoch_acc_val": epoch_acc,
                        })
                        if epoch_acc > model_best_acc:
                            is_best = True
                            model_best_acc = epoch_acc
                        if epoch_acc > self.best_acc:
                            self.best_acc = epoch_acc
                            wandb.log({
                                "best_acc_val": self.best_acc
                            })

                # One checkpoint per completed epoch, built after the best accuracy
                # is updated so the stored value matches the stored weights.
                checkpoint = {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'best_acc': model_best_acc
                }
                save_ckp(checkpoint, is_best, "checkpoint", 'best_model', i)
            del model

In [None]:
# Training driver: builds the dataset, model and optimizer, then runs ModelTrainer.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()
torch.manual_seed(123)
model_name = "dumitrescustefan/bert-base-romanian-cased-v1"
# Single source of truth for the learning rate: the same value configures the
# optimizer and is reported to wandb.
learning_rate = 0.00001
dataset = SexismDataset(model_name, "/kaggle/input/custom/train_data_final.csv")
model = SexismDetectionModel(model_name, num_classes=5)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Output directories used by save_ckp.
os.makedirs("best_model", exist_ok=True)
os.makedirs("checkpoint", exist_ok=True)

# if the program was interrupted we will resume from the last saved model
print('Type 0 to exit, 1 to start fresh, 2 from checkpoint')
inp = input()
if inp == '0':
    exit(0)

trainer = ModelTrainer(device, model, criterion, optimizer, dataset,
                        resume_from_checkpoint=(inp == '2'), project_name="Sexism NLP Hackathon",
                        architecture="Bert-Romanian SD", num_epochs=3, batch_size=256,
                        initial_lr=learning_rate,
                        validation_split=0.2,
                        num_classes=5, weighted_sampler=True)
# Bind the result so the notebook does not print the whole model as cell output.
trained_model = trainer.train_model()

In [None]:
# Single-model inference: load the best checkpoint and write a submission file.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "dumitrescustefan/bert-base-romanian-cased-v1"
model = SexismDetectionModel(model_name, num_classes=5).to(device)
# The optimizer is only needed because load_ckp restores its state as well.
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)
model, optimizer, last_epoch, acc = load_ckp('best_model/best_model.pt', model, optimizer)
# Make inference mode explicit (dropout off) instead of relying on the state the
# model happens to be in after construction.
model.eval()
dataset = SexismDataset(model_name, '/kaggle/input/custom/test_data_final.csv', test=True)

dataloader = torch.utils.data.DataLoader(dataset, batch_size=256, shuffle=False, num_workers=0)

# Inverse of SexismDataset.label_mapping.
to_label = {
    0: 'direct',
    1: 'descriptive',
    2: 'reporting',
    3: 'offensive',
    4: 'non-offensive'
}

predictions = []
# Submission ids are the row positions of the test file.
ids = list(range(len(dataset)))
with torch.no_grad():
    for batch in dataloader:
        batch = batch.to(device)
        outputs = model(batch)
        if isinstance(outputs, tuple):
            outputs = outputs[0]
        _, preds = torch.max(outputs, 1)
        predictions.extend(preds.tolist())


pred = [to_label[pr] for pr in predictions]
output = pd.DataFrame({'Id': ids, 'Label': pred})
output.to_csv('submissionv_clean.csv', index=False)
print("Your submission was successfully saved!")

In [None]:
# Ensemble inference: majority vote over the predictions of several checkpoints.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "dumitrescustefan/bert-base-romanian-cased-v1"

# Inverse of SexismDataset.label_mapping.
to_label = {
    0: 'direct',
    1: 'descriptive',
    2: 'reporting',
    3: 'offensive',
    4: 'non-offensive'
}

dataset = SexismDataset(model_name, '/kaggle/input/custom/test_data_nomentions_sanitized.csv', test=True)

dataloader = torch.utils.data.DataLoader(dataset, batch_size=256, shuffle=False, num_workers=0)

# Submission ids are the row positions of the test file; defined up front so they
# exist even if a checkpoint fails to load.
ids = list(range(len(dataset)))

pred_all = []
# NOTE(review): only checkpoints 0-5 are read here - confirm this matches the
# number of models actually trained.
for i in range(6):
    model = SexismDetectionModel(model_name, num_classes=5).to(device)
    # The optimizer is only needed because load_ckp restores its state as well.
    optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)

    try:
        model, optimizer, last_epoch, acc = load_ckp(f'checkpoint/checkpoint{i}.pt', model, optimizer)
        model.eval()  # make inference mode explicit (dropout off)

        predictions = []
        with torch.no_grad():
            for inp in dataloader:
                inp = inp.to(device)
                outputs = model(inp)
                if isinstance(outputs, tuple):
                    outputs = outputs[0]
                _, preds = torch.max(outputs, 1)
                predictions.extend(preds.tolist())

        pred_all.append(predictions)
        print('true')
    except Exception as err:
        # Best effort: a missing or unreadable checkpoint is skipped, but reported,
        # and interrupts (KeyboardInterrupt) are no longer swallowed.
        print(f'Skipping checkpoint {i}: {err!r}')
    finally:
        del model

if not pred_all:
    raise RuntimeError('No ensemble checkpoint could be loaded; nothing to predict.')

# Rows are models, columns are test samples.
pred_all_np = np.array(pred_all)
final_pred = []
for col in range(pred_all_np.shape[1]):
    unique, counts = np.unique(pred_all_np[:, col], return_counts=True)
    # argmax returns the first maximum and np.unique sorts its values, so ties go
    # to the lowest class id.
    max_index = np.argmax(counts)
    max_freq_num = unique[max_index]
    final_pred.append(int(max_freq_num))

pred = [to_label[pr] for pr in final_pred]
output = pd.DataFrame({'Id': ids, 'Label': pred})
output.to_csv('submissionv13.csv', index=False)
print("Your submission was successfully saved!")


    