In [1]:
import os
import re
import string
import json
import emoji
import numpy as np
import pandas as pd
from sklearn import metrics
from bs4 import BeautifulSoup
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, AutoTokenizer, BertModel, BertConfig, AutoModel, AdamW
import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_columns", None)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_train = pd.read_csv("train.tsv", sep='\t', header=None, names=['Text', 'Class', 'ID'])
df_dev = pd.read_csv("dev.tsv", sep='\t', header=None, names=['Text', 'Class', 'ID'])

In [3]:
df_train['List of classes'] = df_train['Class'].apply(lambda x: x.split(','))
df_train['Len of classes'] = df_train['List of classes'].apply(lambda x: len(x))
df_dev['List of classes'] = df_dev['Class'].apply(lambda x: x.split(','))
df_dev['Len of classes'] = df_dev['List of classes'].apply(lambda x: len(x))

In [4]:
with open('ekman_mapping.json') as file:
    ekman_mapping = json.load(file)

In [5]:
emotion_file = open("emotions.txt", "r")
emotion_list = emotion_file.read()
emotion_list = emotion_list.split("\n")
print(emotion_list)

['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


In [6]:
def idx2class(idx_list):
    arr = []
    for i in idx_list:
        arr.append(emotion_list[int(i)])
    return arr

In [7]:
df_train['Emotions'] = df_train['List of classes'].apply(idx2class)
df_dev['Emotions'] = df_dev['List of classes'].apply(idx2class)

In [8]:
def EmotionMapping(emotion_list):
    map_list = []
    
    for i in emotion_list:
        if i in ekman_mapping['anger']:
            map_list.append('anger')
        if i in ekman_mapping['disgust']:
            map_list.append('disgust')
        if i in ekman_mapping['fear']:
            map_list.append('fear')
        if i in ekman_mapping['joy']:
            map_list.append('joy')
        if i in ekman_mapping['sadness']:
            map_list.append('sadness')
        if i in ekman_mapping['surprise']:
            map_list.append('surprise')
        if i == 'neutral':
            map_list.append('neutral')
            
    return map_list

In [9]:
df_train['Mapped Emotions'] = df_train['Emotions'].apply(EmotionMapping)
df_dev['Mapped Emotions'] = df_dev['Emotions'].apply(EmotionMapping)

In [10]:
df_train['anger'] = np.zeros((len(df_train),1))
df_train['disgust'] = np.zeros((len(df_train),1))
df_train['fear'] = np.zeros((len(df_train),1))
df_train['joy'] = np.zeros((len(df_train),1))
df_train['sadness'] = np.zeros((len(df_train),1))
df_train['surprise'] = np.zeros((len(df_train),1))
df_train['neutral'] = np.zeros((len(df_train),1))

df_dev['anger'] = np.zeros((len(df_dev),1))
df_dev['disgust'] = np.zeros((len(df_dev),1))
df_dev['fear'] = np.zeros((len(df_dev),1))
df_dev['joy'] = np.zeros((len(df_dev),1))
df_dev['sadness'] = np.zeros((len(df_dev),1))
df_dev['surprise'] = np.zeros((len(df_dev),1))
df_dev['neutral'] = np.zeros((len(df_dev),1))

In [11]:
for i in ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise','neutral']:
    df_train[i] = df_train['Mapped Emotions'].apply(lambda x: 1 if i in x else 0)
    df_dev[i] = df_dev['Mapped Emotions'].apply(lambda x: 1 if i in x else 0)

In [12]:
df_train.head()

Unnamed: 0,Text,Class,ID,List of classes,Len of classes,Emotions,Mapped Emotions,anger,disgust,fear,joy,sadness,surprise,neutral
0,My favourite food is anything I didn't have to...,27,eebbqej,[27],1,[neutral],[neutral],0,0,0,0,0,0,1
1,"Now if he does off himself, everyone will thin...",27,ed00q6i,[27],1,[neutral],[neutral],0,0,0,0,0,0,1
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj,[2],1,[anger],[anger],1,0,0,0,0,0,0
3,To make her feel threatened,14,ed7ypvh,[14],1,[fear],[fear],0,0,1,0,0,0,0
4,Dirty Southern Wankers,3,ed0bdzj,[3],1,[annoyance],[anger],1,0,0,0,0,0,0


In [13]:
df_dev.head()

Unnamed: 0,Text,Class,ID,List of classes,Len of classes,Emotions,Mapped Emotions,anger,disgust,fear,joy,sadness,surprise,neutral
0,Is this in New Orleans?? I really feel like th...,27,edgurhb,[27],1,[neutral],[neutral],0,0,0,0,0,0,1
1,"You know the answer man, you are programmed to...",427,ee84bjg,"[4, 27]",2,"[approval, neutral]","[joy, neutral]",0,0,0,1,0,0,1
2,I've never been this sad in my life!,25,edcu99z,[25],1,[sadness],[sadness],0,0,0,0,1,0,0
3,The economy is heavily controlled and subsidiz...,427,edc32e2,"[4, 27]",2,"[approval, neutral]","[joy, neutral]",0,0,0,1,0,0,1
4,He could have easily taken a real camera from ...,20,eepig6r,[20],1,[optimism],[joy],0,0,0,1,0,0,0


In [14]:
df_train.drop(df_train[df_train['neutral'] == 1].index, inplace=True)
df_dev.drop(df_dev[df_dev['neutral'] == 1].index, inplace=True)
df_train.drop(df_train[df_train['disgust'] == 1].index, inplace=True)
df_dev.drop(df_dev[df_dev['disgust'] == 1].index, inplace=True)

In [15]:
df_train.drop(['Class', 'List of classes', 'Len of classes', 'Emotions', 'Mapped Emotions', 'neutral', 'disgust'], axis=1, inplace=True)
df_dev.drop(['Class', 'List of classes', 'Len of classes', 'Emotions', 'Mapped Emotions', 'neutral', 'disgust'], axis=1, inplace=True)

In [16]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", 
                       "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", 
                       "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", 
                       "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am",
                       "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", 
                       "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
                       "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not",
                       "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", 
                       "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                       "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", 
                       "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
                       "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is",
                       "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would",
                       "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have",
                       "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
                       "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", 
                       "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did",
                       "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", 
                       "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", 
                       "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                       "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have",
                       "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'u.s':'america', 'e.g':'for example'}

punct = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-",
                 "`": "'", '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 
                 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', '!':' '}

mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater',
                'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ',
                'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can',
                'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 
                'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 
                'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 
                'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization',
                'demonetisation': 'demonetization'}

In [17]:
def clean_text(text):
    '''Clean emoji, Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = emoji.demojize(text)
    text = re.sub(r'\:(.*?)\:','',text)
    text = str(text).lower()    #Making Text Lowercase
    text = re.sub('\[.*?\]', '', text)
    #The next 2 lines remove html text
    text = BeautifulSoup(text, 'lxml').get_text()
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",", "'")
    text = re.sub(r"[^a-zA-Z?.!,¿']+", " ", text)
    return text

def clean_contractions(text, mapping):
    '''Clean contraction using contraction mapping'''    
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    for word in mapping.keys():
        if ""+word+"" in text:
            text = text.replace(""+word+"", ""+mapping[word]+"")
    #Remove Punctuations
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    text = re.sub(r"([?.!,¿])", r" \1 ", text)
    text = re.sub(r'[" "]+', " ", text)
    return text

def clean_special_chars(text, punct, mapping):
    '''Cleans special characters present(if any)'''   
    for p in mapping:
        text = text.replace(p, mapping[p])
    
    for p in punct:
        text = text.replace(p, f' {p} ')
    
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}  
    for s in specials:
        text = text.replace(s, specials[s])
    
    return text

def correct_spelling(x, dic):
    '''Corrects common spelling errors'''   
    for word in dic.keys():
        x = x.replace(word, dic[word])
    return x

def remove_space(text):
    '''Removes awkward spaces'''   
    #Removes awkward spaces 
    text = text.strip()
    text = text.split()
    return " ".join(text)

def text_preprocessing_pipeline(text):
    '''Cleaning and parsing the text.'''
    text = clean_text(text)
    text = clean_contractions(text, contraction_mapping)
    text = clean_special_chars(text, punct, punct_mapping)
    text = correct_spelling(text, mispell_dict)
    text = remove_space(text)
    return text

In [18]:
# df_train['Text'] = df_train['Text'].apply(text_preprocessing_pipeline)
# df_dev['Text'] = df_dev['Text'].apply(text_preprocessing_pipeline)

In [19]:
df_train.reset_index(drop=True).to_csv("train.csv", index=False)
df_dev.reset_index(drop=True).to_csv("val.csv", index=False)

In [20]:
df_train = df_train.reset_index(drop=True)
df_dev = df_dev.reset_index(drop=True)

In [21]:
df_train.head()

Unnamed: 0,Text,ID,anger,fear,joy,sadness,surprise
0,WHY THE FUCK IS BAYLESS ISOING,eezlygj,1,0,0,0,0
1,To make her feel threatened,ed7ypvh,0,1,0,0,0
2,Dirty Southern Wankers,ed0bdzj,1,0,0,0,0
3,OmG pEyToN iSn'T gOoD eNoUgH tO hElP uS iN tHe...,edvnz26,0,0,0,0,1
4,Yes I heard abt the f bombs! That has to be wh...,ee3b6wu,0,0,1,0,0


In [22]:
print(df_train.shape)
print(df_dev.shape)

(28427, 7)
(3564, 7)


In [23]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [24]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 200
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 64
EPOCHS = 10
LEARNING_RATE = 2e-5
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [25]:
target_cols = [col for col in df_train.columns if col not in ['Text', 'ID']]
target_cols

['anger', 'fear', 'joy', 'sadness', 'surprise']

In [26]:
class BERTDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.max_len = max_len
        self.text = df.Text
        self.tokenizer = tokenizer
        self.targets = df[target_cols].values
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [27]:
train_dataset = BERTDataset(df_train, tokenizer, MAX_LEN)
valid_dataset = BERTDataset(df_dev, tokenizer, MAX_LEN)

In [28]:
train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, 
                          num_workers=4, shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, 
                          num_workers=4, shuffle=False, pin_memory=True)

In [29]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.roberta = AutoModel.from_pretrained('roberta-base')
#         self.l2 = torch.nn.Dropout(0.3)
        self.fc = torch.nn.Linear(768,5)
    
    def forward(self, ids, mask, token_type_ids):
        _, features = self.roberta(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
#         output_2 = self.l2(output_1)
        output = self.fc(features)
        return output

model = BERTClass()
model.to(device);

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [31]:
optimizer = AdamW(params =  model.parameters(), lr=LEARNING_RATE, weight_decay=1e-6)

In [32]:
# criterion = torch.nn.BCEWithLogitsLoss()

In [33]:
# # Training loop with history tracking
# history = {'train_accuracy': [], 'train_loss': [], 'val_accuracy': [], 'val_loss': []}

# for epoch in range(EPOCHS):
#     # Training
#     model.train()
#     train_loss = 0.0
#     train_corrects = 0
#     processed = 0  # Initialize processed variable

#     for batch_idx, data in enumerate(train_loader, 0):
#         ids = data['ids'].to(device, dtype=torch.long)
#         mask = data['mask'].to(device, dtype=torch.long)
#         token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
#         targets = data['targets'].to(device, dtype=torch.float)

#         outputs = model(ids, mask, token_type_ids)
#         loss = criterion(outputs, targets)
#         loss.backward()
#         optimizer.step()
#         optimizer.zero_grad()

#         # Update processed variable
#         processed += len(targets)

#         # Print training progress:
#         print(f"Epoch {epoch + 1}/{EPOCHS}, Batch {batch_idx + 1}/{len(train_loader)}, "
#               f"Loss={loss.item():.4f}, Accuracy={train_corrects / processed:.2f}%")

#         train_loss += loss.item() * len(targets)
#         train_corrects += torch.sum(torch.round(torch.sigmoid(outputs)) == targets.data)

#     train_loss /= len(df_train)
#     train_accuracy = train_corrects.double() / len(df_train)

#     # Validation
#     model.eval()
#     val_loss = 0.0
#     val_corrects = 0

#     with torch.no_grad():
#         for _, data in enumerate(valid_loader, 0):
#             ids = data['ids'].to(device, dtype=torch.long)
#             mask = data['mask'].to(device, dtype=torch.long)
#             token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
#             targets = data['targets'].to(device, dtype=torch.float)

#             outputs = model(ids, mask, token_type_ids)
#             loss = criterion(outputs, targets)

#             val_loss += loss.item() * len(targets)
#             val_corrects += torch.sum(torch.round(torch.sigmoid(outputs)) == targets.data)

#     val_loss /= len(df_dev)
#     val_accuracy = val_corrects.double() / len(df_dev)

#     # Append accuracy and loss values to history
#     history['train_accuracy'].append(train_accuracy.item())
#     history['train_loss'].append(train_loss)
#     history['val_accuracy'].append(val_accuracy.item())
#     history['val_loss'].append(val_loss)

#     print(f"Epoch {epoch + 1}/{EPOCHS}, "
#           f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, "
#           f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

In [34]:
from tqdm import tqdm
tqdm.pandas()

In [35]:
def train(epoch):
    model.train()
    fin_targets=[]
    fin_outputs=[]
    train_loss=[]
    pbar = tqdm(enumerate(train_loader), total=len(train_loader), desc='Train ')
    for _, data in pbar:
#     for _,data in enumerate(train_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)

#         if _%500 == 0:
#             print(f'Epoch: {epoch}, Loss:  {loss.item()}')
#         print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        train_loss.append(loss.item())

        fin_targets.extend(targets.cpu().detach().numpy().tolist())
        fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
        
    fin_outputs = np.array(fin_outputs) >= 0.5
    accuracy = metrics.accuracy_score(fin_targets, fin_outputs)
    return accuracy,sum(train_loss)/len(train_loss)
    

In [36]:
def validation():
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    val_loss=[]
    with torch.no_grad():
        pbar = tqdm(enumerate(valid_loader), total=len(valid_loader), desc='Valid ')
        for _, data in pbar:
#         for _, data in enumerate(valid_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            val_loss.append(loss.item())
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
        
        fin_outputs = np.array(fin_outputs) >= 0.5
        accuracy = metrics.accuracy_score(fin_targets, fin_outputs)
    return accuracy,sum(val_loss)/len(val_loss)

In [37]:
# history = {'train_accuracy': [], 'train_loss': [], 'val_accuracy': [], 'val_loss': []}

# for epoch in range(15):
#     train_accuracy,train_loss=train(epoch)
#     val_accuracy,val_loss=validation()
#     history['train_accuracy'].append(train_accuracy)
#     history['train_loss'].append(train_loss)
#     history['val_accuracy'].append(val_accuracy)
#     history['val_loss'].append(val_loss)
    

In [38]:
# import torch

# # Ensure you have your device setup for GPU if available
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)

# # Assuming your train_loader and val_loader are already defined
# history = {
#     'train_accuracy': [],
#     'train_loss': [],
#     'val_accuracy': [],
#     'val_loss': []
# }

# scaler = torch.cuda.amp.GradScaler()  # For mixed precision training

# for epoch in range(15):
#     model.train()
#     train_accuracy, train_loss = 0.0, 0.0
#     for data, target in train_loader:
#         data, target = data.to(device), target.to(device)
#         optimizer.zero_grad()
        
#         with torch.cuda.amp.autocast():  # Mixed precision context
#             output = model(data)
#             loss = criterion(output, target)
        
#         scaler.scale(loss).backward()
#         scaler.step(optimizer)
#         scaler.update()
        
#         # Calculate train accuracy and loss
#         train_loss += loss.item() * data.size(0)
#         _, predicted = torch.max(output, 1)
#         correct = (predicted == target).sum().item()
#         train_accuracy += correct

#     # Average over all batches
#     train_loss /= len(train_loader.dataset)
#     train_accuracy /= len(train_loader.dataset)

#     # Perform validation
#     val_accuracy, val_loss = validate()

#     # Store the metrics in history
#     history['train_accuracy'].append(train_accuracy)
#     history['train_loss'].append(train_loss)
#     history['val_accuracy'].append(val_accuracy)
#     history['val_loss'].append(val_loss)
# # 

In [39]:
import torch
import torch.optim as optim
import torch.nn.functional as F
from torch.profiler import profile, record_function, ProfilerActivity

# Dummy train and validation functions (you should replace these with your actual functions)
def train(epoch):
    model.train()
    train_loss = 0
    correct = 0
    for data, target in train_loader:
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item() * data.size(0)
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
    
    train_loss /= len(train_loader.dataset)
    train_accuracy = correct / len(train_loader.dataset)
    return train_accuracy, train_loss

def validate():
    model.eval()
    val_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in val_loader:
            output = model(data)
            val_loss += criterion(output, target).item() * data.size(0)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    
    val_loss /= len(val_loader.dataset)
    val_accuracy = correct / len(val_loader.dataset)
    return val_accuracy, val_loss

# Dummy data loaders (you should replace these with your actual data loaders)
train_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(torch.randn(1000, 10), torch.randint(0, 2, (1000,))),
    batch_size=32,
    shuffle=True
)
val_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(torch.randn(200, 10), torch.randint(0, 2, (200,))),
    batch_size=32,
    shuffle=False
)

# Dummy model, criterion, and optimizer (you should replace these with your actual model, criterion, and optimizer)
model = torch.nn.Linear(10, 2)
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

history = {
    'train_accuracy': [],
    'train_loss': [],
    'val_accuracy': [],
    'val_loss': []
}

# Profiling
with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
    for epoch in range(15):
        with record_function("train"):
            train_accuracy, train_loss = train(epoch)
        with record_function("validate"):
            val_accuracy, val_loss = validate()
        history['train_accuracy'].append(train_accuracy)
        history['train_loss'].append(train_loss)
        history['val_accuracy'].append(val_accuracy)
        history['val_loss'].append(val_loss)

# Print profiling results
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                  train        21.61%     303.722ms        87.81%        1.234s      82.279ms            15  
enumerate(DataLoader)#_SingleProcessDataLoaderIter._...        22.78%     320.232ms        52.79%     741.940ms       1.206ms           615  
                                           aten::select        16.21%     227.777ms        17.87%     251.111ms       6.975us         36000  
                                               validate         1.37%      19.223ms        12.19%     171.362ms      11.424ms            15  
      

In [40]:
history

{'train_accuracy': [0.506,
  0.506,
  0.507,
  0.505,
  0.507,
  0.511,
  0.509,
  0.504,
  0.508,
  0.505,
  0.507,
  0.517,
  0.513,
  0.513,
  0.511],
 'train_loss': [0.792926646232605,
  0.768752799987793,
  0.7497316236495972,
  0.7346740913391113,
  0.7233669605255127,
  0.7152758641242981,
  0.7086454339027405,
  0.7041488428115845,
  0.7002819595336914,
  0.6979957680702209,
  0.6964565601348877,
  0.6951469793319702,
  0.6944373993873596,
  0.6938393459320068,
  0.6935570149421691],
 'val_accuracy': [0.53,
  0.525,
  0.53,
  0.52,
  0.495,
  0.5,
  0.5,
  0.495,
  0.5,
  0.49,
  0.495,
  0.49,
  0.49,
  0.475,
  0.455],
 'val_loss': [0.7983388948440552,
  0.774374783039093,
  0.7556307220458984,
  0.7413459634780883,
  0.7308582425117492,
  0.7215012168884277,
  0.7158073925971985,
  0.7108525252342224,
  0.7069826412200928,
  0.7047593069076538,
  0.7024299621582031,
  0.7006842136383057,
  0.6998989224433899,
  0.6994072127342225,
  0.6993584060668945]}

In [41]:
pd.DataFrame(history). to_csv("resultmain.csv")

In [42]:
torch.save(model.state_dict(), 'model.bin')

In [43]:
# outputs, targets = validation()
# outputs = np.array(outputs) >= 0.5
# accuracy = metrics.accuracy_score(targets, outputs)
# f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
# f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
# print(f"Accuracy Score = {accuracy}")
# print(f"F1 Score (Micro) = {f1_score_micro}")
# print(f"F1 Score (Macro) = {f1_score_macro}")

In [44]:
pip install numba


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: C:\Users\Ap\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip


In [45]:
import numpy as np
from sklearn import metrics
import time
from numba import jit
from joblib import Parallel, delayed
import cProfile
import pstats

# Example validation function (replace this with your actual function)
def validation():
    outputs = np.random.rand(100000)  # Large number of predictions
    targets = np.random.randint(0, 2, 100000)  # Corresponding binary targets
    return outputs, targets

# Convert binary conversion function to use Numba
@jit(nopython=True)
def binary_conversion(outputs):
    return outputs >= 0.5

# Split data into chunks
def split_data(data, n_chunks):
    chunk_size = len(data) // n_chunks
    return [data[i*chunk_size:(i+1)*chunk_size] for i in range(n_chunks)]

# Main function to run the validation and scoring
def main():
    # Start timing
    start_time = time.time()

    # Retrieve outputs and targets
    outputs, targets = validation()

    # Split data into chunks
    n_chunks = 4  # Number of parallel processes
    output_chunks = split_data(outputs, n_chunks)

    # Convert outputs to binary predictions in parallel
    outputs_binary = np.concatenate(Parallel(n_jobs=n_chunks)(delayed(binary_conversion)(chunk) for chunk in output_chunks))

    # Calculate accuracy and F1 scores
    accuracy = metrics.accuracy_score(targets, outputs_binary)
    f1_score_micro = metrics.f1_score(targets, outputs_binary, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs_binary, average='macro')

    # Print results
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

    # End timing
    end_time = time.time()
    print(f"Time taken: {end_time - start_time} seconds")

# Profile the main function
cProfile.run('main()', 'profile_stats')

# Print profiling results
p = pstats.Stats('profile_stats')
p.sort_stats('cumulative').print_stats(10)


Accuracy Score = 0.49831
F1 Score (Micro) = 0.49831
F1 Score (Macro) = 0.4983098482387291
Time taken: 2.6352005004882812 seconds
Fri Jul 12 20:52:00 2024    profile_stats

         15343 function calls (15243 primitive calls) in 2.636 seconds

   Ordered by: cumulative time
   List reduced from 654 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      4/1    0.000    0.000    2.636    2.636 {built-in method builtins.exec}
        1    0.000    0.000    2.636    2.636 <string>:1(<module>)
        1    0.000    0.000    2.636    2.636 C:\Users\Ap\AppData\Local\Temp\ipykernel_9100\3467309218.py:26(main)
        1    0.000    0.000    2.472    2.472 C:\Users\Ap\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\parallel.py:1847(__call__)
        6    0.000    0.000    2.454    0.409 C:\Users\Ap\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\parallel.py:1583(_get_outputs)
        1    0.006    0.006    2.

<pstats.Stats at 0x283771b52d0>

In [46]:
# from transformers import AutoConfig, AutoModel

In [47]:
#config = AutoConfig.from_pretrained('bert-base-uncased')
#model =  AutoModel.from_config(config)

In [48]:
#PATH = ""
#model.load_state_dict(torch.load(PATH))