In [None]:
# Directory Tree:
'''
data
├── classifier             # Directory containing classifier data
│   ├── inp_classifier_test.pkl    # Pickled file containing test data for classifier input
│   ├── inp_classifier_train.pkl   # Pickled file containing train data for classifier input
│   └── inp_classifier_val.pkl     # Pickled file containing validation data for classifier input
├── embeddings             # Directory containing embeddings data
│   ├── test_embeddings.pkl    # Pickled file containing embeddings for test data
│   ├── train_embeddings.pkl   # Pickled file containing embeddings for train data
│   └── val_embeddings.pkl     # Pickled file containing embeddings for validation data
├── original               # Directory containing original data
│   ├── test.json      # JSON file containing original test data
│   └── train.json     # JSON file containing original train data
└── processed              # Directory containing processed data
    ├── test.pkl       # Pickled file containing processed test data
    ├── train.pkl      # Pickled file containing processed train data
    └── val.pkl        # Pickled file containing processed validation data
'''

# Data Formats

'''
-processed:         
    ->List(
        -ID                              # conversation_ID
        -List(sentences)                 # encoded and padded
        -List(speakers)                  # one-hot-encoded 
        -List(emotions)                  # label-encoded
        -List(cause)                     # 1/0 representing whether sentence is cause of emotion or not
        -List(target_ID,cause_ID)        # Emotion-Cause Pair 
    )


-embeddings:         
    ->List(
        -ID                              # conversation_ID
        -List(emotion_embedding)         # emotion capturing embeddings
        -List(cause_embedding)           # cause capturing embeddings
        -List(emotion_label)             # ground truth(label-encoded) for emotions
        -List(emotion_pred)              # predictions(Y/N) for emotions
        -List(cause_label)               # ground truth(label-encoded) for causes
        -List(cause_pred)                # predictions(Y/N) for causes
        -List(target_ID,cause_ID)        # Emotion-Cause Pair 
    )
    
    
-classifier:         
    ->List(
        -ID                              # conversation_ID
        -List(label_pair)                # ground truth(Y/N) for Emotion-Cause Pair 
        -List(label_emotion)             # ground truth(label-encoded) for emotions
        -List(features)                  # concatenated emotion_embedding+cause_embedding for proposed emotion-cause pair
        -List(indexes)                   # indexes of the proposed emotion-cause pair
    )
    
    
-inp_span:
    ->Dict(
        -train
        -val
            -ID                          # Conv_ID,Target_ID,Cause_ID
            -emotion                     # emotion for Target_ID
    )
    

'''

pass

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.preprocessing import OneHotEncoder
import json
from copy import deepcopy as cpy

from torch.utils.data import Dataset,DataLoader
import torch.nn as nn

from sklearn.metrics import *

import gc

import warnings
warnings.filterwarnings('ignore')

In [None]:
import os
import random
import numpy as np

RANDOM_SEED = 42
# Run on the GPU whenever PyTorch can see one, otherwise stay on the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


def seed_all(seed):
    """Seed every random number generator this notebook draws from.

    Covers Python's `random`, NumPy's legacy global generator and PyTorch
    (CPU and CUDA), exports PYTHONHASHSEED, and asks cuDNN for deterministic
    kernels.

    Args:
        seed: integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True


seed_all(RANDOM_SEED)

In [None]:
import pickle

class LoadSave:
    """Pickle round-trip helper bound to a single file path."""

    def __init__(self,path):
        """Remember `path`; nothing is opened until `load` or `save` is called."""
        self.path=path

    def load(self):
        """Return the object unpickled from `self.path`.

        Raises whatever `open` / `pickle.load` raise (missing file, corrupt
        pickle); the file handle is closed in every case.
        """
        # NOTE(review): pickle.load can execute arbitrary code embedded in the
        # file, so only point this at pickles produced by this project.
        with open(self.path, 'rb') as f:
            return pickle.load(f)

    def save(self,here):
        """Pickle `here` to `self.path`, overwriting any existing file."""
        with open(self.path, 'wb') as f:
            pickle.dump(here,f)

In [None]:
import shutil 

# Working-directory layout used by the save cells further down. makedirs with
# exist_ok=True makes the cell safe to re-run and, unlike the former bare
# `except: pass`, does not hide real failures such as a read-only filesystem.
for directory in (
    "./data",
    "./data/classifier",
    "./data/embeddings",
    "./data/processed",
    "./data/span",
    "./models",
):
    os.makedirs(directory, exist_ok=True)

# Best-effort copy of the original dataset folder: it fails when the target
# already exists (re-run) or the Kaggle input is not mounted, and neither case
# should stop the notebook. shutil.Error is a subclass of OSError.
try:
    shutil.copytree("/kaggle/input/project-dataset/data/original", "./data/original")
except OSError as err:
    print("Skipping copy of data/original:", err)


shutil.copy("/kaggle/input/project-dataset/data/test.json", "./data/test.json")
shutil.copy("/kaggle/input/project-dataset/data/train.json", "./data/train.json")

# Keeps the cell from echoing the return value of the last shutil.copy call.
pass

In [None]:
!pip install sentence_transformers 

In [None]:
from sentence_transformers import SentenceTransformer
S_Bert=SentenceTransformer("all-MiniLM-L6-v2")


In [None]:
class Data():
    """Turns the raw conversation JSON into fixed-length, model-ready records.

    With ``load=True`` the instance only reads previously pickled records and
    none of the encoding state (speaker encoder, emotion map, constants) is
    created, so `processor` must not be called on such an instance. With
    ``load=False`` the speaker vocabulary is collected from train.json and a
    one-hot encoder is fitted on it.
    """

    def __init__(self,load=True):
        self.load=load
        if(load):
            return

        with open("/kaggle/input/project-dataset/data/train.json","r") as f:
            data=json.load(f)

        # Every conversation is padded / truncated to this many utterances.
        self.MAX_SENTENCES=35
        self.UNKNOWN_SPEAKER="unknown"
        # "padded" is deliberately NOT fitted: with handle_unknown='ignore' the
        # encoder maps it to an all-zero row.
        self.PADDED_SPEAKER="padded"
        self.UNKNOWN_EMOTION="undefined"
        # Cause flag written into padded positions (real utterances use 0/1).
        self.UNKNOWN_CAUSE=2
        self.emotion_map={
            self.UNKNOWN_EMOTION:-1,
            'disgust':0,
            'joy':1,
            'surprise':2,
            'anger':3,
            'fear':4,
            'neutral':5,
            'sadness':6
        }

        self.speakers={sent["speaker"] for conv in data for sent in conv["conversation"]}
        self.speakers.add(self.UNKNOWN_SPEAKER)

        # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the
        # old keyword was removed in 1.4; support both.
        try:
            self.speaker_encoder=OneHotEncoder(sparse_output=False,handle_unknown='ignore')
        except TypeError:
            self.speaker_encoder=OneHotEncoder(sparse=False,handle_unknown='ignore')

        # NOTE(review): the width of this one-hot encoding must match the
        # speaker width hard-coded in the LSTM input size (283) - confirm.
        self.speaker_encoder.fit([[label] for label in self.speakers])

    def processor(self,data):
        """Convert parsed conversations into a list of per-conversation dicts.

        Each returned dict has the keys:
            "sentences"    - S_Bert sentence embeddings, one row per utterance
            "emotions"     - emotion ids from `emotion_map` (-1 on padding)
            "speakers"     - one-hot speaker rows (all zeros on padding)
            "cause"        - 1 if the utterance is a cause, 0 if not, 2 on padding
            "target_cause" - array of [target_id, cause_id] pairs (1-based ids)
            "ID"           - integer conversation id

        All per-utterance fields are cut to `MAX_SENTENCES` rows.
        """
        final_data=[]

        for conv in data:
            utterances=conv["conversation"]
            filled=len(utterances)
            # Negative when the conversation is longer than MAX_SENTENCES.
            padding=max(self.MAX_SENTENCES-filled,0)

            sentences=[sent["text"] for sent in utterances]+[" "]*padding
            emotions=[sent["emotion"] for sent in utterances]+[self.UNKNOWN_EMOTION]*padding
            speakers=[
                [sent["speaker"] if sent["speaker"] in self.speakers else self.UNKNOWN_SPEAKER]
                for sent in utterances
            ]
            speakers+=[[self.PADDED_SPEAKER] for _ in range(padding)]

            speakers=self.speaker_encoder.transform(speakers)
            sentences=S_Bert.encode(sentences,show_progress_bar=False)
            emotions=[self.emotion_map[name] for name in emotions]

            cause=[0]*self.MAX_SENTENCES
            target_cause=[]
            for pair in conv["emotion-cause_pairs"]:
                target_id=int(pair["target_id"])
                cause_id=int(pair["cause_id"])
                # Ids are 1-based. A cause beyond the truncation window used to
                # raise IndexError and an id of 0 silently marked the LAST slot;
                # such causes are now only kept in target_cause.
                if 1<=cause_id<=self.MAX_SENTENCES:
                    cause[cause_id-1]=1
                target_cause.append([target_id,cause_id])

            for padd_idx in range(filled,self.MAX_SENTENCES):
                cause[padd_idx]=self.UNKNOWN_CAUSE

            # A fresh dict is built per conversation, so no defensive deep copy
            # is needed before storing it.
            final_data.append({
                "sentences":sentences[:self.MAX_SENTENCES],
                "emotions":np.array(emotions[:self.MAX_SENTENCES]),
                "speakers":speakers[:self.MAX_SENTENCES],
                "cause":np.array(cause[:self.MAX_SENTENCES]),
                "target_cause":np.array(target_cause),
                "ID":int(conv["conversation_ID"]),
            })

        return final_data

    def train_data(self):
        """Return the training records: cached pickle if `load`, else processed train.json."""
        if(self.load):
            temp=LoadSave("/kaggle/input/project-dataset/data/processed/train_processed.pkl")
            return temp.load()
        with open("/kaggle/input/project-dataset/data/train.json","r") as f:
            data=json.load(f)

        return self.processor(data)

    def val_data(self):
        """Return the validation records: cached pickle if `load`, else processed test.json.

        NOTE(review): the validation split is built from test.json - confirm
        that this is intended.
        """
        if(self.load):
            temp=LoadSave("/kaggle/input/project-dataset/data/processed/val_processed.pkl")
            return temp.load()
        with open("/kaggle/input/project-dataset/data/test.json","r") as f:
            data=json.load(f)

        return self.processor(data)


In [None]:
# load=False: rebuild the processed records from the raw JSON instead of
# reading the cached pickles.
data=Data(False)

# data=Data(True) # this is the one!!!

In [None]:
# Lists of per-conversation dicts (see Data.processor for the keys).
train=data.train_data()
val=data.val_data()

In [None]:
# Cache the processed records locally so they can be re-used without
# re-encoding every sentence.
temp=LoadSave("./data/processed/train_processed.pkl")
temp.save(train)

temp=LoadSave("./data/processed/val_processed.pkl")
temp.save(val)

In [None]:
# conversation ID -> gold [target_id, cause_id] pairs, filled by Samples.__init__.
mapper={}

class Samples(Dataset):
    """Dataset over processed conversation dicts.

    On construction the "target_cause" entry is moved out of every record into
    the module-level `mapper` (keyed by conversation ID), so the records handed
    to the DataLoader no longer contain it. The records are modified in place.
    """

    def __init__(self,data):
        self.data=data

        for record in self.data:
            conv_id=int(record["ID"])
            if "target_cause" in record:
                mapper[conv_id]=record.pop("target_cause")
            else:
                # The pairs were already moved out by an earlier Samples built
                # on the same records; keep them instead of overwriting with [].
                mapper.setdefault(conv_id,[])

    def __len__(self):
        return len(self.data)

    def __getitem__(self,idx):
        return self.data[idx]

In [None]:
BATCH_SIZE=4

# Building Samples also fills the global `mapper` (ID -> target_cause) as a
# side effect. No `shuffle` argument is passed, so the DataLoader default applies.
train_loader=DataLoader(Samples(train),batch_size=BATCH_SIZE)
val_loader=DataLoader(Samples(val),batch_size=BATCH_SIZE)

# Deep-copied snapshot of the gold pairs; Samples is instantiated again further
# down (embedding loaders), which touches `mapper` again.
# NOTE(review): train and val share this one ID-keyed dict - confirm that
# conversation IDs never collide between the two files.
target_cause_map=cpy(mapper)

In [None]:
class Emotion_Rep(nn.Module):
    """Bidirectional-LSTM utterance encoder with a 2-way emotion head.

    Args:
        sentence_dim: width of each sentence embedding (default 384).
        speaker_dim: width of each speaker encoding (default 283).
            NOTE(review): presumably the one-hot width produced by the speaker
            encoder - confirm.
        hidden_size: LSTM hidden size per direction (default 1024), so the
            contextual embedding has `2 * hidden_size` features.
    """

    def __init__(self, sentence_dim=384, speaker_dim=283, hidden_size=1024):
        super().__init__()
        self.lstm= nn.LSTM(input_size=sentence_dim+speaker_dim, hidden_size=hidden_size, batch_first=True, bidirectional=True)

        self.layer= nn.Sequential(
            nn.Linear(hidden_size*2, 512),
            nn.LeakyReLU(),
            nn.Linear(512, 128),
            nn.LeakyReLU(),
            nn.Linear(128, 2),
        )

    def embedding(self,x, x1):
        """Return the per-utterance LSTM outputs, shape (batch, seq, 2*hidden_size).

        `x` holds sentence embeddings and `x1` speaker encodings; they are
        concatenated on the feature axis. `x1` is cast to `x`'s dtype so an
        integer speaker tensor can be joined with float embeddings.
        """
        x=torch.cat((x,x1.to(x.dtype)),dim=2)
        return self.lstm(x)[0]

    def forward(self, x, x1):
        """Return the 2-way logits for every utterance, shape (batch, seq, 2)."""
        return self.layer(self.embedding(x,x1))
    
class Cause_Rep(nn.Module):
    """Bidirectional-LSTM utterance encoder with a 2-way cause head.

    Args:
        sentence_dim: width of each sentence embedding (default 384).
        speaker_dim: width of each speaker encoding (default 283).
            NOTE(review): presumably the one-hot width produced by the speaker
            encoder - confirm.
        hidden_size: LSTM hidden size per direction (default 1024), so the
            contextual embedding has `2 * hidden_size` features.
    """

    def __init__(self, sentence_dim=384, speaker_dim=283, hidden_size=1024):
        super().__init__()
        self.lstm= nn.LSTM(input_size=sentence_dim+speaker_dim, hidden_size=hidden_size, batch_first=True, bidirectional=True)

        self.layer= nn.Sequential(
            nn.Linear(hidden_size*2, 512),
            nn.LeakyReLU(),
            nn.Linear(512, 128),
            nn.LeakyReLU(),
            nn.Linear(128, 2),
        )

    def embedding(self,x, x1):
        """Return the per-utterance LSTM outputs, shape (batch, seq, 2*hidden_size).

        `x` holds sentence embeddings and `x1` speaker encodings; they are
        concatenated on the feature axis. `x1` is cast to `x`'s dtype so an
        integer speaker tensor can be joined with float embeddings.
        """
        x=torch.cat((x,x1.to(x.dtype)),dim=2)
        return self.lstm(x)[0]

    def forward(self, x, x1):
        """Return the 2-way logits for every utterance, shape (batch, seq, 2)."""
        return self.layer(self.embedding(x,x1))

In [None]:
# One independent encoder per task; both are trained jointly below.
emotion_model=Emotion_Rep().to(DEVICE)
cause_model=Cause_Rep().to(DEVICE)

In [None]:
criterion_emotion = nn.CrossEntropyLoss()
criterion_cause = nn.CrossEntropyLoss()

# A single Adam instance updates the parameters of both models at once.
optimizer= torch.optim.Adam(list(emotion_model.parameters())+list(cause_model.parameters()), lr=1e-3)
# optimizer= torch.optim.Adam(emotion_model.parameters(), lr=1e-3)

In [None]:
logger={"emotion":[],"cause":[]}

# Sentinel / label values written by Data.processor.
PAD_EMOTION=-1      # emotion id of padded utterances
NEUTRAL_EMOTION=5   # id of 'neutral' in Data.emotion_map
PAD_CAUSE=2         # cause flag of padded utterances


def _stage1_step(batch,store):
    """Run both utterance-level heads on one batch and return their summed loss.

    Padded utterances are dropped before the loss. Emotion targets are
    binarised to 1 = any non-neutral emotion, 0 = neutral. The flattened
    targets and argmax predictions of the batch are appended to
    `store[head]["true"]` / `store[head]["pred"]` for head in
    ("emotion", "cause").
    """
    inp=batch["sentences"].to(DEVICE)
    spk=batch["speakers"].to(torch.long).to(DEVICE)
    emotion_ids=batch["emotions"].to(DEVICE).view(-1).to(torch.long)
    cause_ids=batch["cause"].to(DEVICE).view(-1).to(torch.long)

    # Emotion head.
    emotion_logits=emotion_model(inp,spk).view(-1, 2).float()
    real=emotion_ids!=PAD_EMOTION
    emotion_logits=emotion_logits[real]
    emotion_targets=(emotion_ids[real]!=NEUTRAL_EMOTION).to(torch.long)
    store["emotion"]["true"]+=emotion_targets.cpu().tolist()
    store["emotion"]["pred"]+=torch.argmax(emotion_logits,dim=1).cpu().tolist()
    loss_emotion=criterion_emotion(emotion_logits,emotion_targets)

    # Cause head.
    cause_logits=cause_model(inp,spk).view(-1, 2).float()
    real=cause_ids!=PAD_CAUSE
    cause_logits=cause_logits[real]
    cause_targets=cause_ids[real]
    store["cause"]["true"]+=cause_targets.cpu().tolist()
    store["cause"]["pred"]+=torch.argmax(cause_logits,dim=1).cpu().tolist()
    loss_cause=criterion_cause(cause_logits,cause_targets)

    # CrossEntropyLoss already averages over the kept utterances (default
    # reduction='mean'); dividing by the utterance count a second time, as was
    # done before, made the loss scale depend on how much padding a batch had.
    return loss_emotion+loss_cause


print("Training\n")
NUM_EPOCHS=5

for i in range(NUM_EPOCHS):

    train_loss=0
    val_loss=0
    batch_train=0
    batch_val=0

    # Per-epoch binary targets / predictions; pass them to
    # classification_report when a per-class breakdown is wanted.
    labels={
        head:{split:{"pred":[],"true":[]} for split in ("train","val")}
        for head in ("emotion","cause")
    }
    train_store={head:labels[head]["train"] for head in labels}
    val_store={head:labels[head]["val"] for head in labels}

    emotion_model.train()
    cause_model.train()

    for batch in train_loader:
        net_loss=_stage1_step(batch,train_store)

        train_loss+=net_loss.item()
        batch_train+=1

        optimizer.zero_grad()
        net_loss.backward()
        optimizer.step()

    train_loss/=max(batch_train,1)

    emotion_model.eval()
    cause_model.eval()

    # No gradients are needed for validation; skipping the graph saves memory.
    with torch.no_grad():
        for batch in val_loader:
            val_loss+=_stage1_step(batch,val_store).item()
            batch_val+=1

    val_loss/=max(batch_val,1)

    print("Epoch->",i)
    print("\tTrain->",train_loss,"\tVal->",val_loss)

print("Trained\n")

# The whole module objects are pickled (not just state_dicts), so loading them
# needs the class definitions to be importable.
torch.save(emotion_model,"Emotion_Rep.pt")
torch.save(cause_model,"Cause_Rep.pt")

In [None]:
# batch_size=1: the embedding cell below processes one conversation at a time
# (it reads batch["ID"][0] and strips that conversation's padding).
train_embeddings_loader=DataLoader(Samples(train),batch_size=1)
val_embeddings_loader=DataLoader(Samples(val),batch_size=1)


In [None]:
def _extract_embeddings(loader):
    """Return one embedding record per conversation yielded by `loader`.

    `loader` must use batch_size=1. Padded utterances (emotion id -1) are
    removed before the models run. Each record holds:
        "ID", "emotion_label", "cause_label"      - ids / labels of the real utterances
        "emotion_embedding", "cause_embedding"     - LSTM outputs of the two models
        "emotion_pred", "cause_pred"               - argmax of the two 2-way heads
        "target_cause"                             - gold pairs from target_cause_map
    """
    records=[]
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for batch in loader:
            record={}
            record["ID"]=int(batch["ID"][0].item())
            inp=batch["sentences"].to(DEVICE)
            emotion_ids=batch["emotions"].to(DEVICE)
            cause_ids=batch["cause"].to(DEVICE)
            spk=batch["speakers"].to(torch.long).to(DEVICE)

            # Boolean indexing drops the batch axis; unsqueeze restores it.
            real=emotion_ids!=-1
            emotion_ids=emotion_ids[real]
            cause_ids=cause_ids[real]
            inp=torch.unsqueeze(inp[real],0)
            spk=torch.unsqueeze(spk[real],0)

            # Run each LSTM once and feed its output to the head; forward() is
            # exactly layer(embedding(...)), so the predictions are unchanged.
            emotion_states=emotion_model.embedding(inp,spk)
            cause_states=cause_model.embedding(inp,spk)

            record["emotion_label"]=emotion_ids.cpu().numpy()
            record["cause_label"]=cause_ids.cpu().numpy()
            record["emotion_embedding"]=emotion_states.squeeze(dim=0).cpu().numpy()
            record["cause_embedding"]=cause_states.squeeze(dim=0).cpu().numpy()

            emotion_logits=emotion_model.layer(emotion_states).view(-1, 2).float()
            record["emotion_pred"]=torch.argmax(emotion_logits,dim=1).cpu().tolist()

            cause_logits=cause_model.layer(cause_states).view(-1, 2).float()
            record["cause_pred"]=torch.argmax(cause_logits,dim=1).cpu().tolist()

            record["target_cause"]=target_cause_map[record["ID"]]

            records.append(record)
    return records


print("Generating embeddings\n")

emotion_model.eval()
cause_model.eval()

train_embeddings=_extract_embeddings(train_embeddings_loader)
print("\tGenerated Train")

val_embeddings=_extract_embeddings(val_embeddings_loader)
print("\tGenerated Val")



In [None]:

# Persist the per-conversation embedding records produced above.
temp=LoadSave("./data/embeddings/train_embeddings.pkl")
temp.save(train_embeddings)

temp=LoadSave("./data/embeddings/val_embeddings.pkl")
temp.save(val_embeddings)

In [None]:
# train_embeddings=[]
# val_embeddings=[]


# temp=LoadSave("/kaggle/input/project-dataset/data/embeddings/train_embeddings.pkl")
# train_embeddings=temp.load()

# temp=LoadSave("/kaggle/input/project-dataset/data/embeddings/val_embeddings.pkl")
# val_embeddings=temp.load()

In [None]:
def _pair_candidates(conv):
    """Build every (predicted-emotion, predicted-cause) candidate pair of one conversation.

    For each utterance predicted as emotional and each utterance predicted as
    a cause, one candidate is produced with:
        "features"      - emotion embedding concatenated with cause embedding
        "label_emotion" - gold emotion id of the emotion utterance
        "label_pair"    - 1 if (target_id, cause_id) is a gold pair, else 0
        "indexes"       - [conversation ID, target_id, cause_id] (1-based ids)
    The returned dict also carries the conversation "ID". All four candidate
    fields are NumPy arrays (empty when no candidate exists).
    """
    em_ids=np.where(np.array(conv["emotion_pred"])==1)[0]
    ca_ids=np.where(np.array(conv["cause_pred"])==1)[0]

    gold_pairs={tuple(np.asarray(pair).tolist()) for pair in conv["target_cause"]}

    candidate={
        "ID":conv["ID"],
        "label_pair":[],
        "label_emotion":[],
        "features":[],
        "indexes":[]
    }

    for em_id in em_ids:
        for ca_id in ca_ids:
            # Gold pairs use 1-based utterance ids; array positions are 0-based.
            target_id,cause_id=int(em_id)+1,int(ca_id)+1
            candidate["features"].append(
                np.concatenate([conv["emotion_embedding"][em_id],conv["cause_embedding"][ca_id]])
            )
            candidate["label_emotion"].append(conv["emotion_label"][em_id])
            candidate["label_pair"].append(1 if (target_id,cause_id) in gold_pairs else 0)
            candidate["indexes"].append([conv["ID"],target_id,cause_id])

    for key in ("features","label_emotion","label_pair","indexes"):
        candidate[key]=np.array(candidate[key])
    return candidate


train_conversations=[_pair_candidates(conv) for conv in train_embeddings]
val_conversations=[_pair_candidates(conv) for conv in val_embeddings]
        
    

In [None]:
# Persist the candidate-pair records that feed the two classifiers below.
temp=LoadSave("./data/classifier/inp_classifier_train.pkl")
temp.save(train_conversations)

temp=LoadSave("./data/classifier/inp_classifier_val.pkl")
temp.save(val_conversations)

In [None]:
# train_conversations=[]
# val_conversations=[]

# temp=LoadSave("/kaggle/input/project-dataset/data/classifier/inp_classifier_train.pkl")
# train_conversations=temp.load()

# temp=LoadSave("/kaggle/input/project-dataset/data/classifier/inp_classifier_val.pkl")
# val_conversations=temp.load()

In [None]:
# Flat accumulators over all conversations; the "BatchMaking" cell appends to
# them, so re-run this cell first if that cell is executed again.
label_pair_train=[]
label_emotion_train=[]
features_train=[]
identifier_train=[]

label_pair_val=[]
label_emotion_val=[]
features_val=[]
identifier_val=[]

In [None]:
class Samples_Classifier(Dataset):
    """Row-aligned tensors for the pair / emotion classifiers.

    Args:
        feature: per-candidate feature vectors (stored as float32).
        pair: per-candidate pair labels.
        emotion: per-candidate emotion labels.
        identifier: per-candidate identifying indexes.
    """

    def __init__(self,feature,pair,emotion,identifier):
        self.feature=torch.tensor(feature,dtype=torch.float32)
        self.pair,self.emotion,self.identifier=(
            torch.tensor(column) for column in (pair,emotion,identifier)
        )

    def __len__(self):
        """Number of candidates (rows of `feature`)."""
        return len(self.feature)

    def __getitem__(self,idx):
        """Return (feature, pair, emotion, identifier) for row `idx`."""
        columns=(self.feature,self.pair,self.emotion,self.identifier)
        return tuple(column[idx] for column in columns)

In [None]:
BATCH_SIZE=16
print("BatchMaking")


def _flatten_conversations(conversations):
    """Stack the per-conversation candidate arrays into four flat arrays.

    Returns (features, label_pair, label_emotion, indexes). Conversations
    without any candidate are skipped; if none has candidates, four empty
    lists are returned. Arrays are concatenated directly instead of going
    through `.tolist()`, which avoids materialising every feature value as a
    Python float.
    """
    kept=[conv for conv in conversations if len(conv["label_pair"])>0]
    if not kept:
        return [],[],[],[]
    return tuple(
        np.concatenate([conv[key] for conv in kept])
        for key in ("features","label_pair","label_emotion","indexes")
    )


# The flat collections are rebuilt from scratch on every run of this cell, so
# re-running it no longer appends a second copy of the data.
features_train,label_pair_train,label_emotion_train,identifier_train=_flatten_conversations(train_conversations)
train_loader=DataLoader(Samples_Classifier(features_train,label_pair_train,label_emotion_train,identifier_train),batch_size=BATCH_SIZE)

features_val,label_pair_val,label_emotion_val,identifier_val=_flatten_conversations(val_conversations)
val_loader=DataLoader(Samples_Classifier(features_val,label_pair_val,label_emotion_val,identifier_val),batch_size=BATCH_SIZE)

gc.collect()


In [None]:
class Classify_Pair(nn.Module):
    """Feed-forward classifier over one candidate feature vector.

    Four hidden Linear layers (2048, 1024, 512, 256), each followed by
    LeakyReLU, and a final Linear producing `output_size` logits.
    """

    def __init__(self, input_size, output_size=2):
        super().__init__()
        widths = [input_size, 2048, 1024, 512, 256]
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.LeakyReLU()]
        stack.append(nn.Linear(widths[-1], output_size))
        self.layer = nn.Sequential(*stack)

    def forward(self, x):
        """Return the raw logits for `x`."""
        logits = self.layer(x)
        return logits

In [None]:
# 4096 input features: each candidate concatenates an emotion embedding and a
# cause embedding, and each comes from a bidirectional LSTM with hidden size 1024.
model_pair = Classify_Pair(4096).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_pair.parameters(), lr=1e-5)

In [None]:
print("Training_Classify_Pair\n")
NUM_EPOCHS=15


def _pair_epoch(loader,store,train):
    """Run `model_pair` once over `loader` and return the mean batch loss.

    Targets and argmax predictions are appended to `store["true"]` /
    `store["pred"]`. Parameters are updated only when `train` is true. Uses
    the module-level `criterion` and `optimizer` as bound at call time.
    """
    total_loss=0
    batches=0
    # The emotion label and identifier columns are not needed for this task.
    for feature,pair,_,_ in loader:
        feature=feature.to(DEVICE)
        pair=pair.to(DEVICE).view(-1).to(torch.long)

        outputs=model_pair(feature).view(-1, 2).float()

        store["true"]+=pair.cpu().tolist()
        store["pred"]+=torch.argmax(outputs,dim=1).cpu().tolist()

        loss=criterion(outputs,pair)
        total_loss+=loss.item()
        batches+=1

        if train:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    return total_loss/max(batches,1)


for i in range(NUM_EPOCHS):
    labels={split:{"pred":[],"true":[]} for split in ("train","val")}

    model_pair.train()
    train_loss=_pair_epoch(train_loader,labels["train"],train=True)

    model_pair.eval()
    # Validation needs no gradients; skipping the graph saves memory.
    with torch.no_grad():
        val_loss=_pair_epoch(val_loader,labels["val"],train=False)

    print("Epoch->",i)
    print("\tTrain->",train_loss,"\tVal->",val_loss)

    print("\tTrain")
    print(np.unique(labels["train"]["pred"],return_counts=True))
    print(classification_report(labels["train"]["true"],labels["train"]["pred"]))
    print("\tVal")
    print(np.unique(labels["val"]["pred"],return_counts=True))
    print(classification_report(labels["val"]["true"],labels["val"]["pred"]))
    print("\n\n\n")

print("Trained\n")

# The whole module object is pickled (not just its state_dict).
torch.save(model_pair,"model_pair.pt")

In [None]:
class Classify_Emotion(nn.Module):
    """Feed-forward emotion classifier over one candidate feature vector.

    Four hidden Linear layers (2048, 1024, 512, 256), each followed by
    LeakyReLU, and a final Linear producing `output_size` logits (7 by default).
    """

    def __init__(self, input_size, output_size=7):
        super().__init__()
        self.layer = nn.Sequential(
            nn.Linear(input_size, 2048),
            nn.LeakyReLU(),
            nn.Linear(2048, 1024),
            nn.LeakyReLU(),
            nn.Linear(1024, 512),
            nn.LeakyReLU(),
            nn.Linear(512, 256),
            # This activation was missing: without it the last two Linear
            # layers collapse into a single linear map. It also brings the
            # stack in line with Classify_Pair.
            nn.LeakyReLU(),
            nn.Linear(256, output_size)
        )

    def forward(self, x):
        """Return the raw logits for `x`."""
        return self.layer(x)

In [None]:
# Same 4096-wide candidate features as the pair classifier.
# `criterion` and `optimizer` are rebound here, replacing the pair
# classifier's objects of the same name.
model = Classify_Emotion(4096).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [None]:
# Per split: "ID" holds the [conversation ID, target ID, cause ID] rows and
# "emotion" the predicted emotion id of each row. The training cell below
# re-creates this dict at the start of every epoch.
input_span={
    "train":{
        "ID":[],
        "emotion":[],
    },
    "val":{
        "ID":[],
        "emotion":[],
    }
}

In [None]:
print("Training_Classify_Emotion\n")
NUM_EPOCHS=10
NEUTRAL_EMOTION=5   # id of 'neutral' in Data.emotion_map


def _emotion_epoch(loader,store,train):
    """Run the emotion classifier once over `loader`; return the mean batch loss.

    Only candidates that `model_pair` accepts (argmax == 1) are used. Their
    gold emotion, predicted emotion and identifier rows are appended to
    `store["true"]`, `store["pred"]` and `store["identifier"]`. Parameters of
    `model` are updated only when `train` is true. Uses the module-level
    `criterion` and `optimizer` as bound at call time.
    """
    total_loss=0
    batches=0
    for feature,_,emotion,identifier in loader:
        feature=feature.to(DEVICE)
        emotion=emotion.to(DEVICE).view(-1).to(torch.long)
        identifier=identifier.to(DEVICE)

        # model_pair is only a frozen filter here, so no graph is built for it.
        with torch.no_grad():
            accepted=torch.argmax(model_pair(feature).view(-1, 2).float(),dim=1)==1

        # CrossEntropyLoss over zero rows is NaN, which used to poison the
        # running loss; batches without any accepted pair are skipped.
        if not bool(accepted.any()):
            continue

        outputs=model(feature).view(-1, 7).float()[accepted]
        emotion=emotion[accepted]
        identifier=identifier[accepted]

        store["true"]+=emotion.cpu().tolist()
        store["pred"]+=torch.argmax(outputs,dim=1).cpu().tolist()
        store["identifier"]+=identifier.cpu().tolist()

        loss=criterion(outputs,emotion)
        total_loss+=loss.item()
        batches+=1

        if train:
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()
    return total_loss/max(batches,1)


def _collect_non_neutral(store,span):
    """Keep the candidates whose predicted emotion is not neutral.

    Their identifier rows and predicted emotions are appended to `span["ID"]`
    and `span["emotion"]`. Returns (true_labels, predicted_labels) of the kept
    candidates.
    """
    kept_true=[]
    kept_pred=[]
    for true,pred,identity in zip(store["true"],store["pred"],store["identifier"]):
        if pred!=NEUTRAL_EMOTION:
            kept_true.append(true)
            kept_pred.append(pred)
            span["ID"].append(identity)
            span["emotion"].append(pred)
    return kept_true,kept_pred


for epoch in range(NUM_EPOCHS):
    labels={split:{"pred":[],"true":[],"identifier":[]} for split in ("train","val")}

    # Rebuilt every epoch, so after the loop it holds the last epoch's spans.
    input_span={split:{"ID":[],"emotion":[]} for split in ("train","val")}

    model.train()
    train_loss=_emotion_epoch(train_loader,labels["train"],train=True)

    model.eval()
    with torch.no_grad():
        val_loss=_emotion_epoch(val_loader,labels["val"],train=False)

    print("Epoch->",epoch)
    print("\tTrain->",train_loss,"\tVal->",val_loss)

    for title,split in (("\tTrain","train"),("\tVal","val")):
        print(title)
        a,b=_collect_non_neutral(labels[split],input_span[split])
        # classification_report cannot handle empty inputs.
        if a:
            print(classification_report(a,b))
        else:
            print("\t(no non-neutral predictions)")
    print("\n\n\n")

print("Trained\n")

# The whole module object is pickled (not just its state_dict).
torch.save(model,"model_emotion.pt")

In [None]:
# Persist the spans collected during the last epoch of the cell above.
temp=LoadSave("./data/span/input_span.pkl")
temp.save(input_span)