In [23]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.preprocessing import OneHotEncoder
import json
from copy import deepcopy as cpy

from torch.utils.data import Dataset,DataLoader
import torch.nn as nn

from sklearn.metrics import *

import gc

import warnings
warnings.filterwarnings('ignore')

In [24]:
import os
import random

import numpy as np

# Seed handed to seed_all() so every run draws the same random numbers.
RANDOM_SEED = 42

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    
def seed_all(seed):
    """Seed every random number generator this notebook draws from.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    CUDA), exports ``PYTHONHASHSEED`` and requests deterministic cuDNN kernels.

    Args:
        seed: integer seed shared by all generators.
    """
    # NOTE(review): exported at runtime, so this presumably only reaches child
    # processes and not the hashing of the running interpreter; confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True
    
seed_all(RANDOM_SEED)

In [25]:
import pickle

class LoadSave:
    """Pickle-backed persistence for a single object stored at a fixed path.

    NOTE(security): ``pickle.load`` can execute arbitrary code while
    deserialising; only call ``load`` on files this project wrote itself.
    """

    def __init__(self,path):
        """Remember ``path``; no file is touched until ``load`` or ``save``."""
        self.path=path

    def load(self):
        """Return the object unpickled from ``self.path``."""
        # The context manager closes the handle even when unpickling raises.
        with open(self.path, 'rb') as f:
            return pickle.load(f)

    def save(self,here):
        """Pickle ``here`` to ``self.path``, replacing any existing file."""
        with open(self.path, 'wb') as f:
            pickle.dump(here,f)

In [26]:
!pip install sentence_transformers 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [27]:
from sentence_transformers import SentenceTransformer
# Pretrained sentence encoder; Data.processor calls S_Bert.encode on every utterance text.
S_Bert=SentenceTransformer("all-MiniLM-L6-v2")

In [28]:
class Data():
    """Converts raw conversation JSON into fixed-length samples for the models.

    Each processed conversation is a dict with the keys:
      * "sentences": ``S_Bert`` embeddings of the utterance texts,
      * "speakers":  one-hot speaker rows produced by ``speaker_encoder``,
      * "padded":    0 for a real utterance, 1 for a padding slot,
      * "ID":        the conversation_ID converted to ``int``.
    The three per-utterance arrays are padded or cut to ``MAX_SENTENCES`` rows.
    """

    # Dataset locations read by the loaders below.
    TRAIN_PATH="/kaggle/input/project-dataset/data/train.json"
    VAL_PATH="/kaggle/input/project-dataset/data/test.json"
    TEST_PATH="/kaggle/input/project-dataset/data/original/test.json"

    def __init__(self,load=True):
        """Fit the speaker vocabulary on the training file.

        Args:
            load: when truthy the constructor returns immediately and sets
                nothing but ``self.load``; ``processor`` cannot be used on such
                an instance because the constants and encoder are missing.
        """
        self.load=load
        if(load):
            return

        data=self._read_json(self.TRAIN_PATH)

        self.MAX_SENTENCES=35
        self.UNKNOWN_SPEAKER="unknown"
        # Never fitted into the encoder: handle_unknown='ignore' therefore
        # encodes padding slots as an all-zero speaker row.
        self.PADDED_SPEAKER="padded"
        self.UNKNOWN_EMOTION="undefined"
        self.UNKNOWN_CAUSE=2
        self.emotion_map={
            self.UNKNOWN_EMOTION:-1,
            'disgust':0,
            'joy':1,
            'surprise':2,
            'anger':3,
            'fear':4,
            'neutral':5,
            'sadness':6
        }

        # Every speaker seen in training, plus a bucket for unseen speakers.
        self.speakers={sent["speaker"] for conv in data for sent in conv["conversation"]}
        self.speakers.add(self.UNKNOWN_SPEAKER)

        try:
            # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
            # the old `sparse` keyword was removed in 1.4.
            self.speaker_encoder=OneHotEncoder(sparse_output=False,handle_unknown='ignore')
        except TypeError:
            # Older scikit-learn releases only know `sparse`.
            self.speaker_encoder=OneHotEncoder(sparse=False,handle_unknown='ignore')

        self.speaker_encoder.fit([[label] for label in sorted(self.speakers)])

    @staticmethod
    def _read_json(path):
        """Return the parsed content of the JSON file at ``path``."""
        with open(path,"r") as f:
            return json.load(f)

    def processor(self,data):
        """Encode conversations into padded, fixed-length samples.

        Args:
            data: iterable of dicts holding "conversation_ID" and
                "conversation" (a list of dicts with "text" and "speaker").

        Returns:
            A list with one dict per conversation (see the class docstring).
        """
        final_data=[]

        for conv in data:
            utterances=conv["conversation"]

            sentences=[sent["text"] for sent in utterances]
            # Speakers that were not seen while fitting share the UNKNOWN bucket.
            speakers=[
                [sent["speaker"]] if sent["speaker"] in self.speakers else [self.UNKNOWN_SPEAKER]
                for sent in utterances
            ]
            padded=[0]*len(utterances)

            # Fill short conversations up to MAX_SENTENCES with blank utterances.
            missing=max(0,self.MAX_SENTENCES-len(utterances))
            sentences+=[" "]*missing
            speakers+=[[self.PADDED_SPEAKER] for _ in range(missing)]
            padded+=[1]*missing

            speaker_rows=self.speaker_encoder.transform(speakers)
            sentence_rows=S_Bert.encode(sentences,show_progress_bar=False)

            # Slicing cuts conversations longer than MAX_SENTENCES. Each dict is
            # built from fresh objects, so no defensive deep copy is needed.
            final_data.append({
                "sentences":sentence_rows[:self.MAX_SENTENCES],
                "speakers":speaker_rows[:self.MAX_SENTENCES],
                "padded":np.array(padded[:self.MAX_SENTENCES]),
                "ID":int(conv["conversation_ID"]),
            })

        return final_data

    def train_data(self):
        """Return the processed conversations of ``TRAIN_PATH``."""
        return self.processor(self._read_json(self.TRAIN_PATH))

    def val_data(self):
        """Return the processed conversations of ``VAL_PATH``."""
        return self.processor(self._read_json(self.VAL_PATH))

    def test_data(self):
        """Return the processed conversations of ``TEST_PATH``."""
        return self.processor(self._read_json(self.TEST_PATH))


In [29]:
# load=False makes the constructor read train.json and fit the speaker one-hot
# encoder; with load=True it returns early and sets up neither.
data=Data(False)

In [30]:
# Alternative split (reads data/test.json instead of data/original/test.json):
# val=data.val_data()

# Despite the name, `val` holds the processed conversations from
# data/original/test.json; they are the ones predicted in the cells below.
val=data.test_data()

In [31]:
class Samples(Dataset):
    """Thin ``Dataset`` view over an already-processed, indexable collection."""

    def __init__(self, data):
        # Stored by reference; items are handed out exactly as they were given.
        self.data = data

    def __getitem__(self, idx):
        """Return the stored item at position ``idx``."""
        sample = self.data[idx]
        return sample

    def __len__(self):
        """Number of stored items."""
        return len(self.data)

In [32]:
class Emotion_Rep(nn.Module):
    """Bidirectional LSTM tagger that scores every sequence step with 2 logits.

    Both entry points take two tensors that are concatenated along ``dim=2``
    (the notebook passes utterance embeddings and speaker encodings), so their
    last dimensions must add up to the LSTM input size of 384+283.
    """

    def __init__(self):
        super().__init__()
        hidden = 1024
        self.lstm = nn.LSTM(
            input_size=384 + 283,
            hidden_size=hidden,
            batch_first=True,
            bidirectional=True,
        )
        # A bidirectional LSTM emits 2*hidden features per step.
        self.layer = nn.Sequential(
            nn.Linear(2 * hidden, 512),
            nn.LeakyReLU(),
            nn.Linear(512, 128),
            nn.LeakyReLU(),
            nn.Linear(128, 2),
        )

    def embedding(self, x, x1):
        """Return the per-step LSTM outputs for the concatenated inputs."""
        joined = torch.cat((x, x1), dim=2)
        hidden_states, _ = self.lstm(joined)
        return hidden_states

    def forward(self, x, x1):
        """Return the 2-way logits computed from the per-step LSTM outputs."""
        return self.layer(self.embedding(x, x1))
    
class Cause_Rep(nn.Module):
    """Bidirectional LSTM tagger that scores every sequence step with 2 logits.

    Both entry points take two tensors that are concatenated along ``dim=2``
    (the notebook passes utterance embeddings and speaker encodings), so their
    last dimensions must add up to the LSTM input size of 384+283.
    """

    def __init__(self):
        super().__init__()
        hidden = 1024
        self.lstm = nn.LSTM(
            input_size=384 + 283,
            hidden_size=hidden,
            batch_first=True,
            bidirectional=True,
        )
        # A bidirectional LSTM emits 2*hidden features per step.
        self.layer = nn.Sequential(
            nn.Linear(2 * hidden, 512),
            nn.LeakyReLU(),
            nn.Linear(512, 128),
            nn.LeakyReLU(),
            nn.Linear(128, 2),
        )

    def embedding(self, x, x1):
        """Return the per-step LSTM outputs for the concatenated inputs."""
        joined = torch.cat((x, x1), dim=2)
        hidden_states, _ = self.lstm(joined)
        return hidden_states

    def forward(self, x, x1):
        """Return the 2-way logits computed from the per-step LSTM outputs."""
        return self.layer(self.embedding(x, x1))

In [33]:
# The checkpoints are loaded and used directly as modules, so no fresh
# Emotion_Rep()/Cause_Rep() instance is built first: it would be discarded at once.
# weights_only=False is spelled out because newer PyTorch releases default to
# weights-only loading, which rejects full pickled modules.
# NOTE(security): unpickling can execute arbitrary code; only load trusted files.
with open("/kaggle/input/project-dataset/models/Emotion_Rep.pt","rb") as f:
    emotion_model=torch.load(f,map_location=DEVICE,weights_only=False)

with open("/kaggle/input/project-dataset/models/Cause_Rep.pt","rb") as f:
    cause_model=torch.load(f,map_location=DEVICE,weights_only=False)



In [34]:
# batch_size=1: each batch is one conversation, so the embedding loop can drop
# that conversation's padded rows with its own mask.
val_embeddings_loader=DataLoader(Samples(val),batch_size=1)

In [35]:
# One entry per conversation: its ID, the per-utterance LSTM outputs of both
# taggers and their 0/1 predictions (padding rows removed).
val_embeddings=[]

print("Generating embeddings\n")

emotion_model.eval()
cause_model.eval()

# Inference only: no_grad avoids building autograd graphs for every batch.
with torch.no_grad():
    for batch in val_embeddings_loader:
        inp=batch["sentences"].to(DEVICE)
        spk=batch["speakers"].to(torch.long).to(DEVICE)
        # "padded" is 1 on padding slots; keep only the real utterances.
        mask=batch["padded"].to(DEVICE)!=1

        # Boolean indexing drops the batch axis, so add it back.
        inp=torch.unsqueeze(inp[mask],0)
        spk=torch.unsqueeze(spk[mask],0)

        # Run each LSTM once and reuse its output for the logits;
        # model(inp,spk) is layer(embedding(inp,spk)).
        emotion_states=emotion_model.embedding(inp,spk)
        cause_states=cause_model.embedding(inp,spk)

        outputs_emotion=emotion_model.layer(emotion_states).view(-1, 2).float()
        outputs_cause=cause_model.layer(cause_states).view(-1, 2).float()

        val_embeddings.append({
            "ID":int(batch["ID"][0].item()),
            "emotion_embedding":emotion_states.squeeze(dim=0).cpu().numpy(),
            "cause_embedding":cause_states.squeeze(dim=0).cpu().numpy(),
            "emotion_pred":torch.argmax(outputs_emotion,dim=1).cpu().numpy().tolist(),
            "cause_pred":torch.argmax(outputs_cause,dim=1).cpu().numpy().tolist(),
        })

print("\tGenerated Val")



Generating embeddings

	Generated Val


In [36]:
# Per conversation: every (predicted emotion utterance, predicted cause utterance)
# combination, as a concatenated feature row plus [ID, emotion idx, cause idx]
# where both utterance indexes are 1-based.
val_conversations=[]

for conv in val_embeddings:
    emotion_hits=np.array(conv["emotion_pred"])==1
    cause_hits=np.array(conv["cause_pred"])==1

    # Pair each selected embedding row with its 0-based utterance position.
    emotion_rows=list(zip(conv["emotion_embedding"][emotion_hits],np.flatnonzero(emotion_hits)))
    cause_rows=list(zip(conv["cause_embedding"][cause_hits],np.flatnonzero(cause_hits)))

    pair_features=[
        np.concatenate([em_vec,ca_vec])
        for em_vec,_ in emotion_rows
        for ca_vec,_ in cause_rows
    ]
    pair_indexes=[
        [conv["ID"],em_pos+1,ca_pos+1]
        for _,em_pos in emotion_rows
        for _,ca_pos in cause_rows
    ]

    val_conversations.append({
        "features":np.array(pair_features),
        "indexes":np.array(pair_indexes),
    })
        
    

In [37]:
# Flat accumulators for the pair classifier: feature rows and the matching
# [conversation ID, emotion idx, cause idx] identifier rows.
# Only the *_val lists are filled in the cells visible here.
features_train=[]
identifier_train=[]

features_val=[]
identifier_val=[]

In [38]:
class Samples_Classifier(Dataset):
    """Dataset of (feature row, identifier row) tensor pairs."""

    def __init__(self, feature, identifier):
        # torch.tensor infers the dtype from the data; torch.Tensor builds a
        # tensor of the default floating dtype.
        self.identifier = torch.tensor(identifier)
        self.feature = torch.Tensor(feature)

    def __getitem__(self, idx):
        """Return the ``idx``-th feature row with its identifier row."""
        row = self.feature[idx]
        tag = self.identifier[idx]
        return row, tag

    def __len__(self):
        """Number of feature rows."""
        return len(self.feature)

In [39]:
BATCH_SIZE=128
print("BatchMaking")

# Rebuild the flat lists from scratch so that re-running this cell does not
# append every candidate pair a second time.
features_val=[]
identifier_val=[]

for conversation in val_conversations:
    features_val.extend(conversation["features"].tolist())
    identifier_val.extend(conversation["indexes"].tolist())

val_loader=DataLoader(Samples_Classifier(features_val,identifier_val),batch_size=BATCH_SIZE)



BatchMaking


In [40]:
class Classify_Pair(nn.Module):
    """Feed-forward classifier over a pair feature vector.

    Four Linear+LeakyReLU stages (2048, 1024, 512, 256 units) followed by a
    Linear projection to ``output_size`` logits.
    """

    def __init__(self, input_size, output_size=2):
        super().__init__()
        widths = (input_size, 2048, 1024, 512, 256)
        stack = []
        # Layers are created in the same order as a hand-written Sequential,
        # so positions inside `self.layer` stay 0..8.
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.LeakyReLU())
        stack.append(nn.Linear(widths[-1], output_size))
        self.layer = nn.Sequential(*stack)

    def forward(self, x):
        """Return the logits for ``x``."""
        logits = self.layer(x)
        return logits

In [41]:
# The checkpoint is loaded and used directly as a module, so no fresh
# Classify_Pair(4096) instance is built first: it would be discarded at once.
# weights_only=False is spelled out because newer PyTorch releases default to
# weights-only loading, which rejects full pickled modules.
# NOTE(security): unpickling can execute arbitrary code; only load trusted files.
with open("/kaggle/input/project-dataset/models/model_pair.pt","rb") as f:
    model_pair=torch.load(f,map_location=DEVICE,weights_only=False)


In [42]:
class Classify_Emotion(nn.Module):
    """Feed-forward classifier producing ``output_size`` emotion logits.

    Three Linear+LeakyReLU stages (2048, 1024, 512 units), then Linear(512, 256)
    and Linear(256, output_size).
    """

    def __init__(self, input_size, output_size=7):
        super().__init__()
        modules = []
        fan_in = input_size
        for fan_out in (2048, 1024, 512):
            modules += [nn.Linear(fan_in, fan_out), nn.LeakyReLU()]
            fan_in = fan_out
        # The last two Linear layers follow each other with no activation in
        # between; the layout is reproduced exactly.
        modules += [nn.Linear(fan_in, 256), nn.Linear(256, output_size)]
        self.layer = nn.Sequential(*modules)

    def forward(self, x):
        """Return the logits for ``x``."""
        logits = self.layer(x)
        return logits

In [43]:
# The checkpoint is loaded and used directly as a module, so no fresh
# Classify_Emotion(4096) instance is built first: it would be discarded at once.
# weights_only=False is spelled out because newer PyTorch releases default to
# weights-only loading, which rejects full pickled modules.
# NOTE(security): unpickling can execute arbitrary code; only load trusted files.
with open("/kaggle/input/project-dataset/models/model_emotion.pt","rb") as f:
    model=torch.load(f,map_location=DEVICE,weights_only=False)


In [44]:
print("Classifing_Emotion\n")
NUM_EPOCHS=10  # not used by this inference cell
epsilon = 1e-6
NEUTRAL_ID=5  # class id of 'neutral' in the emotion label mapping

# Predictions for every candidate pair accepted by the pair classifier.
labels={
    "data":{
        "pred":[],
        "identifier":[]
    }
}

# Accepted pairs whose predicted emotion is not neutral: "ID" holds the
# [conversation ID, emotion idx, cause idx] rows and "emotion" the class ids.
input_span={
    "data":{
        "ID":[],
        "emotion":[],
    }
}


model_pair.eval()
model.eval()

# Inference only: no_grad avoids building autograd graphs for every batch.
with torch.no_grad():
    for feature,identifier in val_loader:

        feature=feature.to(DEVICE)
        identifier=identifier.to(DEVICE)

        # Keep only the candidates the pair classifier labels as class 1.
        pair_logits=model_pair(feature).view(-1, 2).float()
        mask=torch.argmax(pair_logits,dim=1)==1

        outputs=model(feature).add(epsilon).view(-1, 7).float()

        outputs=outputs[mask]
        identifier=identifier[mask]

        labels["data"]["pred"]+=torch.argmax(outputs,dim=1).cpu().numpy().tolist()
        labels["data"]["identifier"]+=identifier.cpu().numpy().tolist()

for pred,identity in zip(labels["data"]["pred"],labels["data"]["identifier"]):
    if pred!=NEUTRAL_ID:
        input_span["data"]["ID"].append(identity)
        input_span["data"]["emotion"].append(pred)



Classifing_Emotion



In [45]:
# Persist the non-neutral predicted pairs to ./input_span.pkl (pickle format).
temp=LoadSave("./input_span.pkl")
temp.save(input_span)

In [46]:
# Number of pairs kept for the span-extraction step.
print(len(input_span['data']["ID"]))

1655


In [47]:
from transformers import BertTokenizer, BertModel
from transformers import T5ForConditionalGeneration, T5Tokenizer

from copy import deepcopy as copy
from sklearn.metrics import classification_report,f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer


from transformers import pipeline

2024-04-25 17:48:09.564709: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-25 17:48:09.564830: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-25 17:48:09.662052: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [48]:
# NOTE: rebinds `data`, which earlier held the Data preprocessing object,
# to the parsed JSON of the original training conversations.
with open('/kaggle/input/project-dataset/data/original/train.json','r') as f:
    data=json.load(f)

In [49]:
# Lookup table: conversation_ID -> that conversation's list of utterances.
data_based_on_conv_id={}

In [50]:
# Pre-populate IDs 1..2500 with empty lists.
# NOTE(review): 2500 is hard-coded; presumably an upper bound on the
# conversation IDs in the dataset. Confirm.
for i in range(2500):
    data_based_on_conv_id[i+1]=[]

In [51]:
# Store each training conversation under its conversation_ID.
for i in data:
    data_based_on_conv_id[i['conversation_ID']]=i['conversation']

In [52]:
# Parsed JSON of the original test conversations.
with open('/kaggle/input/project-dataset/data/original/test.json','r') as f:
    test_data=json.load(f)

In [53]:
# Add the test conversations to the same lookup; an ID that is already
# present is overwritten.
for i in test_data:
    data_based_on_conv_id[i['conversation_ID']]=i['conversation']

In [54]:
# conversation_ID -> list of ["<emotion utterance idx>_<emotion>",
# "<cause utterance idx>_<answer text>"] pairs, filled by the QA loop.
emotion_cause_pair={}

In [55]:
# Pre-populate IDs 1..2500 with empty pair lists.
# NOTE(review): 2500 is hard-coded; confirm it covers every conversation ID.
for i in range(2500):
    emotion_cause_pair[i+1]=[]

In [56]:
# Emotion label -> class id, in class-id order.
emotions=dict(zip(
    ('disgust','joy','surprise','anger','fear','neutral','sadness'),
    range(7),
))

# Inverse mapping: class id -> emotion label.
emotions_decode={class_id:label for label,class_id in emotions.items()}

In [57]:
# Number of pairs queued for span extraction.
len(input_span['data']['ID'])

1655

In [58]:
# conversation_ID -> list of ["<emotion utterance idx>_<emotion>",
# "<cause utterance idx>_<start>_<end>"] pairs, where start/end are the answer
# offsets returned by the QA pipeline.
emotion_cause_pair_2={}

In [59]:
# Pre-populate IDs 1..2500 with empty pair lists.
# NOTE(review): 2500 is hard-coded; confirm it covers every conversation ID.
for i in range(2500):
    emotion_cause_pair_2[i+1]=[]

In [60]:
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
# NOTE: rebinds `model` (previously the emotion classifier) to the QA network.
model = AutoModelForQuestionAnswering.from_pretrained(model_name).to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pass the device explicitly: depending on the transformers version, a pipeline
# created without `device` may fall back to the CPU and move the model there.
nlp = pipeline("question-answering",model=model, tokenizer=tokenizer, device=DEVICE)

total_pairs=len(input_span['data']['ID'])

for info in range(total_pairs):
    # Each identifier row is [conversation ID, emotion utterance idx, cause
    # utterance idx]; both utterance indexes are 1-based.
    conv_id,target_sent_idx,cause_sent_idx=input_span['data']['ID'][info]
    emo=input_span['data']['emotion'][info]

    utterances=data_based_on_conv_id[conv_id]
    cause=utterances[cause_sent_idx-1]['text']
    target=utterances[target_sent_idx-1]['text']

    # The cause utterance is the QA context; the question names the emotion
    # utterance and its predicted emotion.
    context=cause
    question="which part of the context is cause for the utterance '"+target+"' with an "+emotions_decode[emo]+" emotion?"

    answer = nlp(question=question, context=context)

    start_idx=answer['start']
    end_idx=answer['end']
    ans=answer['answer']

    pair1=str(target_sent_idx)+"_"+emotions_decode[emo]
    pair2=str(cause_sent_idx)+"_"+ans
    pair2_2=str(cause_sent_idx)+"_"+str(start_idx)+"_"+str(end_idx)
    # setdefault keeps this working for conversation IDs that were not
    # pre-populated in the result dictionaries.
    emotion_cause_pair.setdefault(conv_id,[]).append([pair1,pair2])
    emotion_cause_pair_2.setdefault(conv_id,[]).append([pair1,pair2_2])

    print("Remaining: ",total_pairs-info,"\t\r",end="")
    

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Remaining:  1 				

In [61]:
import json


# Re-read the untouched test file so the predictions are attached to a clean copy.
with open('/kaggle/input/project-dataset/data/original/test.json','r') as f:
    test_data=json.load(f)

for conversation in test_data:
    # Conversations without any predicted pair get an empty list, including
    # IDs that were never pre-populated in emotion_cause_pair_2.
    conversation['emotion-cause_pairs']=emotion_cause_pair_2.get(conversation['conversation_ID'],[])

# The context manager flushes and closes the file once the dump is complete.
with open("Subtask_1_pred.json", 'w') as f:
    json.dump(test_data, f)