In [None]:
'''
Citation:

[1] NLTK. [Online]. Available: https://www.nltk.org/. [Accessed: 28-Jul-2022]. 
[2] “Pandas,” pandas. [Online]. Available: https://pandas.pydata.org/.
[3] “SKLearn,” scikit-learn. [Online]. Available: https://scikit-learn.org/stable/.
[4] “WordNet,” Princeton University. [Online]. Available: https://wordnet.princeton.edu/.
[5] V. PRASANNA KUMAR and T. Patro, “Bert model with 0.845 accuracy,” Kaggle, 23-Aug-2020. [Online]. Available: https://www.kaggle.com/code/vpkprasanna/bert-model-with-0-845-accuracy/notebook.
'''

In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import ast
from sklearn.model_selection import train_test_split
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet

In [None]:
# Load the preprocessed data that the augmentation below works on.
# The first CSV column is used as the DataFrame index.
data_df = pd.read_csv('../NER/train.csv', index_col=0)

# 'Remedies' comes back from the CSV as text; parse each cell into the
# Python literal it encodes.
data_df['Remedies'] = data_df['Remedies'].apply(ast.literal_eval)

In [None]:
# Label binarization: fit a MultiLabelBinarizer on the 'Remedies' column and
# turn every row into one multi-hot vector (one slot per class in
# multilabel.classes_).
multilabel = MultiLabelBinarizer()
d = multilabel.fit_transform(data_df['Remedies'])

# Store each row's full multi-hot vector as a single list in a 'Labels' column.
# The frame is built on data_df's own index: data_df was read with index_col=0,
# so its index is whatever the CSV contained, and pd.concat(axis=1) aligns on
# index labels. A default 0..n-1 index here would misalign the labels (and pad
# both sides with NaN) whenever the CSV index is not exactly 0..n-1.
label_df = pd.DataFrame({'Labels': d.tolist()}, index=data_df.index)

# Attach the label vectors to the main DataFrame.
data_df = pd.concat([data_df, label_df], axis=1)


In [None]:
# Number of distinct remedy values across all rows.
len({remedy for remedies in data_df['Remedies'] for remedy in remedies})

In [None]:
# Split the data into Train and Test sets: 10% of the rows are held out for
# Test, shuffled with a fixed seed so the split is repeatable.
train_df, test_df = train_test_split(
    data_df,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

In [None]:
# Data Augmentation part (Synonymical) for Train Data.
#
# For every original train row, each ', '-separated term of 'PSE' is looked up
# in WordNet. For every lemma that differs from the term, a copy of the row is
# added in which the term is replaced by that lemma inside 'PSE'.
print('Length before Data Augmentation of Train Data:',len(train_df))

# Rows are addressed strictly by position (.iloc). After train_test_split the
# index still carries the shuffled labels of data_df, so a label lookup such as
# train_df['PSE'][s] would read a different row than .iloc[s] or raise KeyError.
# New rows are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and growing a frame row by row is quadratic.
augmented_rows = []
for s in range(len(train_df)):
    pse_text = train_df['PSE'].iloc[s]

    # (term, lemma) pairs for every WordNet lemma that differs from the term.
    synonyms = []
    for term in pse_text.split(', '):
        for synset in wordnet.synsets(term):
            for lemma in synset.lemmas():
                if term != lemma.name():
                    synonyms.append([term, lemma.name()])

    for term, lemma_name in synonyms:
        # NOTE(review): str.replace substitutes every occurrence of the term in
        # the whole PSE string, including inside longer terms. This is kept as
        # in the original; confirm that is intended.
        replaced_terms = pse_text.replace(str(term), str(lemma_name)).split(', ')
        augmented_rows.append({
            'Comments': train_df['Comments'].iloc[s],
            'DateTime': train_df['DateTime'].iloc[s],
            'Components': train_df['Components'].iloc[s],
            # De-duplicate the terms. sorted() makes the order deterministic;
            # a bare set() order depends on string hashing and changes between
            # kernel runs.
            'PSE': ', '.join(sorted(set(replaced_terms))),
            'PosTag_Remedies': train_df['PosTag_Remedies'].iloc[s],
            'Remedies': train_df['Remedies'].iloc[s],
            'Labels': train_df['Labels'].iloc[s],
        })

if augmented_rows:
    train_df = pd.concat([train_df, pd.DataFrame(augmented_rows)], ignore_index=True)

# Normalization, concatenating the two input columns into one (as CONTEXT) for Train Data.
# CONTEXT is moved to column position 6; the later counting cells treat
# columns[7:] as the label columns.
train_df['CONTEXT'] = train_df['Components']+', '+train_df['PSE']
temp_clm = train_df.pop("CONTEXT")
train_df.insert(6, "CONTEXT", temp_clm)

# Exploding the Labels column for the Train Dataset: one column per class in
# multilabel.classes_, filled from the per-row label vectors.
train_df[list(multilabel.classes_)] = pd.DataFrame(train_df.Labels.tolist(),index=train_df.index)
train_df = train_df.drop(['Labels'],axis=1)

# Stringify 'Remedies' before de-duplicating; presumably the cells hold parsed
# lists, which drop_duplicates cannot hash.
train_df['Remedies'] = train_df['Remedies'].astype(str)

# Removing duplicates; ignore_index=True already renumbers the rows 0..n-1.
train_df = train_df.drop_duplicates(ignore_index=True)

print('Length after Data Augmentation of Train Data:',len(train_df))

In [None]:
# Test Data: build the combined CONTEXT input ("<Components>, <PSE>") and move
# it to column position 6.
context_values = test_df['Components'] + ', ' + test_df['PSE']
test_df['CONTEXT'] = context_values
temp_clm = test_df.pop("CONTEXT")
test_df.insert(6, "CONTEXT", temp_clm)

# Expand the per-row label vectors into one column per class in
# multilabel.classes_, then drop the packed 'Labels' column.
label_matrix = pd.DataFrame(test_df['Labels'].tolist(), index=test_df.index)
test_df[list(multilabel.classes_)] = label_matrix
test_df = test_df.drop(columns=['Labels'])
test_df = test_df.reset_index(drop=True)

# 'Remedies' is converted to its string form before duplicates are removed.
test_df['Remedies'] = test_df['Remedies'].astype(str)

# Remove duplicate rows and renumber the remaining ones.
test_df = test_df.drop_duplicates(ignore_index=True)

In [None]:
# Total of each output label in the Train set (columns from position 7 on).
# Labels whose total is exactly 1 are left out of the report.
label_totals = [(name, sum(train_df[name])) for name in train_df.columns[7:]]
lab_val = [[name, total] for name, total in label_totals if total != 1]

# Write the per-label totals to disk.
labl_train_cnt_df = pd.DataFrame(lab_val, columns=['Label Name', 'Label Count'])
labl_train_cnt_df.to_csv('Train_Count.csv')

In [None]:
# Total of each output label in the Test set (columns from position 7 on),
# keyed by label name.
count_ls = {name: sum(test_df[name]) for name in test_df.columns[7:]}
count_ls

In [None]:
# Discard the current index of both frames and renumber their rows 0..n-1.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)


In [None]:
# Save the Train and Test datasets as CSV files for further model training.
for frame, out_path in ((train_df, 'data/Train.csv'), (test_df, 'data/Test.csv')):
    frame.to_csv(out_path)