In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import math
from collections import Counter
from tqdm import tqdm as tqdm
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
#from sklearn import datasets

In [2]:
D_intent = pd.read_json('MSDialog-Intent.json')

In [3]:
dialogs = []
labels = []
actors = []
for d in D_intent.loc['utterances']:
    dialogs.append([])
    labels.append([])
    actors.append([])
    for utter in d:
        dialogs[-1].append(utter['utterance'])
        labels[-1].append(utter['tags'])
        actors[-1].append(utter['actor_type'])

In [4]:
def split(dia, labs):
    utter = []
    lab = []
    for i in range(len(dia)):
        utter += dia[i]
        lab += labs[i]
    return utter, lab

In [5]:
utter, lab = split(dialogs, labels)

In [6]:
###

In [7]:
# utter_without_sw = []
# for u in tqdm(utter):
#     text_tokens = word_tokenize(u)
#     text_without_sw = ' '.join([word for word in text_tokens if not word in stopwords.words()])
#     utter_without_sw.append(text_without_sw)

# df_utter_without_sw = pd.DataFrame(utter_without_sw)
# df_utter_without_sw.to_csv(r'utter_without_sw.csv', index=False)

In [8]:
utter_without_sw = pd.read_csv(r'utter_without_sw.csv')
utter_without_sw = utter_without_sw.fillna('')
utter_without_sw = utter_without_sw.squeeze().values.tolist()

In [9]:
utter_without_sw_stem = []
ps = PorterStemmer() 
for u in tqdm(utter_without_sw):
    text_tokens = word_tokenize(u)
    text_without_sw_stem = ' '.join([ps.stem(w) for w in text_tokens])
    utter_without_sw_stem.append(text_without_sw_stem)

100%|███████████████████████████████████████████████████████████████████████████| 10020/10020 [00:11<00:00, 902.57it/s]


In [10]:
is_user = []
for a in actors:
    is_user += a
for i, u in enumerate(is_user):
    if u == 'User':
        is_user[i] = True
    else:
        is_user[i] = False

In [11]:
###

In [12]:
for i in range(len(lab)):
    ls = lab[i].split()
    if 'GG' in ls and len(ls)>1:
        ls.remove('GG')
        lab[i] = ' '.join(ls)
    if 'JK' in ls and len(ls)>1:
        ls.remove('JK')
        lab[i] = ' '.join(ls)
    if 'O' in ls and len(ls)>1:
        ls.remove('O')
        lab[i] = ' '.join(ls)
    lab[i] = lab[i].strip()

In [13]:
import collections
lab_freq = dict(collections.Counter(lab))

In [14]:
label_list = ['O', 'CQ']
for l in sorted(lab_freq.items(), key = lambda kv:(kv[1], kv[0]), reverse=True)[:30]:
    label_list.append(l[0])

In [15]:
def preprocess(lab):
    new_lab = np.zeros([len(lab), len(label_list)])
    for i, l in enumerate(lab):
        for j, l_ in enumerate(label_list):
            if l_ in l:
                new_lab[i,j] = 1
    return new_lab

In [16]:
lab = preprocess(lab)#labels of all data

In [17]:
label_list

['O',
 'CQ',
 'PA',
 'OQ',
 'PF',
 'PA FD',
 'FD',
 'GG',
 'IR PA',
 'FD NF',
 'FD OQ',
 'IR',
 'RQ',
 'PA IR',
 'PF FD',
 'NF',
 'FQ FD',
 'CQ FD',
 'FQ',
 'PA PF',
 'FD RQ',
 'CQ IR',
 'JK',
 'FQ RQ',
 'FD PF',
 'CQ PA',
 'FQ IR',
 'FQ CQ',
 'IR FD',
 'CQ IR PA',
 'RQ OQ',
 'IR OQ']

In [18]:
def text_to_vector(text):
    word = re.compile(r'\w+')
    words = word.findall(text)
    return Counter(words)

In [19]:
def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

# Content features

In [20]:
# Initial Utterance Similarity
initial_utter_similarity = []
pos = []
count = 1

for i, dialog in enumerate(dialogs):
    vec1 = text_to_vector(dialog[0])
    count = 1
    for u in dialog:
        pos.append(count)
        count += 1
        vec2 = text_to_vector(u)
        initial_utter_similarity.append(get_cosine(vec1, vec2))
        
data_1 = {'utterance': utter, 'utterance_pos': pos, 'initial_consine_similarity': initial_utter_similarity}
df_1 = pd.DataFrame(data_1)
df_1.to_csv(r'CONTENT_initial_utter_similarity.csv', index=False)
df_1.head()

Unnamed: 0,utterance,utterance_pos,initial_consine_similarity
0,"hi joydeep sir getting error on event viewer ,...",1,1.0
1,Windows Content Delivery Manager would relate ...,2,0.069993
2,hi joydeep sir my issues resolved.thnks a lot.,3,0.147442
3,I had a couple of problems and contacted Micro...,1,1.0
4,Thank you for posting in Microsoft Community. ...,2,0.364363


In [21]:
# Dialog Similarity
dialog_similarity = []

for i,dialog in enumerate(dialogs):
    vec1 = text_to_vector(''.join(dialog))
    for u in dialog:
        vec2 = text_to_vector(u)
        dialog_similarity.append(get_cosine(vec1, vec2))
        
data_2 = {'utterance': utter, 'consine_similarity': dialog_similarity}
df_2 = pd.DataFrame(data_2)
df_2.to_csv(r'CONTENT_dialog_similarity.csv', index=False)
df_2.head()

Unnamed: 0,utterance,consine_similarity
0,"hi joydeep sir getting error on event viewer ,...",0.655034
1,Windows Content Delivery Manager would relate ...,0.750313
2,hi joydeep sir my issues resolved.thnks a lot.,0.313264
3,I had a couple of problems and contacted Micro...,0.555694
4,Thank you for posting in Microsoft Community. ...,0.946031


In [22]:
# Question Mark
question_mark = []

for i in range(len(utter)):
    has_question_mark = '?' in utter[i]
    question_mark.append(has_question_mark)

data_3 = {'utterance': utter, 'question_mark': question_mark}
df_3 = pd.DataFrame(data_3)
df_3.to_csv(r'CONTENT_question_mark.csv', index=False)
df_3.head()

Unnamed: 0,utterance,question_mark
0,"hi joydeep sir getting error on event viewer ,...",False
1,Windows Content Delivery Manager would relate ...,False
2,hi joydeep sir my issues resolved.thnks a lot.,False
3,I had a couple of problems and contacted Micro...,False
4,Thank you for posting in Microsoft Community. ...,False


In [23]:
# Duplicate
duplicate = []

for i in range(len(utter)):
    has_duplicate = 'same' in utter[i].lower() or 'similar' in utter[i].lower()
    duplicate.append(has_duplicate)
    
data_4 = {'utterance': utter, 'duplicate': duplicate}
df_4 = pd.DataFrame(data_4)
df_4.to_csv(r'CONTENT_duplicate.csv', index=False)
df_4.head()

Unnamed: 0,utterance,duplicate
0,"hi joydeep sir getting error on event viewer ,...",False
1,Windows Content Delivery Manager would relate ...,False
2,hi joydeep sir my issues resolved.thnks a lot.,False
3,I had a couple of problems and contacted Micro...,False
4,Thank you for posting in Microsoft Community. ...,False


In [60]:
# 5W1H
w_h = np.zeros([len(utter), 6])

for i in range(len(utter)):
    one_hot = [0, 0, 0, 0, 0, 0]
    if 'what' in utter[i]:
        one_hot[0] = 1
    if 'where' in utter[i]:
        one_hot[1] = 1
    if 'when' in utter[i]:
        one_hot[2] = 1
    if 'why' in utter[i]:
        one_hot[3] = 1
    if 'who' in utter[i]:
        one_hot[4] = 1
    if 'how' in utter[i]:
        one_hot[5] = 1
    w_h[i] = one_hot
    
data_5 = {'utterance': utter, 'what': w_h[:,0], 'where': w_h[:,1], 'when': w_h[:,2], 'why': w_h[:,3], 'who': w_h[:,4], 'how': w_h[:,5]}
df_5 = pd.DataFrame(data_5)
df_5.to_csv(r'CONTENT_5W_1H.csv', index=False)
df_5.head()

Unnamed: 0,utterance,what,where,when,why,who,how
0,"hi joydeep sir getting error on event viewer ,...",0.0,0.0,0.0,0.0,0.0,0.0
1,Windows Content Delivery Manager would relate ...,0.0,0.0,0.0,0.0,0.0,0.0
2,hi joydeep sir my issues resolved.thnks a lot.,0.0,0.0,0.0,0.0,0.0,0.0
3,I had a couple of problems and contacted Micro...,0.0,1.0,1.0,0.0,1.0,0.0
4,Thank you for posting in Microsoft Community. ...,0.0,1.0,0.0,0.0,0.0,0.0


# Structual features

In [25]:
# Absolute Position
abs_pos = []
count = 1

for i, dialog in enumerate(dialogs):
    count = 1
    for u in dialog:
        abs_pos.append(count)
        count += 1
        
data_6 = {'utterance': utter, 'absolute_pos': abs_pos}
df_6 = pd.DataFrame(data_6)
df_6.to_csv(r'STRUC_absolute_position.csv', index=False)
df_6.head()

Unnamed: 0,utterance,absolute_pos
0,"hi joydeep sir getting error on event viewer ,...",1
1,Windows Content Delivery Manager would relate ...,2
2,hi joydeep sir my issues resolved.thnks a lot.,3
3,I had a couple of problems and contacted Micro...,1
4,Thank you for posting in Microsoft Community. ...,2


In [26]:
# Normalized Position
norm_pos = []
count = 1

for i, dialog in enumerate(dialogs):
    count = 1
    for u in dialog:
        norm_pos.append(count/len(dialog))
        count += 1
        
data_7 = {'utterance': utter, 'norm_pos': norm_pos}
df_7 = pd.DataFrame(data_7)
df_7.to_csv(r'STRUC_normalized_position.csv', index=False)
df_7.head()

Unnamed: 0,utterance,norm_pos
0,"hi joydeep sir getting error on event viewer ,...",0.333333
1,Windows Content Delivery Manager would relate ...,0.666667
2,hi joydeep sir my issues resolved.thnks a lot.,1.0
3,I had a couple of problems and contacted Micro...,0.2
4,Thank you for posting in Microsoft Community. ...,0.4


In [27]:
# Utterance Length
utter_len = []

for u in utter_without_sw:
    u_without_punctuation = re.sub(r'[^\w\s]','',u)
    utter_len.append(len(u_without_punctuation))

data_8 = {'utterance': utter, 'utter_len': utter_len}
df_8 = pd.DataFrame(data_8)
df_8.to_csv(r'STRUC_utterance_length.csv', index=False)
df_8.head()

Unnamed: 0,utterance,utter_len
0,"hi joydeep sir getting error on event viewer ,...",136
1,Windows Content Delivery Manager would relate ...,392
2,hi joydeep sir my issues resolved.thnks a lot.,40
3,I had a couple of problems and contacted Micro...,375
4,Thank you for posting in Microsoft Community. ...,1563


In [28]:
# Utterance Length Unique
utter_len_unique = []

for u in utter_without_sw:
    if pd.isna(u):
        u = ''
    u_without_punctuation = re.sub(r'[^\w\s]','',u)
    utter_len_unique.append(len(set(word_tokenize(u_without_punctuation.lower()))))

data_9 = {'utterance': utter, 'utter_len_unique': utter_len_unique}
df_9 = pd.DataFrame(data_9)
df_9.to_csv(r'STRUC_utterance_length_unique.csv', index=False)
df_9.head()

Unnamed: 0,utterance,utter_len_unique
0,"hi joydeep sir getting error on event viewer ,...",23
1,Windows Content Delivery Manager would relate ...,41
2,hi joydeep sir my issues resolved.thnks a lot.,6
3,I had a couple of problems and contacted Micro...,43
4,Thank you for posting in Microsoft Community. ...,132


In [29]:
# Utterance Length Stemmed Unique
utter_len_stemmed_unique = []

for u in utter_without_sw_stem:
    u_without_punctuation = re.sub(r'[^\w\s]','',u)
    utter_len_stemmed_unique.append(len(set(word_tokenize(u_without_punctuation.lower()))))

data_10 = {'utterance': utter, 'utter_len_unique': utter_len_stemmed_unique}
df_10 = pd.DataFrame(data_10)
df_10.to_csv(r'STRUC_utterance_length_stemmed_unique.csv', index=False)
df_10.head()

Unnamed: 0,utterance,utter_len_unique
0,"hi joydeep sir getting error on event viewer ,...",23
1,Windows Content Delivery Manager would relate ...,41
2,hi joydeep sir my issues resolved.thnks a lot.,6
3,I had a couple of problems and contacted Micro...,41
4,Thank you for posting in Microsoft Community. ...,126


In [30]:
# Is Starter
data_11 = {'utterance': utter, 'is_starter': is_user}
df_11 = pd.DataFrame(data_11)
df_11.to_csv(r'STRUC_is_starter.csv', index=False)
df_11.head()

Unnamed: 0,utterance,is_starter
0,"hi joydeep sir getting error on event viewer ,...",True
1,Windows Content Delivery Manager would relate ...,False
2,hi joydeep sir my issues resolved.thnks a lot.,True
3,I had a couple of problems and contacted Micro...,True
4,Thank you for posting in Microsoft Community. ...,False


# Sentiment features

In [31]:
# Thank
thank = []

for i in range(len(utter)):
    has_thank = 'thank' in utter[i].lower()
    thank.append(has_thank)

data_12 = {'utterance': utter, 'contain_thank': thank}
df_12 = pd.DataFrame(data_12)
df_12.to_csv(r'SENTI_thank.csv', index=False)
df_12.head()

Unnamed: 0,utterance,contain_thank
0,"hi joydeep sir getting error on event viewer ,...",False
1,Windows Content Delivery Manager would relate ...,False
2,hi joydeep sir my issues resolved.thnks a lot.,False
3,I had a couple of problems and contacted Micro...,False
4,Thank you for posting in Microsoft Community. ...,True


In [32]:
# Exclamation Mark
exclamation_mark = []

for i in range(len(utter)):
    has_exclamation_mark = '!' in utter[i]
    exclamation_mark.append(has_exclamation_mark)

data_13 = {'utterance': utter, 'exclamation_mark': exclamation_mark}
df_13 = pd.DataFrame(data_13)
df_13.to_csv(r'SENTI_exclamation_mark.csv', index=False)
df_13.head()

Unnamed: 0,utterance,exclamation_mark
0,"hi joydeep sir getting error on event viewer ,...",False
1,Windows Content Delivery Manager would relate ...,False
2,hi joydeep sir my issues resolved.thnks a lot.,False
3,I had a couple of problems and contacted Micro...,False
4,Thank you for posting in Microsoft Community. ...,False


In [33]:
# Feedback
feedback = []

for i in range(len(utter)):
    has_feedback = 'did not' in utter[i].lower() or 'does not' in utter[i].lower()
    feedback.append(has_feedback)
    
data_14 = {'utterance': utter, 'feedback': feedback}
df_14 = pd.DataFrame(data_14)
df_14.to_csv(r'SENTI_feedback.csv', index=False)
df_14.head()

Unnamed: 0,utterance,feedback
0,"hi joydeep sir getting error on event viewer ,...",False
1,Windows Content Delivery Manager would relate ...,False
2,hi joydeep sir my issues resolved.thnks a lot.,False
3,I had a couple of problems and contacted Micro...,False
4,Thank you for posting in Microsoft Community. ...,False


In [34]:
# Sentiment Scores
neg = []
neu = []
pos = []
compound = []

analyser = SentimentIntensityAnalyzer()

for u in utter:
    score = analyser.polarity_scores(u)
    neg.append(score['neg'])
    neu.append(score['neu'])
    pos.append(score['pos'])
    compound.append(score['compound'])
    
data_15 = {'utterance': utter, 'neg': neg, 'neu': neu, 'pos': pos, 'compound': compound}
df_15 = pd.DataFrame(data_15)
df_15.to_csv(r'SENTI_sentiment_scores.csv', index=False)
df_15.head()

Unnamed: 0,utterance,neg,neu,pos,compound
0,"hi joydeep sir getting error on event viewer ,...",0.129,0.714,0.157,0.0516
1,Windows Content Delivery Manager would relate ...,0.0,0.953,0.047,0.3818
2,hi joydeep sir my issues resolved.thnks a lot.,0.0,1.0,0.0,0.0
3,I had a couple of problems and contacted Micro...,0.039,0.803,0.158,0.9042
4,Thank you for posting in Microsoft Community. ...,0.007,0.847,0.146,0.9899


In [35]:
# Opinion Lexicon

In [61]:
features = df_1
dfs = [df_2,df_3,df_4,df_5,df_6,df_7,df_8,df_9,df_10,df_11,df_12,df_13,df_14,df_15]

for d in dfs:
    names = list(d.columns)
    names.remove('utterance')
    features = pd.merge(features,d[names], left_index=True, right_index=True)

In [62]:
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(features, pd.DataFrame(lab, columns = label_list), test_size=0.1)
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1)

In [63]:
print('train size:', x_train.shape, y_train.shape)
print('val size:', x_val.shape, y_val.shape)
print('test size:', x_test.shape, y_test.shape)

train size: (8116, 25) (8116, 32)
val size: (1002, 25) (1002, 32)
test size: (902, 25) (902, 32)


In [64]:
x_train.head()

Unnamed: 0,utterance,utterance_pos,initial_consine_similarity,consine_similarity,question_mark,duplicate,what,where,when,why,...,utter_len_unique_x,utter_len_unique_y,is_starter,contain_thank,exclamation_mark,feedback,neg,neu,pos,compound
284,"Thank you so much! I really appreciate it, th...",5,0.306578,0.472081,False,False,0.0,0.0,0.0,0.0,...,11,11,True,True,True,False,0.0,0.595,0.405,0.8904
7253,Thanks. I think I've got it now. Problem solved.,3,0.311651,0.132578,False,False,0.0,0.0,0.0,0.0,...,7,7,True,True,False,False,0.197,0.438,0.365,0.3182
6696,I re-installed Quick Time player and the probl...,2,0.056796,0.511208,False,False,0.0,0.0,0.0,0.0,...,7,7,True,False,True,False,0.256,0.599,0.145,-0.3164
7087,I would suggest you run the Microsoft Genuine ...,4,0.124784,0.251562,True,False,0.0,0.0,0.0,0.0,...,17,17,False,False,False,False,0.0,0.92,0.08,0.25
671,"Hi YvonneWeber, You may also refer to these F...",3,0.194029,0.761972,True,False,0.0,0.0,0.0,0.0,...,21,21,False,True,False,False,0.0,0.675,0.325,0.9179


In [51]:
y_train.head()

Unnamed: 0,O,CQ,PA,OQ,PF,PA FD,FD,GG,IR PA,FD NF,...,JK,FQ RQ,FD PF,CQ PA,FQ IR,FQ CQ,IR FD,CQ IR PA,RQ OQ,IR OQ
2511,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1596,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5133,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8373,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2727,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
features.to_csv('data/x.csv', index=None)
pd.DataFrame(lab, columns = label_list).to_csv('data/y.csv', index=None)
x_train.to_csv('data/x_train.csv', index=None)
y_train.to_csv('data/y_train.csv', index=None)
x_val.to_csv('data/x_val.csv', index=None)
y_val.to_csv('data/y_val.csv', index=None)
x_test.to_csv('data/x_test.csv', index=None)
y_test.to_csv('data/y_test.csv', index=None)