In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import math
from collections import Counter
from tqdm import tqdm as tqdm
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('opinion_lexicon')
from nltk.corpus import opinion_lexicon
#from sklearn import datasets

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /Users/Liuzhaoyu/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


In [2]:
D_intent = pd.read_json('MSDialog-Intent.json')

In [3]:
dialogs = []
labels = []
actors = []
for d in D_intent.loc['utterances']:
    dialogs.append([])
    labels.append([])
    actors.append([])
    for utter in d:
        dialogs[-1].append(utter['utterance'])
        labels[-1].append(utter['tags'])
        actors[-1].append(utter['actor_type'])

In [4]:
from sklearn.model_selection import train_test_split
dia_train, dia_val, labs_train, labs_val = train_test_split(dialogs, labels, test_size=0.1)
dia_train, dia_test, labs_train, labs_test = train_test_split(dia_train, labs_train, test_size=0.1)

In [5]:
def split(dia, labs):
    utter = []
    lab = []
    for i in range(len(dia)):
        utter += dia[i]
        lab += labs[i]
    return utter, lab

In [6]:
utter, lab = split(dialogs, labels)
utter_train, lab_train = split(dia_train, labs_train)
utter_val, lab_val = split(dia_val, labs_val)
utter_test, lab_test = split(dia_test, labs_test)

In [128]:
utter_without_sw = []
for u in tqdm(utter):
    text_tokens = word_tokenize(u)
    text_without_sw = ' '.join([word for word in text_tokens if not word in stopwords.words()])
    utter_without_sw.append(text_without_sw)

100%|██████████| 10020/10020 [49:03<00:00,  4.21it/s] 


In [136]:
df_utter_without_sw = pd.DataFrame(utter_without_sw)
df_utter_without_sw.to_csv(r'utter_without_sw.csv', index=False)

In [139]:
utter_without_sw_stem = []
ps = PorterStemmer() 
for u in tqdm(utter_without_sw):
    text_tokens = word_tokenize(u)
    text_without_sw_stem = ' '.join([ps.stem(w) for w in text_tokens])
    utter_without_sw_stem.append(text_without_sw_stem)

100%|██████████| 10020/10020 [00:14<00:00, 711.22it/s]


In [155]:
is_user = []
for a in actors:
    is_user += a
for i, u in enumerate(is_user):
    if u == 'User':
        is_user[i] = True
    else:
        is_user[i] = False

In [None]:
###

In [41]:
print('train size:', len(utter_train))
print('val size:', len(utter_val))
print('test size:', len(utter_test))

train size: 8090
val size: 1025
test size: 905


In [42]:
for i in range(len(lab)):
    ls = lab[i].split()
    if 'GG' in ls and len(ls)>1:
        ls.remove('GG')
        lab[i] = ' '.join(ls)
    if 'JK' in ls and len(ls)>1:
        ls.remove('JK')
        lab[i] = ' '.join(ls)
    if 'O' in ls and len(ls)>1:
        ls.remove('O')
        lab[i] = ' '.join(ls)
    lab[i] = lab[i].strip()

In [43]:
import collections
lab_freq = dict(collections.Counter(lab))

In [44]:
sorted(lab_freq.items(), key = lambda kv:(kv[1], kv[0]), reverse=True)[:32]

[('PA', 2481),
 ('OQ', 1866),
 ('PF', 559),
 ('PA FD', 523),
 ('FD', 500),
 ('GG', 346),
 ('IR PA', 227),
 ('FD NF', 198),
 ('FD OQ', 169),
 ('IR', 165),
 ('RQ', 153),
 ('PA IR', 144),
 ('PF FD', 140),
 ('NF', 140),
 ('FQ FD', 137),
 ('CQ FD', 116),
 ('FQ', 101),
 ('PA PF', 98),
 ('FD RQ', 87),
 ('CQ IR', 81),
 ('JK', 75),
 ('FQ RQ', 71),
 ('FD PF', 67),
 ('CQ PA', 62),
 ('FQ IR', 58),
 ('FQ CQ', 55),
 ('IR FD', 54),
 ('CQ IR PA', 51),
 ('RQ OQ', 49),
 ('IR OQ', 48),
 ('NF OQ', 47),
 ('FQ FD NF', 44)]

In [45]:
label_list = ['O', 'CQ']
for l in sorted(lab_freq.items(), key = lambda kv:(kv[1], kv[0]), reverse=True)[:31]:
    label_list.append(l[0])

In [47]:
def preprocess(lab):
    new_lab = np.zeros([len(lab), len(label_list)])
    for i, l in enumerate(lab):
        for j, l_ in enumerate(label_list):
            if l_ in l:
                new_lab[i,j] = 1
    return new_lab

In [33]:
lab = preprocess(lab)#labels of all data
lab_train = preprocess(lab_train)#labels of train data
lab_val = preprocess(lab_val)#labels of val data
lab_test = preprocess(lab_test)#labels of test data

In [14]:
utter_train[0]

'how to get print bigger'

In [15]:
dia_train[0]

['how to get print bigger',
 'Hi Michael Take a look at this. https://support.microsoft.com/en-us/kb/192391 After increasing font size press ctrl+P to print.',
 'Hi   Assuming that you are using Internet Explorer and are referring to "print" on the screen (rather than on paper) then these keyboard shortcuts will adjust the zoom for the whole page:  CTRL + +        Zoom In  CTRL + -         Zoom Out  CTRL + 0        Set Zoom to 100%.  If you are using Internet Explorer you may click Page (on Command Bar) then Text size then select the text size you prefer.  The zoom controls above work with many other web browsers too.']

In [20]:
lab_train[0]

array([1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [21]:
label_list

['O',
 'CQ',
 'PA',
 'OQ',
 'PF',
 'PA FD',
 'FD',
 'GG',
 'IR PA',
 'FD NF',
 'FD OQ',
 'IR',
 'RQ',
 'PA IR',
 'PF FD',
 'NF',
 'FQ FD',
 'CQ FD',
 'FQ',
 'PA PF',
 'FD RQ',
 'CQ IR',
 'JK',
 'FQ RQ',
 'FD PF',
 'CQ PA',
 'FQ IR',
 'FQ CQ',
 'IR FD',
 'CQ IR PA',
 'RQ OQ',
 'IR OQ',
 'NF OQ']

In [51]:
def text_to_vector(text):
    word = re.compile(r'\w+')
    words = word.findall(text)
    return Counter(words)

In [52]:
def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

# Feaure function

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import math
from collections import Counter
from tqdm import tqdm as tqdm
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('opinion_lexicon')
from nltk.corpus import opinion_lexicon
#from sklearn import datasets

In [82]:
def get_features(utter, dialogs, actors):
    df_utter_without_sw = pd.read_csv('utter_without_sw.csv')
    utter_without_sw = df_utter_without_sw['0'].tolist()
    utter_without_sw_stem = remove_stem(utter_without_sw)
    
    # Initial Utterance Similarity
    InitSim = []
    for i, dialog in enumerate(dialogs):
        vec1 = text_to_vector(dialog[0])
        for u in dialog:
            vec2 = text_to_vector(u)
            InitSim.append(get_cosine(vec1, vec2))
    
    print('init sim')
    
    # Dialog Similarity
    DlgSim = []
    for i,dialog in enumerate(dialogs):
        vec1 = text_to_vector(''.join(dialog))
        for u in dialog:
            vec2 = text_to_vector(u)
            DlgSim.append(get_cosine(vec1, vec2))
            
    print('dlg sim')
            
    # Question Mark
    QuestMark = []
    for i in range(len(utter)):
        has_question_mark = '?' in utter[i]
        QuestMark.append(has_question_mark)
        
    print('quest mark')
        
    # Duplicate
    Dup = []
    for i in range(len(utter)):
        has_duplicate = 'same' in utter[i].lower() or 'similar' in utter[i].lower()
        Dup.append(has_duplicate)
        
    print('dup')    
    
    # 5W1H
    What = []
    Where = []
    When = []
    Why = []
    Who = []
    How = []
    for i in range(len(utter)):
        What.append('what' in utter[i].lower())
        Where.append('where' in utter[i].lower())
        When.append('when' in utter[i].lower())
        Why.append('why' in utter[i].lower())
        Who.append('who' in utter[i].lower())
        How.append('how' in utter[i].lower())
        
    print('5w1h')
    
    # Absolute Position
    AbsPos = []
    for i, dialog in enumerate(dialogs):
        count = 1
        for u in dialog:
            AbsPos.append(count)
            count += 1
    
    print('abs pos')
    
    # Normalized Position
    NormPos = []
    for i, dialog in enumerate(dialogs):
        count = 1
        for u in dialog:
            NormPos .append(count/len(dialog))
            count += 1
            
    print('norm pos')
    
    # Utterance Length
    Len = []
    for u in utter_without_sw:
        if not isinstance(u, str):
            Len.append(0)
            continue
        u_without_punctuation = re.sub(r'[^\w\s]','',u)
        Len.append(len(u_without_punctuation))
        
    print('len')
        
    # Utterance Length Unique
    LenUni = []
    for u in utter_without_sw:
        if not isinstance(u, str):
            LenUni.append(0)
            continue
        u_without_punctuation = re.sub(r'[^\w\s]','',u)
        LenUni.append(len(set(word_tokenize(u_without_punctuation.lower()))))
        
    print('len uni')
        
    # Utterance Length Stemmed Unique
    LenStem = []
    for u in utter_without_sw_stem:
        if not isinstance(u, str):
            LenStem.append(0)
            continue
        u_without_punctuation = re.sub(r'[^\w\s]','',u)
        LenStem.append(len(set(word_tokenize(u_without_punctuation.lower()))))
        
    print('len stem')
        
    # Is Starter
    Starter = is_user(actors)
    
    print('starter')
    
    # Thank
    Thank = []
    for i in range(len(utter)):
        has_thank = 'thank' in utter[i].lower()
        Thank.append(has_thank)
        
    print('thank')
        
    # Exclamation Mark
    ExMark = []
    for i in range(len(utter)):
        has_exclamation_mark = '!' in utter[i]
        ExMark.append(has_exclamation_mark)
        
    print('ex mark')
        
    # Feedback
    Feedback = []
    for i in range(len(utter)):
        has_feedback = 'did not' in utter[i].lower() or 'does not' in utter[i].lower()
        Feedback.append(has_feedback)
        
    print('feedback')
        
    # Sentiment Scores
    SenScr_Neu = []
    SenScr_Pos = []
    analyser = SentimentIntensityAnalyzer()
    for u in tqdm(utter):
        score = analyser.polarity_scores(u)
        SenScr_Neu.append(score['neu'])
        SenScr_Pos.append(score['pos'])
        
    print('sen scr')
        
    # Opinion Lexicon
    Lex_Pos = []
    Lex_Neg = []
    for u in tqdm(utter):
        pos = 0
        neg = 0
        for word in u.split():
            if word.lower() in opinion_lexicon.positive():
                pos += 1
            if word.lower() in opinion_lexicon.negative():
                neg += 1
        Lex_Pos.append(pos)
        Lex_Neg.append(neg)
        
    print('opinion lexicon')
        
    data_all = {'utterance': utter, 'InitSim': InitSim, 'DlgSim': DlgSim, 'QuestMark': QuestMark, 
                'Dup': Dup, 'What': What, 'Where': Where, 'When': When, 'Why': Why, 'Who': Who, 'How': How, 
                'AbsPos': AbsPos, 'NormPos': NormPos, 'Len': Len, 'LenUni': LenUni, 'LenStem': LenStem,
                'Starter': Starter, 'Thank': Thank, 'ExMark': ExMark, 'Feedback': Feedback, 'SenScr_Neu': SenScr_Neu,
                'SenScr_Pos': SenScr_Pos, 'Lex_Pos': Lex_Pos, 'Lex_Neg': Lex_Neg}
    df_all = pd.DataFrame(data_all)
    df_all.to_csv(r'all_features.csv', index=False)
    df_all.head()

In [83]:
def remove_sw(utter):
    utter_without_sw = []
    for u in tqdm(utter):
        text_tokens = word_tokenize(u)
        text_without_sw = ' '.join([word for word in text_tokens if not word in stopwords.words()])
        utter_without_sw.append(text_without_sw)
    return utter_without_sw

def remove_stem(utter_without_sw):
    utter_without_sw_stem = []
    ps = PorterStemmer() 
    for i, u in tqdm(enumerate(utter_without_sw)):
        if not isinstance(u, str):
            utter_without_sw_stem.append("")
            continue
        text_tokens = word_tokenize(u)
        text_without_sw_stem = ' '.join([ps.stem(w) for w in text_tokens])
        utter_without_sw_stem.append(text_without_sw_stem)
    return utter_without_sw_stem

def is_user(actors):
    is_user = []
    for a in actors:
        is_user += a
    for i, u in enumerate(is_user):
        if u == 'User':
            is_user[i] = True
        else:
            is_user[i] = False
    return is_user

def text_to_vector(text):
    word = re.compile(r'\w+')
    words = word.findall(text)
    return Counter(words)

def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

In [84]:
get_features(utter, dialogs, actors)












0it [00:00, ?it/s][A[A[A[A[A[A[A[A[A[A[A










55it [00:00, 549.92it/s][A[A[A[A[A[A[A[A[A[A[A










102it [00:00, 518.62it/s][A[A[A[A[A[A[A[A[A[A[A










146it [00:00, 477.24it/s][A[A[A[A[A[A[A[A[A[A[A










209it [00:00, 512.65it/s][A[A[A[A[A[A[A[A[A[A[A










265it [00:00, 523.09it/s][A[A[A[A[A[A[A[A[A[A[A










328it [00:00, 541.89it/s][A[A[A[A[A[A[A[A[A[A[A










403it [00:00, 589.28it/s][A[A[A[A[A[A[A[A[A[A[A










460it [00:00, 573.50it/s][A[A[A[A[A[A[A[A[A[A[A










533it [00:00, 611.06it/s][A[A[A[A[A[A[A[A[A[A[A










594it [00:01, 609.00it/s][A[A[A[A[A[A[A[A[A[A[A










661it [00:01, 623.12it/s][A[A[A[A[A[A[A[A[A[A[A










724it [00:01, 587.76it/s][A[A[A[A[A[A[A[A[A[A[A










791it [00:01, 602.06it/s][A[A[A[A[A[A[A[A[A[A[A










852it [00:01, 590.20it/s][A[A

7469it [00:12, 579.00it/s][A[A[A[A[A[A[A[A[A[A[A










7546it [00:12, 624.19it/s][A[A[A[A[A[A[A[A[A[A[A










7610it [00:12, 602.81it/s][A[A[A[A[A[A[A[A[A[A[A










7672it [00:13, 571.96it/s][A[A[A[A[A[A[A[A[A[A[A










7733it [00:13, 581.60it/s][A[A[A[A[A[A[A[A[A[A[A










7792it [00:13, 574.80it/s][A[A[A[A[A[A[A[A[A[A[A










7868it [00:13, 618.01it/s][A[A[A[A[A[A[A[A[A[A[A










7941it [00:13, 643.45it/s][A[A[A[A[A[A[A[A[A[A[A










8013it [00:13, 664.06it/s][A[A[A[A[A[A[A[A[A[A[A










8081it [00:13, 646.05it/s][A[A[A[A[A[A[A[A[A[A[A










8155it [00:13, 670.40it/s][A[A[A[A[A[A[A[A[A[A[A










8223it [00:13, 624.97it/s][A[A[A[A[A[A[A[A[A[A[A










8287it [00:13, 583.45it/s][A[A[A[A[A[A[A[A[A[A[A










8347it [00:14, 573.57it/s][A[A[A[A[A[A[A[A[A[A[A










8412it [00:14, 594.3

init sim
dlg sim
quest mark
dup
5w1h
abs pos
norm pos
len
len uni













  0%|          | 0/10020 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A










  1%|          | 94/10020 [00:00<00:10, 932.75it/s][A[A[A[A[A[A[A[A[A[A[A

len stem
starter
thank
ex mark
feedback













  2%|▏         | 173/10020 [00:00<00:11, 883.81it/s][A[A[A[A[A[A[A[A[A[A[A










  3%|▎         | 267/10020 [00:00<00:10, 896.88it/s][A[A[A[A[A[A[A[A[A[A[A










  4%|▍         | 410/10020 [00:00<00:09, 1009.01it/s][A[A[A[A[A[A[A[A[A[A[A










  6%|▌         | 571/10020 [00:00<00:08, 1135.80it/s][A[A[A[A[A[A[A[A[A[A[A










  7%|▋         | 703/10020 [00:00<00:07, 1184.52it/s][A[A[A[A[A[A[A[A[A[A[A










  8%|▊         | 843/10020 [00:00<00:07, 1231.76it/s][A[A[A[A[A[A[A[A[A[A[A










 10%|█         | 1011/10020 [00:00<00:06, 1338.89it/s][A[A[A[A[A[A[A[A[A[A[A










 11%|█▏        | 1146/10020 [00:00<00:06, 1324.55it/s][A[A[A[A[A[A[A[A[A[A[A










 13%|█▎        | 1280/10020 [00:01<00:06, 1279.99it/s][A[A[A[A[A[A[A[A[A[A[A










 14%|█▍        | 1421/10020 [00:01<00:06, 1310.81it/s][A[A[A[A[A[A[A[A[A[A[A










 15%|█▌        | 

sen scr













  0%|          | 1/10020 [00:00<1:52:07,  1.49it/s][A[A[A[A[A[A[A[A[A[A[A










  0%|          | 2/10020 [00:01<2:08:57,  1.29it/s][A[A[A[A[A[A[A[A[A[A[A










  0%|          | 3/10020 [00:01<1:37:47,  1.71it/s][A[A[A[A[A[A[A[A[A[A[A










  0%|          | 4/10020 [00:03<2:24:32,  1.15it/s][A[A[A[A[A[A[A[A[A[A[A










  0%|          | 5/10020 [00:08<6:09:20,  2.21s/it][A[A[A[A[A[A[A[A[A[A[A










  0%|          | 6/10020 [00:09<4:40:38,  1.68s/it][A[A[A[A[A[A[A[A[A[A[A










  0%|          | 7/10020 [00:09<3:55:32,  1.41s/it][A[A[A[A[A[A[A[A[A[A[A










  0%|          | 8/10020 [00:10<3:11:54,  1.15s/it][A[A[A[A[A[A[A[A[A[A[A










  0%|          | 9/10020 [00:11<3:06:14,  1.12s/it][A[A[A[A[A[A[A[A[A[A[A










  0%|          | 10/10020 [00:12<2:49:04,  1.01s/it][A[A[A[A[A[A[A[A[A[A[A










  0%|          | 11/10020 [00:12<2:04:

  1%|          | 90/10020 [01:50<2:03:38,  1.34it/s][A[A[A[A[A[A[A[A[A[A[A










  1%|          | 91/10020 [01:51<2:23:30,  1.15it/s][A[A[A[A[A[A[A[A[A[A[A










  1%|          | 92/10020 [01:57<6:40:02,  2.42s/it][A[A[A[A[A[A[A[A[A[A[A

KeyboardInterrupt: 

# Content features

In [171]:
# Initial Utterance Similarity
initial_utter_similarity = []
pos = []
count = 1

for i, dialog in enumerate(dialogs):
    vec1 = text_to_vector(dialog[0])
    count = 1
    for u in dialog:
        pos.append(count)
        count += 1
        vec2 = text_to_vector(u)
        initial_utter_similarity.append(get_cosine(vec1, vec2))
        
data_1 = {'utterance': utter, 'utterance_pos': pos, 'consine_similarity': initial_utter_similarity}
df_1 = pd.DataFrame(data_1)
df_1.to_csv(r'CONTENT_initial_utter_similarity.csv', index=False)
df_1.head()

Unnamed: 0,utterance,utterance_pos,consine_similarity
0,"hi joydeep sir getting error on event viewer ,...",1,1.0
1,Windows Content Delivery Manager would relate ...,2,0.069993
2,hi joydeep sir my issues resolved.thnks a lot.,3,0.147442
3,I had a couple of problems and contacted Micro...,1,1.0
4,Thank you for posting in Microsoft Community. ...,2,0.364363


In [172]:
# Dialog Similarity
dialog_similarity = []

for i,dialog in enumerate(dialogs):
    vec1 = text_to_vector(''.join(dialog))
    for u in dialog:
        vec2 = text_to_vector(u)
        dialog_similarity.append(get_cosine(vec1, vec2))
        
data_2 = {'utterance': utter, 'consine_similarity': dialog_similarity}
df_2 = pd.DataFrame(data_2)
df_2.to_csv(r'CONTENT_dialog_similarity.csv', index=False)
df_2.head()

Unnamed: 0,utterance,consine_similarity
0,"hi joydeep sir getting error on event viewer ,...",0.655034
1,Windows Content Delivery Manager would relate ...,0.750313
2,hi joydeep sir my issues resolved.thnks a lot.,0.313264
3,I had a couple of problems and contacted Micro...,0.555694
4,Thank you for posting in Microsoft Community. ...,0.946031


In [173]:
# Question Mark
question_mark = []

for i in range(len(utter)):
    has_question_mark = '?' in utter[i]
    question_mark.append(has_question_mark)

data_3 = {'utterance': utter, 'question_mark': question_mark}
df_3 = pd.DataFrame(data_3)
df_3.to_csv(r'CONTENT_question_mark.csv', index=False)
df_3.head()

Unnamed: 0,utterance,question_mark
0,"hi joydeep sir getting error on event viewer ,...",False
1,Windows Content Delivery Manager would relate ...,False
2,hi joydeep sir my issues resolved.thnks a lot.,False
3,I had a couple of problems and contacted Micro...,False
4,Thank you for posting in Microsoft Community. ...,False


In [174]:
# Duplicate
duplicate = []

for i in range(len(utter)):
    has_duplicate = 'same' in utter[i].lower() or 'similar' in utter[i].lower()
    duplicate.append(has_duplicate)
    
data_4 = {'utterance': utter, 'duplicate': duplicate}
df_4 = pd.DataFrame(data_4)
df_4.to_csv(r'CONTENT_duplicate.csv', index=False)
df_4.head()

Unnamed: 0,utterance,duplicate
0,"hi joydeep sir getting error on event viewer ,...",False
1,Windows Content Delivery Manager would relate ...,False
2,hi joydeep sir my issues resolved.thnks a lot.,False
3,I had a couple of problems and contacted Micro...,False
4,Thank you for posting in Microsoft Community. ...,False


In [175]:
# 5W1H
w_h = []

for i in range(len(utter)):
    one_hot = [0, 0, 0, 0, 0, 0]
    if 'what' in utter[i]:
        one_hot[0] = 1
    if 'where' in utter[i]:
        one_hot[1] = 1
    if 'when' in utter[i]:
        one_hot[2] = 1
    if 'why' in utter[i]:
        one_hot[3] = 1
    if 'who' in utter[i]:
        one_hot[4] = 1
    if 'how' in utter[i]:
        one_hot[5] = 1
    w_h.append(one_hot)
    
data_5 = {'utterance': utter, '5W1H': w_h}
df_5 = pd.DataFrame(data_5)
df_5.to_csv(r'CONTENT_5W_1H.csv', index=False)
df_5.head()

Unnamed: 0,utterance,5W1H
0,"hi joydeep sir getting error on event viewer ,...","[0, 0, 0, 0, 0, 0]"
1,Windows Content Delivery Manager would relate ...,"[0, 0, 0, 0, 0, 0]"
2,hi joydeep sir my issues resolved.thnks a lot.,"[0, 0, 0, 0, 0, 0]"
3,I had a couple of problems and contacted Micro...,"[0, 1, 1, 0, 1, 0]"
4,Thank you for posting in Microsoft Community. ...,"[0, 1, 0, 0, 0, 0]"


# Structual features

In [176]:
# Absolute Position
abs_pos = []
count = 1

for i, dialog in enumerate(dialogs):
    count = 1
    for u in dialog:
        abs_pos.append(count)
        count += 1
        
data_6 = {'utterance': utter, 'absolute_pos': abs_pos}
df_6 = pd.DataFrame(data_6)
df_6.to_csv(r'STRUC_absolute_position.csv', index=False)
df_6.head()

Unnamed: 0,utterance,absolute_pos
0,"hi joydeep sir getting error on event viewer ,...",1
1,Windows Content Delivery Manager would relate ...,2
2,hi joydeep sir my issues resolved.thnks a lot.,3
3,I had a couple of problems and contacted Micro...,1
4,Thank you for posting in Microsoft Community. ...,2


In [177]:
# Normalized Position
norm_pos = []
count = 1

for i, dialog in enumerate(dialogs):
    count = 1
    for u in dialog:
        norm_pos.append(count/len(dialog))
        count += 1
        
data_7 = {'utterance': utter, 'norm_pos': norm_pos}
df_7 = pd.DataFrame(data_7)
df_7.to_csv(r'STRUC_normalized_position.csv', index=False)
df_7.head()

Unnamed: 0,utterance,norm_pos
0,"hi joydeep sir getting error on event viewer ,...",0.333333
1,Windows Content Delivery Manager would relate ...,0.666667
2,hi joydeep sir my issues resolved.thnks a lot.,1.0
3,I had a couple of problems and contacted Micro...,0.2
4,Thank you for posting in Microsoft Community. ...,0.4


In [178]:
# Utterance Length
utter_len = []

for u in utter_without_sw:
    u_without_punctuation = re.sub(r'[^\w\s]','',u)
    utter_len.append(len(u_without_punctuation))

data_8 = {'utterance': utter, 'utter_len': utter_len}
df_8 = pd.DataFrame(data_8)
df_8.to_csv(r'STRUC_utterance_length.csv', index=False)
df_8.head()

Unnamed: 0,utterance,utter_len
0,"hi joydeep sir getting error on event viewer ,...",136
1,Windows Content Delivery Manager would relate ...,392
2,hi joydeep sir my issues resolved.thnks a lot.,40
3,I had a couple of problems and contacted Micro...,375
4,Thank you for posting in Microsoft Community. ...,1563


In [179]:
# Utterance Length Unique
utter_len_unique = []

for u in utter_without_sw:
    u_without_punctuation = re.sub(r'[^\w\s]','',u)
    utter_len_unique.append(len(set(word_tokenize(u_without_punctuation.lower()))))

data_9 = {'utterance': utter, 'utter_len_unique': utter_len_unique}
df_9 = pd.DataFrame(data_9)
df_9.to_csv(r'STRUC_utterance_length_unique.csv', index=False)
df_9.head()

Unnamed: 0,utterance,utter_len_unique
0,"hi joydeep sir getting error on event viewer ,...",23
1,Windows Content Delivery Manager would relate ...,41
2,hi joydeep sir my issues resolved.thnks a lot.,6
3,I had a couple of problems and contacted Micro...,43
4,Thank you for posting in Microsoft Community. ...,132


In [180]:
# Utterance Length Stemmed Unique
utter_len_stemmed_unique = []

for u in utter_without_sw_stem:
    u_without_punctuation = re.sub(r'[^\w\s]','',u)
    utter_len_stemmed_unique.append(len(set(word_tokenize(u_without_punctuation.lower()))))

data_10 = {'utterance': utter, 'utter_len_unique': utter_len_stemmed_unique}
df_10 = pd.DataFrame(data_10)
df_10.to_csv(r'STRUC_utterance_length_stemmed_unique.csv', index=False)
df_10.head()

Unnamed: 0,utterance,utter_len_unique
0,"hi joydeep sir getting error on event viewer ,...",23
1,Windows Content Delivery Manager would relate ...,41
2,hi joydeep sir my issues resolved.thnks a lot.,6
3,I had a couple of problems and contacted Micro...,41
4,Thank you for posting in Microsoft Community. ...,126


In [181]:
# Is Starter
data_11 = {'utterance': utter, 'is_starter': is_user}
df_11 = pd.DataFrame(data_11)
df_11.to_csv(r'STRUC_is_starter.csv', index=False)
df_11.head()

Unnamed: 0,utterance,is_starter
0,"hi joydeep sir getting error on event viewer ,...",True
1,Windows Content Delivery Manager would relate ...,False
2,hi joydeep sir my issues resolved.thnks a lot.,True
3,I had a couple of problems and contacted Micro...,True
4,Thank you for posting in Microsoft Community. ...,False


# Sentiment features

In [182]:
# Thank
thank = []

for i in range(len(utter)):
    has_thank = 'thank' in utter[i].lower()
    thank.append(has_thank)

data_12 = {'utterance': utter, 'contain_thank': thank}
df_12 = pd.DataFrame(data_12)
df_12.to_csv(r'SENTI_thank.csv', index=False)
df_12.head()

Unnamed: 0,utterance,contain_thank
0,"hi joydeep sir getting error on event viewer ,...",False
1,Windows Content Delivery Manager would relate ...,False
2,hi joydeep sir my issues resolved.thnks a lot.,False
3,I had a couple of problems and contacted Micro...,False
4,Thank you for posting in Microsoft Community. ...,True


In [183]:
# Exclamation Mark
exclamation_mark = []

for i in range(len(utter)):
    has_exclamation_mark = '!' in utter[i]
    exclamation_mark.append(has_exclamation_mark)

data_13 = {'utterance': utter, 'exclamation_mark': exclamation_mark}
df_13 = pd.DataFrame(data_13)
df_13.to_csv(r'SENTI_exclamation_mark.csv', index=False)
df_13.head()

Unnamed: 0,utterance,exclamation_mark
0,"hi joydeep sir getting error on event viewer ,...",False
1,Windows Content Delivery Manager would relate ...,False
2,hi joydeep sir my issues resolved.thnks a lot.,False
3,I had a couple of problems and contacted Micro...,False
4,Thank you for posting in Microsoft Community. ...,False


In [184]:
# Feedback
feedback = []

for i in range(len(utter)):
    has_feedback = 'did not' in utter[i].lower() or 'does not' in utter[i].lower()
    feedback.append(has_feedback)
    
data_14 = {'utterance': utter, 'feedback': feedback}
df_14 = pd.DataFrame(data_14)
df_14.to_csv(r'SENTI_feedback.csv', index=False)
df_14.head()

Unnamed: 0,utterance,feedback
0,"hi joydeep sir getting error on event viewer ,...",False
1,Windows Content Delivery Manager would relate ...,False
2,hi joydeep sir my issues resolved.thnks a lot.,False
3,I had a couple of problems and contacted Micro...,False
4,Thank you for posting in Microsoft Community. ...,False


In [9]:
# Sentiment Scores
neg = []
neu = []
pos = []
compound = []

analyser = SentimentIntensityAnalyzer()

for u in tqdm(utter):
    score = analyser.polarity_scores(u)
    neg.append(score['neg'])
    neu.append(score['neu'])
    pos.append(score['pos'])
    compound.append(score['compound'])
    
data_15 = {'utterance': utter, 'neg': neg, 'neu': neu, 'pos': pos, 'compound': compound}
df_15 = pd.DataFrame(data_15)
df_15.to_csv(r'SENTI_sentiment_scores.csv', index=False)
df_15.head()

 41%|████      | 4125/10020 [00:03<00:04, 1370.26it/s]


KeyboardInterrupt: 

In [None]:
# Opinion Lexicon