In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
# from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
# from sklearn.linear_model import SGDClassifier
# from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
# Record the scikit-learn version this notebook was run with.
import sklearn
print(sklearn.__version__)

1.1.3


In [3]:
# Read the raw utterance file, one annotated utterance per line.
# A context manager closes the file handle as soon as the lines are in memory.
with open("utterances_1.txt", "r") as utterance_file:
    data = utterance_file.readlines()
# Uncomment to work on a small sample of the file:
# data = data[:10000]
len(data)

1870411

# Dataset Preparation

In [4]:

# Entity tag spellings, including misspelled variants; not used by the code in this cell.
ENTITIES = ["DEVICE","DEIVCE","Device","PREFERENCE","ACTION","VAL", "VALUE","UNIT","LOCATION","STATE","SERVICE:","SERVICE","TIME","ATION", "QUANTITY"]

# Each line has the form "<tagged sentence> ||| <space-separated FIELD:value labels>".
sentence = []
labels = []
for line in data:
    parts = line.split('|||')
    sentence.append(parts[0].strip())
    # A line without the '|||' separator gets an empty label string instead of
    # raising IndexError, so len(df) always equals len(data).
    labels.append(parts[1].strip() if len(parts) > 1 else '')

df = pd.DataFrame({'sentences':sentence,'labels':labels})

# Value lists for the label fields; not used by the code in this cell.
physical_parameters = [ "Color" ,"Temperature" ,"Brightness" ,"Volume" ,"AirQuality" ,"Humidity","Channel" ,"Noise" ,"EnergyConsumption","WaterConsumption"]
effect =    ["DecreaseValue" ,"FixedValue" ,"IncreaseValue"  ,"DEcreaseValue"]
goal =      ["Direct", "Indirect"]
scope =     ["Global","Device"]

# Pull the first "FIELD:value" occurrence of each field into its own column;
# rows without that field get NaN. Raw strings keep `\S` a regex escape rather
# than an (invalid) Python string escape.
for field in ('EFFECT', 'PARAM', 'SCOPE', 'GOAL'):
    df[field] = df['labels'].str.extract(rf'{field}:(\S*)', expand=False)

# Normalise the mis-capitalised spelling of the EFFECT value (literal match).
df['EFFECT'] = df['EFFECT'].str.replace('DEcreaseValue', 'DecreaseValue', regex=False)
df.head()


Unnamed: 0,sentences,labels,EFFECT,PARAM,SCOPE,GOAL
0,help me automatically ACTION:turn the LOCATION...,EFFECT:DecreaseValue PARAM:Colour SCOPE:Global...,DecreaseValue,Colour,Global,Indirect
1,please i need DEVICE:kettle LOCATION:out_here ...,SCOPE:Device GOAL:DeviceCommand,,,Device,DeviceCommand
2,automatically ACTION:lessen my DEVICE:smartpho...,EFFECT:FixedValue PARAM:Brightness SCOPE:Devic...,FixedValue,Brightness,Device,DeviceCommand
3,ACTION:suggest_us some VAL:loving for me,SCOPE:Global GOAL:Direct,,,Global,Direct
4,i prefer to have ACTION:make this DEVICE:windo...,SCOPE:Device GOAL:DeviceCommand,,,Device,DeviceCommand


In [5]:
# One row per whitespace-separated token, keeping the (1-based) number of the
# sentence it came from. explode() produces the long format directly, without
# first building a frame as wide as the longest sentence; dropna() discards the
# placeholder row that an empty sentence would otherwise contribute.
tokens = df['sentences'].str.split().explode().dropna()

df_tokenized = pd.DataFrame({
    'Sentence': tokens.index.to_numpy() + 1,
    'Word': tokens.to_numpy()})

In [6]:
# Split each token at its first ':' into two pieces; tokens without a ':' get a
# missing second piece. `n` is passed by keyword because newer pandas versions
# no longer accept it positionally.
df_tokenized[['Word','Entity']] = df_tokenized['Word'].str.split(':', n=1, expand=True)
# Keep a copy of the first piece so the next cell can swap the columns.
df_tokenized['Word_temp'] = df_tokenized['Word']

In [7]:
# Where the first piece of a token is upper-case it is the tag, not the word,
# so exchange the two columns on those rows.
tag_first = df_tokenized['Word_temp'].str.isupper()
first_piece = df_tokenized['Word_temp']
second_piece = df_tokenized['Entity']
df_tokenized['Word'] = np.where(tag_first, second_piece, first_piece)
df_tokenized['Entity'] = np.where(tag_first, first_piece, second_piece)
df_tokenized = df_tokenized.drop(columns='Word_temp')
# Tokens that carried no tag are labelled 'O'.
df_tokenized['Entity'] = df_tokenized['Entity'].fillna('O')
df_tokenized.head()

Unnamed: 0,Sentence,Word,Entity
0,1,help,O
1,1,me,O
2,1,automatically,O
3,1,turn,ACTION
4,1,the,O


In [8]:
# Number of token rows, distinct words, and distinct entity labels.
len(df_tokenized), df_tokenized['Word'].nunique(), df_tokenized['Entity'].nunique()

(18648505, 2296, 16)

In [9]:
# List the distinct entity labels to spot values that are not real tags.
df_tokenized['Entity'].unique()

array(['O', 'ACTION', 'LOCATION', 'VAL', 'DEVICE', 'UNIT', 'TIME',
       'STATE', 'I', 'SERVICE', 'DEIVCE', 'amazon_echo', 'ATION',
       'PREFERENCE', 'MAHDAAAAAAAAAA', 'MAHDAAAAAAAA'], dtype=object)

In [10]:
# A bare upper-case "I" was moved into the Entity column by the tag/word swap:
# put it back as the word and label the row 'O'.
is_bare_i = df_tokenized['Entity'] == 'I'
df_tokenized.loc[is_bare_i, 'Word'] = 'I'
df_tokenized.loc[is_bare_i, 'Entity'] = 'O'

In [11]:
# Rows whose Entity holds 'amazon_echo' have word and tag the wrong way round:
# make 'amazon_echo' the word and label the row 'Device'.
is_echo = df_tokenized['Entity'] == 'amazon_echo'
df_tokenized.loc[is_echo, 'Word'] = 'amazon_echo'
df_tokenized.loc[is_echo, 'Entity'] = 'Device'

In [12]:
# Re-check the distinct entity labels after the 'I' and 'amazon_echo' fix-ups.
df_tokenized['Entity'].unique()

array(['O', 'ACTION', 'LOCATION', 'VAL', 'DEVICE', 'UNIT', 'TIME',
       'STATE', 'SERVICE', 'DEIVCE', 'Device', 'ATION', 'PREFERENCE',
       'MAHDAAAAAAAAAA', 'MAHDAAAAAAAA'], dtype=object)

In [13]:
# Fold the variant spellings of a tag into its canonical label.
label_fixes = {'Device': 'DEVICE', 'DEIVCE': 'DEVICE', 'ATION': 'ACTION'}
df_tokenized['Entity'] = df_tokenized['Entity'].replace(label_fixes)

In [14]:
# Distinct entity labels after the variant spellings were merged.
df_tokenized['Entity'].unique()

array(['O', 'ACTION', 'LOCATION', 'VAL', 'DEVICE', 'UNIT', 'TIME',
       'STATE', 'SERVICE', 'PREFERENCE', 'MAHDAAAAAAAAAA', 'MAHDAAAAAAAA'],
      dtype=object)

In [15]:
# Row count, distinct words, and distinct entity labels after the label clean-up.
len(df_tokenized), df_tokenized['Word'].nunique(), df_tokenized['Entity'].nunique()

(18648505, 2297, 12)

In [16]:
# Preview the first rows of the cleaned token table.
df_tokenized.head()

Unnamed: 0,Sentence,Word,Entity
0,1,help,O
1,1,me,O
2,1,automatically,O
3,1,turn,ACTION
4,1,the,O


In [17]:
# Number of tokens per entity label, rarest label first.
entity_counts = df_tokenized.groupby('Entity').size().reset_index(name='counts')
entity_counts.sort_values(by='counts')

Unnamed: 0,Entity,counts
3,MAHDAAAAAAAA,6
4,MAHDAAAAAAAAAA,7
6,PREFERENCE,3836
9,TIME,64953
7,SERVICE,68128
8,STATE,212398
10,UNIT,368416
1,DEVICE,916244
11,VAL,1311821
2,LOCATION,1425964


In [18]:
# Load the spaCy "en_core_web_sm" pipeline; `nlp` is used further down to look
# up a part-of-speech tag for each distinct word.
import spacy
nlp = spacy.load("en_core_web_sm")

In [36]:
# Keep only rows that have a non-empty word. The explicit .copy() makes the
# result an independent frame, so adding the POS column later does not write
# into a slice of the unfiltered table (SettingWithCopyWarning).
has_word = (df_tokenized['Word'].str.len() != 0) & (df_tokenized['Word'].notnull())
df_tokenized = df_tokenized[has_word].copy()

In [37]:
# Sanity check: number of rows that still have a missing word.
df_tokenized['Word'].isnull().sum()

0

In [30]:
# Tag every distinct word once and cache the result, instead of running the
# spaCy pipeline on every token row.
unique_words = [w for w in df_tokenized['Word'].unique() if w is not None]

# NOTE(review): each word is tagged on its own, without sentence context, so
# the tag is that of the first token spaCy produces for the isolated word.
pos_tags = {word: nlp(word)[0].pos_ for word in unique_words}

In [38]:
# Attach the cached part-of-speech tag to every token row; a word missing from
# the cache raises KeyError.
df_tokenized['POS'] = df_tokenized['Word'].map(pos_tags.__getitem__)

In [39]:
# Feature table: every column except the target label.
# NOTE(review): this keeps the numeric 'Sentence' id as a feature - confirm that is intended.
X = df_tokenized.drop(columns='Entity')
X.head()

Unnamed: 0,Sentence,Word,POS
0,1,help,VERB
1,1,me,PRON
2,1,automatically,ADV
3,1,turn,VERB
4,1,the,PRON


In [40]:
# Columns that will be fed to the vectorizer.
X.columns

Index(['Sentence', 'Word', 'POS'], dtype='object')

In [None]:
# One-hot encode the string columns and pass numeric columns through unchanged.
# The output is kept sparse: each row has only a handful of non-zero entries,
# and a dense array of (token rows x vocabulary-sized columns) would not fit in
# memory on the full dataset. The splitter and classifier used below accept
# sparse input.
v = DictVectorizer(sparse=True)
X = v.fit_transform(X.to_dict('records'))
X.shape

In [None]:
# Target vector: the entity label of every token row.
y = df_tokenized['Entity'].to_numpy()

In [None]:
# Sorted array of the distinct labels present in y.
classes = np.unique(y)

In [None]:
# Plain Python list of the distinct labels. It is derived from `y` rather than
# from `classes` itself so the cell can be re-run: a list has no .tolist().
classes = np.unique(y).tolist()
classes

In [None]:
# Feature matrix and target vector must have the same number of rows.
X.shape, y.shape

In [None]:
# Hold out a third of the token rows for evaluation; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [None]:
# Shapes of the training portion of the split.
X_train.shape, y_train.shape

### Naive Bayes classifier for multinomial models

In [32]:
%timeit
nb = MultinomialNB(alpha=0.01)
nb.partial_fit(X_train, y_train, classes)

In [40]:
# Per-label precision / recall / F1 on the held-out rows.
y_pred = nb.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred, labels=classes))

              precision    recall  f1-score   support

      ACTION       0.94      1.00      0.97      3247
      DEVICE       0.93      0.96      0.95      1662
    LOCATION       0.98      0.97      0.97      2497
           O       0.99      0.98      0.99     21995
  PREFERENCE       0.20      0.08      0.12        12
     SERVICE       0.70      0.95      0.80       110
       STATE       0.59      0.96      0.73       355
        TIME       0.88      1.00      0.93       115
        UNIT       1.00      0.99      1.00       648
         VAL       0.99      0.92      0.95      2308

    accuracy                           0.98     32949
   macro avg       0.82      0.88      0.84     32949
weighted avg       0.98      0.98      0.98     32949

