In [152]:
import pandas as pd
import json
import joblib

# Load Dataset

In [153]:
# Read the intents file. The encoding is given explicitly so decoding does not
# depend on the platform's default locale (JSON interchange text is UTF-8).
# NOTE(review): the absolute path is environment-specific; adjust when running elsewhere.
with open('/content/Intent.json', encoding='utf-8') as f:
    data = json.load(f)

In [154]:
# Build a DataFrame from the list stored under the top-level "intents" key of the loaded JSON.
df = pd.DataFrame(data['intents'])

In [155]:
df.head()

Unnamed: 0,intent,text,responses,extension,context,entityType,entities
0,Greeting,"[Hi, Hi there, Hola, Hello, Hello there, Hya, ...","[Hi human, please tell me your GeniSys user, H...","{'function': '', 'entities': False, 'responses...","{'in': '', 'out': 'GreetingUserRequest', 'clea...",,[]
1,GreetingResponse,"[My user is Adam, This is Adam, I am Adam, It ...","[Great! Hi <HUMAN>! How can I help?, Good! Hi ...","{'function': 'extensions.gHumans.updateHuman',...","{'in': 'GreetingUserRequest', 'out': '', 'clea...",,"[{'entity': 'HUMAN', 'rangeFrom': 3, 'rangeTo'..."
2,CourtesyGreeting,"[How are you?, Hi how are you?, Hello how are ...","[Hello, I am great, how are you? Please tell m...","{'function': '', 'entities': False, 'responses...","{'in': '', 'out': 'CourtesyGreetingUserRequest...",,[]
3,CourtesyGreetingResponse,"[Good thanks! My user is Adam, Good thanks! Th...","[Great! Hi <HUMAN>! How can I help?, Good! Hi ...","{'function': 'extensions.gHumans.updateHuman',...","{'in': 'GreetingUserRequest', 'out': '', 'clea...",,"[{'entity': 'HUMAN', 'rangeFrom': 5, 'rangeTo'..."
4,CurrentHumanQuery,"[What is my name?, What do you call me?, Who d...","[You are <HUMAN>! How can I help?, Your name i...",{'function': 'extensions.gHumans.getCurrentHum...,"{'in': '', 'out': 'CurrentHumanQuery', 'clear'...",,[]


In [156]:
# Keep only the label column and its example utterances; the other
# columns are not used by the rest of this notebook.
df = df.loc[:, ["intent", "text"]]
df.head()

Unnamed: 0,intent,text
0,Greeting,"[Hi, Hi there, Hola, Hello, Hello there, Hya, ..."
1,GreetingResponse,"[My user is Adam, This is Adam, I am Adam, It ..."
2,CourtesyGreeting,"[How are you?, Hi how are you?, Hello how are ..."
3,CourtesyGreetingResponse,"[Good thanks! My user is Adam, Good thanks! Th..."
4,CurrentHumanQuery,"[What is my name?, What do you call me?, Who d..."


In [157]:
# Each 'text' cell holds a list of utterances: expand it so every utterance
# gets its own row (the intent label is repeated, and so is the index).
df = df.explode(column='text')
df.head()

Unnamed: 0,intent,text
0,Greeting,Hi
0,Greeting,Hi there
0,Greeting,Hola
0,Greeting,Hello
0,Greeting,Hello there


In [158]:
# explode() repeats the parent row's index for every utterance; renumber 0..n-1.
# Reassign instead of using inplace=True so the frame displayed by the previous
# cell is not mutated behind the reader's back.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,intent,text
0,Greeting,Hi
1,Greeting,Hi there
2,Greeting,Hola
3,Greeting,Hello
4,Greeting,Hello there


# Preprocess Text

In [159]:
import nltk
import string
import re

from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')

from nltk.corpus import stopwords

nltk.download('punkt')

ps = PorterStemmer()

def clean_text(text):
    """Normalise one utterance before vectorisation.

    Steps, in order: lowercase, strip URLs, tokenise with
    ``nltk.word_tokenize``, drop English stop words, Porter-stem the rest.

    Parameters
    ----------
    text : str
        Raw utterance.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces. Punctuation
        tokens emitted by the tokenizer are not removed here. The result may
        be an empty string when every token is a stop word.
    """
    text = text.lower()
    # "https?" matches http:// and https:// only; the previous pattern
    # ("http.?") accepted any character after "http".
    text = re.sub(r'https?://\S+\s?', '', text)

    # Build the stop-word set once per call. Previously the corpus list was
    # re-created and linearly scanned for every single token.
    stop_words = set(stopwords.words('english'))

    kept_tokens = (
        token for token in nltk.word_tokenize(text) if token not in stop_words
    )
    return " ".join(ps.stem(token) for token in kept_tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [160]:
# Run the preprocessing function over every utterance and keep the result
# next to the raw text so the two can be compared.
df['text_cleaned'] = df['text'].map(clean_text)
df.head()

Unnamed: 0,intent,text,text_cleaned
0,Greeting,Hi,hi
1,Greeting,Hi there,hi
2,Greeting,Hola,hola
3,Greeting,Hello,hello
4,Greeting,Hello there,hello


In [161]:
df.groupby('intent').describe()

Unnamed: 0_level_0,text,text,text,text,text_cleaned,text_cleaned,text_cleaned,text_cleaned
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
intent,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Clever,7,7,You are very clever,1,7,5,clever girl,2
CourtesyGoodBye,6,6,"Thanks, bye",1,6,5,"thank , bye",2
CourtesyGreeting,7,7,How are you?,1,7,6,?,2
CourtesyGreetingResponse,8,8,Good thanks! My user is Adam,1,8,4,good thank ! adam,3
CurrentHumanQuery,7,7,What is my name?,1,7,6,think ?,2
GoodBye,4,4,Bye,1,4,4,bye,1
Gossip,6,6,I am bored gossip with me,1,6,5,tell gossip,2
Greeting,7,7,Hi,1,7,4,hi,2
GreetingResponse,8,8,My user is Adam,1,8,4,adam,3
Jokes,6,6,Tell me a joke,1,6,6,tell joke,1


# Feature Extraction

In [162]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [163]:
# TF-IDF vectorizer with its max_features cap set to 3000; it is fitted in the next cell.
tf_vec = TfidfVectorizer(max_features=3000)

In [164]:
# Fit the vectorizer on the cleaned utterances and materialise the result as a dense array.
# NOTE(review): the vectorizer is fitted on every row *before* the train/test split further
# down, so its vocabulary and IDF weights also reflect the held-out utterances.
X = tf_vec.fit_transform(df['text_cleaned']).toarray()

In [165]:
# Target labels: the intent name of each utterance, as a NumPy array.
Y = df['intent'].to_numpy()

In [166]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Train Model

In [167]:
from sklearn.model_selection import train_test_split

In [168]:
# Hold out part of the data for evaluation; random_state pins the shuffle so the split is repeatable.
# NOTE(review): no stratify= is passed, so the small intent classes may be unevenly
# represented (or absent) on one side of the split — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=2)

In [169]:
from sklearn.metrics import accuracy_score

In [170]:
from sklearn.ensemble import RandomForestClassifier

In [171]:
# Seed the forest so that Restart & Run All reproduces the same trees and the
# same accuracy; without random_state every run trains a different model.
# The value 2 mirrors the seed already used for train_test_split.
model = RandomForestClassifier(random_state=2)
model.fit(X_train,y_train)

In [172]:
# Predict the held-out rows and report the fraction labelled correctly.
y_pred = model.predict(X=X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.5277777777777778


In [173]:
def get_input(text):
  """Convert one raw utterance into a TF-IDF feature row.

  The text is passed through ``clean_text`` and then through the
  already-fitted module-level ``tf_vec``; the transformed matrix is returned.
  """
  cleaned = pd.Series(clean_text(text))
  return tf_vec.transform(cleaned)

In [174]:
class model_intent():
  def predict_intent(text):
    pred = get_input(text)
    y_pr = model.predict_proba(pred).max()
    if (y_pr < 0.4):
      print("NLU fallback: Intent could not be confidently determined")
    else:
      print(model.predict(pred))

In [175]:
filename = 'model.sav'
joblib.dump(model_intent, filename)

['model.sav']

In [176]:
# Round-trip check: reload the artifact that was just written and run one query through it.
# joblib.load unpickles the file, so only load artifacts that come from a trusted source.
loaded_model = joblib.load(filename)
result = loaded_model.predict_intent("Who are you?")

['CourtesyGreeting']
