In [85]:
import nltk
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer   
from sklearn.preprocessing import LabelEncoder


In [86]:
# Load data.json into a DataFrame.
df = pd.read_json('data.json')

In [87]:
# Collect the 'tag' value of every entry in the 'intents' column.
tags = [intent['tag'] for intent in df['intents']]

In [88]:
# Inspect the extracted intent tags.
tags

['greeting',
 'goodbye',
 'thanks',
 'product_info',
 'product_types',
 'product_size',
 'price_range',
 'offers',
 'order_status',
 'return_policy',
 'delivery',
 'store_location',
 'small_talk',
 'joke',
 'brand_story',
 'contact_support',
 'name']

In [89]:
# Load data1.csv; malformed rows are skipped instead of raising a parse error.
df1 = pd.read_csv('data1.csv', on_bad_lines='skip')

In [90]:
# Inspect the loaded DataFrame.
df1

Unnamed: 0,text,label
0,Hi,greeting
1,Hello,greeting
2,Hey,greeting
3,Good morning,greeting
4,Good evening,greeting
...,...,...
64,I have an issue,contact_support
65,Support please,contact_support
66,What is your name,name
67,Who are you,name


In [91]:
# Work on an independent copy of the 'text' column so df1 itself is not modified.
text = df1['text'].copy()

In [92]:
# Inspect the raw text Series before cleaning.
text

0                    Hi
1                 Hello
2                   Hey
3          Good morning
4          Good evening
            ...        
64      I have an issue
65       Support please
66    What is your name
67          Who are you
68     Your name please
Name: text, Length: 69, dtype: object

In [93]:
def cleaner(text):
    """Normalise a single string for vectorisation.

    The input is lower-cased, tokenised with NLTK's ``word_tokenize``,
    reduced to its purely alphabetic tokens, Porter-stemmed, and the
    resulting stems are joined back together with single spaces.
    """
    stemmer = PorterStemmer()
    tokens = nltk.word_tokenize(text.lower())
    # Drop every token that is not made up solely of letters.
    alphabetic = filter(str.isalpha, tokens)
    return ' '.join(map(stemmer.stem, alphabetic))

In [94]:
# Derive the cleaned text from the raw column rather than from `text` itself,
# so re-running this cell never feeds already-stemmed text back through
# `cleaner`. `apply` returns a new Series, so df1 is left untouched.
text = df1['text'].apply(cleaner)

In [95]:
# Inspect the text Series after `cleaner` has been applied.
text

0                    hi
1                 hello
2                   hey
3             good morn
4             good even
            ...        
64       i have an issu
65        support pleas
66    what is your name
67          who are you
68      your name pleas
Name: text, Length: 69, dtype: object

In [96]:
# Count-based vectoriser that extracts both unigrams and bigrams.
cv = CountVectorizer(ngram_range=(1, 2))

In [97]:
# Learn the vocabulary from the cleaned text and build the document-term count matrix.
vec = cv.fit_transform(text)

In [98]:
# Inspect the fitted mapping from each term (unigram or bigram) to its column index.
cv.vocabulary_

{'hi': 73,
 'hello': 70,
 'hey': 72,
 'good': 57,
 'morn': 103,
 'good morn': 59,
 'even': 47,
 'good even': 58,
 'is': 80,
 'anyon': 12,
 'there': 181,
 'is anyon': 81,
 'anyon there': 13,
 'bye': 21,
 'see': 146,
 'you': 220,
 'see you': 147,
 'talk': 169,
 'to': 182,
 'later': 91,
 'talk to': 170,
 'to you': 185,
 'you later': 223,
 'goodby': 60,
 'catch': 26,
 'catch you': 27,
 'thank': 173,
 'thank you': 174,
 'that': 175,
 'help': 71,
 'that help': 176,
 'so': 156,
 'much': 104,
 'you so': 226,
 'so much': 157,
 'appreci': 14,
 'it': 86,
 'appreci it': 15,
 'tell': 171,
 'me': 98,
 'about': 0,
 'radem': 135,
 'pant': 123,
 'tell me': 172,
 'me about': 99,
 'about radem': 3,
 'radem pant': 136,
 'what': 197,
 'are': 16,
 'what are': 198,
 'are radem': 17,
 'make': 95,
 'your': 227,
 'special': 160,
 'what make': 201,
 'make your': 97,
 'your pant': 229,
 'pant special': 126,
 'whi': 211,
 'should': 151,
 'choos': 28,
 'whi should': 212,
 'should choos': 152,
 'choos radem': 29,
 '

In [99]:
# Inspect the count matrix as a dense NumPy array.
vec.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [100]:
# Encoder that maps the string labels to integer class ids.
le = LabelEncoder()

In [101]:
# Take an independent copy of the 'label' column, then fit the encoder on it
# and obtain the integer-encoded targets.
label = df1['label'].copy()
y = le.fit_transform(label)

In [102]:
# Inspect the integer-encoded labels.
y

array([ 4,  4,  4,  4,  4,  4,  3,  3,  3,  3,  3, 16, 16, 16, 16, 16, 10,
       10, 10, 10, 12, 12, 12, 12, 11, 11, 11, 11,  9,  9,  9,  9,  7,  7,
        7,  7,  8,  8,  8,  8, 13, 13, 13,  2,  2,  2,  2, 15, 15, 15, 15,
       14, 14, 14, 14,  5,  5,  5,  0,  0,  0,  0,  1,  1,  1,  1,  6,  6,
        6])

In [103]:
from sklearn.ensemble import RandomForestClassifier

In [104]:
# Fix the seed so the trained forest (and therefore the pickled model) is
# reproducible across kernel restarts; without it every run fits a different model.
modelR = RandomForestClassifier(random_state=42)

In [105]:
# Train the classifier on the dense count matrix and the integer-encoded labels.
modelR.fit(vec.toarray(), y)

In [106]:
# Sanity check: predict the class of the first training row.
# Slicing the sparse matrix first yields the same (1, n_features) dense row.
modelR.predict(vec[0].toarray())

array([4])

In [107]:
# (n_documents, n_features) of the count matrix; no need to densify it to read the shape.
vec.shape

(69, 234)

In [108]:
from sklearn.pipeline import make_pipeline

In [109]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer


def clean_texts(texts):
    """Apply `cleaner` to every document in an iterable of raw strings.

    Returns a list of cleaned strings, one per input document, in order.
    """
    return [cleaner(doc) for doc in texts]


# Every non-final Pipeline step must be a transformer object exposing
# fit/transform. The bare `cleaner` function is not one (and it handles a
# single string, not a batch), so calling predict on the pipeline would fail
# with an AttributeError. FunctionTransformer wraps the batch helper as a
# stateless transformer.
#
# `cv` and `modelR` were already fitted in earlier cells, so the pipeline can
# be used for prediction on raw strings without fitting it again.
#
# NOTE: pickle stores `clean_texts` and `cleaner` by reference, so any code
# that unpickles this pipeline must be able to resolve both functions under
# the same module path they have here.
pipelines = Pipeline([
    ('cleaner', FunctionTransformer(clean_texts)),
    ('cv', cv),
    ('modelR', modelR)
])

In [110]:
import pickle

# Serialize the assembled pipeline object to model1.pkl (binary mode is required by pickle).
with open('model1.pkl', 'wb') as file:
    pickle.dump(pipelines, file)

In [111]:
# Inspect the class names learned by the label encoder; position i corresponds to encoded id i.
le.classes_

array(['brand_story', 'contact_support', 'delivery', 'goodbye',
       'greeting', 'joke', 'name', 'offers', 'order_status',
       'price_range', 'product_info', 'product_size', 'product_types',
       'return_policy', 'small_talk', 'store_location', 'thanks'],
      dtype=object)

In [112]:
# Map the first encoded label back to its class name.
le.classes_[y[0]]

'greeting'

In [113]:
# Persist the encoder's class-name array to indexes.pkl so encoded ids can be
# mapped back to label names outside this notebook.
with open('indexes.pkl', 'wb') as file:
    pickle.dump(le.classes_, file)

In [114]:
# Persist the fitted term -> column-index vocabulary to vocab.pkl.
with open('vocab.pkl', 'wb') as file:
    pickle.dump(cv.vocabulary_,file)