In [558]:
import pandas as pd
import numpy as np

In [559]:
train = pd.read_csv('train.csv', encoding='latin1')
test = pd.read_csv('test.csv', encoding='latin1')

In [560]:
print(train.shape)
print(test.shape)

(350, 3)
(5234, 2)


In [561]:
train.head()

Unnamed: 0,Id,Sentence,Category
0,0,play something from 1971 by john bonham,PlayMusic
1,1,i d like to give a two rating to the abolition...,RateBook
2,2,where can i locate the show the return of mr moto,SearchCreativeWork
3,3,is there a game called the neutral zone,SearchCreativeWork
4,4,give this textbook zero out of 6 points,RateBook


In [562]:
test.head()

Unnamed: 0,Id,Sentence
0,0,book me a table for 4 in a restaurant in unite...
1,1,add brad kane to the pumping iron soundtrack
2,2,play doctor fink if i could choose
3,3,i want to listen to an album sorted by last op...
4,4,put some mac wiseman in my latino caliente pla...


In [563]:
train.isnull().sum()/len(train)*100

Id          0.0
Sentence    0.0
Category    0.0
dtype: float64

In [564]:
test.isnull().sum()/len(train)*100

Id          0.0
Sentence    0.0
dtype: float64

In [565]:
train['Category'].value_counts()

Category
PlayMusic               50
RateBook                50
SearchCreativeWork      50
AddToPlaylist           50
GetWeather              50
BookRestaurant          50
SearchScreeningEvent    50
Name: count, dtype: int64

In [566]:
from sklearn.preprocessing import LabelEncoder

In [567]:
le = LabelEncoder()

In [568]:
train['Category'] = le.fit_transform(train['Category'])

In [569]:
train.head()

Unnamed: 0,Id,Sentence,Category
0,0,play something from 1971 by john bonham,3
1,1,i d like to give a two rating to the abolition...,4
2,2,where can i locate the show the return of mr moto,5
3,3,is there a game called the neutral zone,5
4,4,give this textbook zero out of 6 points,4


In [570]:
train['Category'].unique()

array([3, 4, 5, 0, 2, 1, 6])

In [571]:
text = train['Sentence']

In [572]:
text.head()

0              play something from 1971 by john bonham
1    i d like to give a two rating to the abolition...
2    where can i locate the show the return of mr moto
3              is there a game called the neutral zone
4              give this textbook zero out of 6 points
Name: Sentence, dtype: object

In [573]:
text.shape

(350,)

## Word Tokenization and Removing StopWords

In [574]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [575]:
text[0]

'play something from 1971 by john bonham'

In [576]:
stop_words = stopwords.words('english')
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [577]:
## Pre-processing the text message

message = []
for i in range(len(text)):
    s = word_tokenize(text[i])
    temp = []  
    for j in s:
        if len(j) > 1 and j not in stop_words:  
            temp.append(j)
    t = ' '.join(temp)  
    message.append(t)  

print(message)

['play something 1971 john bonham', 'like give two rating abolition britain', 'locate show return mr moto', 'game called neutral zone', 'give textbook zero points', 'add roy rosana playlist', 'play turbulence wild streetdanz jeff buckley', 'like see picture teleform', 'going warm today saint martin port orford', 'book crepes restaurant sint maarten', 'get chillier within walking distance pw', 'add opus de funk list acoustic blues', 'red toby nemiciamici start thirteen hours', 'play song 2006', 'find attack surface analyzer painting', 'please rate current textbook stars', 'humid parc national de killarney', 'add tunnel love ethel metal crash course palylist', 'find schedule films star theatres', 'play music mark heard', 'play adrian borland music zvooq', 'weather forecast ten years mount victory macao', 'give zero stars current album', 'play music last fm', 'movie times consolidated theatres', 'show immortal grand prix', 'let play album handover deezer', 'play playlist funtime', 'rate h

In [578]:
for i in range(len(train)):
    train.loc[i,'Sentence'] = message[i]

In [579]:
train.head(4)

Unnamed: 0,Id,Sentence,Category
0,0,play something 1971 john bonham,3
1,1,like give two rating abolition britain,4
2,2,locate show return mr moto,5
3,3,game called neutral zone,5


In [580]:
train.shape

(350, 3)

In [581]:
text = train['Sentence']

## Doing Lemmitization

In [582]:
import re
from nltk.stem import WordNetLemmatizer

In [583]:
lemma  = WordNetLemmatizer()

In [584]:
corpus = []
for i in range(len(text)):
    temp = re.sub('[^a-zA-Z]',' ',text[i])
    temp = temp.lower()
    temp = temp.split()
    temp = [lemma.lemmatize(p) for p in temp if p not in set(stopwords.words('english'))]
    temp = ' '.join(temp)
    corpus.append(temp)

In [None]:
for i in range(len(train)):
    train.loc[i,'Sentence'] = corpus[i]

In [585]:
train.head(4)

Unnamed: 0,Id,Sentence,Category
0,0,play something 1971 john bonham,3
1,1,like give two rating abolition britain,4
2,2,locate show return mr moto,5
3,3,game called neutral zone,5


In [586]:
test.head(4)

Unnamed: 0,Id,Sentence
0,0,book me a table for 4 in a restaurant in unite...
1,1,add brad kane to the pumping iron soundtrack
2,2,play doctor fink if i could choose
3,3,i want to listen to an album sorted by last op...


In [587]:
print(train.shape)
print(test.shape)

(350, 3)
(5234, 2)


In [588]:
train.drop(columns=['Id'],inplace=True)
test.drop(columns=['Id'],inplace=True)

In [589]:
print(train.shape)
print(test.shape)

(350, 2)
(5234, 1)


In [590]:
train.head(3)

Unnamed: 0,Sentence,Category
0,play something 1971 john bonham,3
1,like give two rating abolition britain,4
2,locate show return mr moto,5


In [591]:
test.head(3)

Unnamed: 0,Sentence
0,book me a table for 4 in a restaurant in unite...
1,add brad kane to the pumping iron soundtrack
2,play doctor fink if i could choose


In [592]:
train.head()

Unnamed: 0,Sentence,Category
0,play something 1971 john bonham,3
1,like give two rating abolition britain,4
2,locate show return mr moto,5
3,game called neutral zone,5
4,give textbook zero points,4


In [593]:
test.head()

Unnamed: 0,Sentence
0,book me a table for 4 in a restaurant in unite...
1,add brad kane to the pumping iron soundtrack
2,play doctor fink if i could choose
3,i want to listen to an album sorted by last op...
4,put some mac wiseman in my latino caliente pla...


In [594]:
x_train , x_test , y_train , y_test = train_test_split(train['Sentence'] , train['Category'] , 
                                                       test_size=0.3 ,random_state=101)

In [595]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [596]:
print(x_train.shape)
print(y_train.shape)

print(x_test.shape)
print(y_test.shape)

(245,)
(245,)
(105,)
(105,)


## Bag of words (bow)

In [509]:
from sklearn.feature_extraction.text import CountVectorizer

In [510]:
cv = CountVectorizer(max_features=100)
bow_x = cv.fit_transform(train['Sentence']).toarray()

In [511]:
bow_x.shape

(350, 100)

In [512]:
bow_x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [513]:
train_df = pd.DataFrame(bow_x)

In [514]:
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [515]:
train_df['target'] = train['Category']

In [516]:
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,target
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,4


## Train Test Split

In [517]:
from sklearn.model_selection import train_test_split

In [518]:
x = train_df.iloc[:,:-1]

In [519]:
x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [520]:
y = train_df.iloc[:,-1]

In [521]:
y.head()

0    3
1    4
2    5
3    5
4    4
Name: target, dtype: int32

In [522]:
x_train , x_test , y_train , y_test = train_test_split(x , y , 
                                                       test_size=0.3 ,random_state= 108 ,  stratify=y)

In [523]:
print(x_train.shape)
print(x_test.shape)

print(y_train.shape)
print(y_test.shape)

(245, 100)
(105, 100)
(245,)
(105,)


## Budiling Model

In [598]:
## Budiling Model
# pipeline package to merge NLP and Machine Learning together and get the output
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [662]:
rf = Pipeline([
                       ('TF-IDF', TfidfVectorizer()),
                      ('Classifier',RandomForestClassifier())
                      ])

In [663]:
rf.fit(x_train,y_train)

In [664]:
y_pred_train = rf.predict(x_train)
y_pred_test = rf.predict(x_test)

In [665]:
# Evaluation Matrix
from sklearn.metrics import confusion_matrix  , classification_report , accuracy_score

In [666]:
print(confusion_matrix(y_train,y_pred_train))
print("***********"*10)
print(confusion_matrix(y_test,y_pred_test))

[[37  0  0  0  0  0  0]
 [ 0 32  0  0  0  0  0]
 [ 0  0 38  0  0  0  0]
 [ 0  0  0 37  0  0  0]
 [ 0  0  0  0 31  0  0]
 [ 0  0  0  0  0 38  0]
 [ 0  0  0  0  0  0 32]]
**************************************************************************************************************
[[13  0  0  0  0  0  0]
 [ 0 17  0  0  0  1  0]
 [ 0  0 12  0  0  0  0]
 [ 0  0  1 12  0  0  0]
 [ 0  1  2  0 16  0  0]
 [ 0  0  0  0  0 12  0]
 [ 0  0  1  0  0  4 13]]


In [667]:
print(classification_report(y_train,y_pred_train))
print("***********"*10)
print(classification_report(y_test,y_pred_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        37
           1       1.00      1.00      1.00        32
           2       1.00      1.00      1.00        38
           3       1.00      1.00      1.00        37
           4       1.00      1.00      1.00        31
           5       1.00      1.00      1.00        38
           6       1.00      1.00      1.00        32

    accuracy                           1.00       245
   macro avg       1.00      1.00      1.00       245
weighted avg       1.00      1.00      1.00       245

**************************************************************************************************************
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.94      0.94      0.94        18
           2       0.75      1.00      0.86        12
           3       1.00      0.92      0.96        13
           4       1.

In [668]:
print(accuracy_score(y_train,y_pred_train))
print("***********"*10)
print(accuracy_score(y_test,y_pred_test))

1.0
**************************************************************************************************************
0.9047619047619048


In [669]:
test.head()

Unnamed: 0,Sentence
0,book table restaurant united kingdom
1,add brad kane pumping iron soundtrack
2,play doctor fink could choose
3,want listen album sorted last open google musi...
4,put mac wiseman latino caliente playlist


In [670]:
text2 = test['Sentence']

In [671]:
## Pre-processing the text message

message = []
for i in range(len(text2)):
    s = word_tokenize(text2[i])
    temp = []  
    for j in s:
        if len(j) > 1 and j not in stop_words:  
            temp.append(j)
    t = ' '.join(temp)  
    message.append(t)  

print(message)



In [672]:
for i in range(len(test)):
    test.loc[i,'Sentence'] = message[i]

In [673]:
text2 = test['Sentence']

In [674]:
corpus = []
for i in range(len(text2)):
    temp = re.sub('[^a-zA-Z]',' ',text2[i])
    temp = temp.lower()
    temp = temp.split()
    temp = [lemma.lemmatize(p) for p in temp if p not in set(stopwords.words('english'))]
    temp = ' '.join(temp)
    corpus.append(temp)

In [675]:
for i in range(len(test)):
    test.loc[i,'Sentence'] = corpus[i]

In [676]:
test.head()

Unnamed: 0,Sentence
0,book table restaurant united kingdom
1,add brad kane pumping iron soundtrack
2,play doctor fink could choose
3,want listen album sorted last open google musi...
4,put mac wiseman latino caliente playlist


In [677]:
test.shape

(5234, 1)

In [678]:
bow_test = cv.fit_transform(test['Sentence']).toarray()

In [679]:
bow_test

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [680]:
bow_test.shape

(5234, 100)

In [681]:
test.head()

Unnamed: 0,Sentence
0,book table restaurant united kingdom
1,add brad kane pumping iron soundtrack
2,play doctor fink could choose
3,want listen album sorted last open google musi...
4,put mac wiseman latino caliente playlist


In [682]:
y_pred_test_submission = rf.predict(test['Sentence'])

In [683]:
y_pred_test_submission

array([1, 0, 3, ..., 1, 1, 2])

In [684]:
y_pred_original = le.inverse_transform(y_pred_test_submission)

In [685]:
y_pred_original

array(['BookRestaurant', 'AddToPlaylist', 'PlayMusic', ...,
       'BookRestaurant', 'BookRestaurant', 'GetWeather'], dtype=object)

In [686]:
y_pred_original.shape

(5234,)

In [687]:
test.head()

Unnamed: 0,Sentence
0,book table restaurant united kingdom
1,add brad kane pumping iron soundtrack
2,play doctor fink could choose
3,want listen album sorted last open google musi...
4,put mac wiseman latino caliente playlist


In [688]:
test.shape

(5234, 1)

In [689]:
submission = pd.read_csv('text_classfication_submission_sample.csv')

In [690]:
submission.head()

Unnamed: 0,Id,Category
0,0,BookRestaurant
1,1,BookRestaurant
2,2,BookRestaurant
3,3,BookRestaurant
4,4,BookRestaurant


In [691]:
submission.shape

(5234, 2)

In [692]:
for i in range(len(submission)):
    submission.loc[i , 'Category'] = y_pred_original[i]

In [693]:
submission.head()

Unnamed: 0,Id,Category
0,0,BookRestaurant
1,1,AddToPlaylist
2,2,PlayMusic
3,3,PlayMusic
4,4,AddToPlaylist


In [694]:
submission.to_csv('submission_tfidf_rf.csv', index=False)  # 'index=False' prevents saving the index column
