In [19]:
import pandas as pd
# Load the labelled intent questions; the path is relative to the notebook's working directory.
DATA_PATH = 'intent_data.csv'
df = pd.read_csv(DATA_PATH)

In [20]:
from io import StringIO
# Name the two loaded columns: the intent label and the question text.
# NOTE: this assignment expects exactly two columns, so re-running the cell after
# `category_id` has been added to df fails with a length mismatch; reload the CSV first.
df.columns = ['class', 'question']

# Drop rows that have no question text. The explicit .copy() makes `df` an
# independent frame rather than a slice of the loaded one, so adding columns to it
# in later cells does not trigger pandas' SettingWithCopyWarning.
df = df[df['question'].notna()].copy()
df.head()

Unnamed: 0,class,question
0,Description,What is Filename injection Path traversal ?
1,Description,What does Filename injection Path traversal m...
2,Description,Tell me something about Filename injection Pa...
3,Description,Filename injection Path traversal
4,Description,Explain Filename injection Path traversal ?


In [21]:
# Preview the integer code that factorize assigns to each row's class label.
pd.factorize(df['class'])[0]

array([0, 0, 0, ..., 2, 2, 2], dtype=int64)

In [22]:
# Store each row's integer class code next to its string label.
df['category_id'] = pd.factorize(df['class'])[0]

In [23]:
# One row per distinct (class, category_id) pair, ordered by the integer id.
category_id_df = (
    df.loc[:, ['class', 'category_id']]
    .drop_duplicates()
    .sort_values(by='category_id')
)

In [24]:
# Lookup from class label to integer id, built from the (class, category_id) rows.
category_to_id = {label: cat_id for label, cat_id in category_id_df.values}
category_to_id

{'Code': 2, 'Description': 0, 'Solution': 1}

In [25]:
# Reverse lookup from integer id back to class label.
id_to_category = {
    cat_id: label
    for cat_id, label in category_id_df[['category_id', 'class']].values
}
id_to_category

{0: 'Description', 1: 'Solution', 2: 'Code'}

In [26]:
# Check that category_id now sits alongside each question.
df.head(n=5)

Unnamed: 0,class,question,category_id
0,Description,What is Filename injection Path traversal ?,0
1,Description,What does Filename injection Path traversal m...,0
2,Description,Tell me something about Filename injection Pa...,0
3,Description,Filename injection Path traversal,0
4,Description,Explain Filename injection Path traversal ?,0


In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams with English stop words removed.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)

# Dense document-term matrix fitted on every question in df; the integer class
# ids are the targets.
features = tfidf.fit_transform(df['question']).toarray()
labels = df['category_id']

In [35]:
from sklearn.feature_selection import chi2
import numpy as np

N = 2  # how many top-scoring terms to print per n-gram size

# The vocabulary is the same for every class, so resolve it once instead of on
# each loop iteration. get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); fall back to the old name on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
  vocabulary = np.asarray(tfidf.get_feature_names_out())
else:
  vocabulary = np.array(tfidf.get_feature_names())

for classs, category_id in sorted(category_to_id.items()):
  # One-vs-rest chi-squared scores: this class against all other rows.
  features_chi2 = chi2(features, labels == category_id)
  # argsort is ascending, so the highest-scoring terms end up last.
  indices = np.argsort(features_chi2[0])
  feature_names = vocabulary[indices]
  # Terms are space-joined n-grams; split them by word count.
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  print("# '{}':".format(classs))
  print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
  print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# 'Code':
  . Most correlated unigrams:
. example
. code
  . Most correlated bigrams:
. sample code
. code example
# 'Description':
  . Most correlated unigrams:
. know
. tell
  . Most correlated bigrams:
. want know
. know xss
# 'Solution':
  . Most correlated unigrams:
. tell
. solution
  . Most correlated bigrams:
. code example
. provide solution


In [36]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [40]:
# Split the raw question text and string class labels into train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    df['question'], df['class'], random_state=0
)

# Bag-of-words counts, then TF-IDF weights; both are fitted on the training part only.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Naive Bayes baseline trained on the TF-IDF training matrix.
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.model_selection import cross_val_score

In [42]:
# Candidate classifiers to compare under cross-validation.
models = [
    RandomForestClassifier(
        n_estimators=200,
        max_depth=3,
        random_state=0,
    ),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

In [49]:
CV = 5  # number of cross-validation folds

# Collect one (model, fold, accuracy) record per fold and build the frame once at
# the end. The empty DataFrame that used to be pre-allocated here was overwritten
# before it was ever read, so it has been removed.
# NOTE(review): `features` comes from a TF-IDF vectorizer fitted on the whole
# dataset, so each held-out fold has already influenced the vocabulary and IDF
# weights; treat these scores as optimistic.
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
cv_df



Unnamed: 0,model_name,fold_idx,accuracy
0,RandomForestClassifier,0,0.820418
1,RandomForestClassifier,1,0.790488
2,RandomForestClassifier,2,0.785978
3,RandomForestClassifier,3,0.831419
4,RandomForestClassifier,4,0.83094
5,LinearSVC,0,1.0
6,LinearSVC,1,1.0
7,LinearSVC,2,1.0
8,LinearSVC,3,1.0
9,LinearSVC,4,1.0


In [50]:
# Mean cross-validated accuracy per model.
cv_df.groupby('model_name')['accuracy'].mean()

model_name
LinearSVC                 1.000000
LogisticRegression        0.999672
MultinomialNB             0.978921
RandomForestClassifier    0.811849
Name: accuracy, dtype: float64

In [51]:
# Refit the top-scoring model from cross-validation on one hold-out split
# (33% test). The row index is split alongside the data so each prediction can be
# traced back to its row in df.
model = LinearSVC()

(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    features, labels, df.index, test_size=0.33, random_state=0
)

# fit() returns the estimator itself, so fitting and predicting can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

In [61]:
from sklearn import metrics
# Per-class precision / recall / F1 on the hold-out split. The display names come
# from df['class'].unique(), which lists labels in the same first-appearance order
# that factorize() used when numbering them.
class_names = df['class'].unique()
report = metrics.classification_report(y_test, y_pred, target_names=class_names)
print(report)

              precision    recall  f1-score   support

 Description       1.00      1.00      1.00      1836
    Solution       1.00      1.00      1.00      1719
        Code       1.00      1.00      1.00       469

   micro avg       1.00      1.00      1.00      4024
   macro avg       1.00      1.00      1.00      4024
weighted avg       1.00      1.00      1.00      4024

