In [129]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
import matplotlib.pyplot as plt

%matplotlib inline

# Rename the columns to match below incase they are not already

In [98]:
df = pd.read_csv("sample_data2.csv")
df.columns

Index(['category1', 'category2', 'query'], dtype='object')

In [135]:
df.head()

Unnamed: 0,category1,category2,query
0,abc,aaa,I was there at the place yesterday afternoon! ...
1,abc,aaa,i loved their concept of junction written by f...
2,abc,aaa,Junction located near Huaz Khas is one of the ...
3,abc,aaa,"We tried Chicken Seekh Kebab, Garlic Bread and..."
4,abc,bbb,This new property in Delhi is actually a junct...


# Combines the 2 categories into 1 to create a new y variable

In [100]:
#Combines the 2 categories into 1 to create a new y variable

def combine_columns(a):
    a['category3'] = a['category1'] +'__'+ a['category2']
    b = a[['category3','query']].dropna()
    return b

In [84]:
df2 = combine_columns(df)
len(df2)

74

# Filter out the categories that have less than N training samples

In [85]:
print(df2['category3'].value_counts())    #Check these values

N = 3  #Change this to a good number based on above values

nop__jjj    9
klm__ggg    8
abc__aaa    8
efg__bbb    6
qrs__kkk    6
hij__eee    6
abc__bbb    6
klm__hhh    5
efg__ddd    4
klm__iii    4
efg__ccc    4
hij__ddd    4
hij__fff    2
nop__iii    1
nop__kkk    1
Name: category3, dtype: int64


In [86]:
def remove_lowsamples(a):
    cnt_dict = dict(a['category3'].value_counts())
    good_categories = [k for k,v in cnt_dict.items() if v > N]
    b = a.loc[a['category3'].isin(good_categories)]
    return b

df3 = remove_lowsamples(df2)
len(df3)

70

# Prepping the training data

In [126]:
X = df3['query']
y = df3['category3']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)

# Pipeline and model training

In [127]:
tfidf = TfidfVectorizer(stop_words='english')
svm = LinearSVC()
pipe = Pipeline([('tfidf',tfidf),('svm',svm)])

pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

# Predictions & accuracy

In [128]:
pipe.score(X_test, y_test)

0.14285714285714285

# Dumping the model

In [131]:
joblib.dump(pipe,'hrquery_model_v1.pkl')

['hrquery_model_v1.pkl']

In [134]:
loaded_model = joblib.load('hrquery_model_v1.pkl')
loaded_model

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])