In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import re
import time
import gc

import nltk

In [2]:
import pickle
from sklearn.model_selection import GridSearchCV

In [3]:
# load category model
filename = 'Question_Classification_LinearSVM_model.pkl'
tuned_category_model = pickle.load(open(filename, 'rb'))

In [4]:
# load topic model
topic_filename = 'Question_Classification_LinearSVM_topic_model.pkl'
tuned_topic_model = pickle.load(open(filename, 'rb'))

In [5]:
# load vectorizer

vector_filename = 'Question_Classification_vectorizer.pkl'
vectorizer = pickle.load(open(vector_filename, 'rb'))

In [6]:
# load categories and class names

from ast import literal_eval as le

categories, topics = {},{}

with open('category_labels.txt','r') as f:
    categories = le(f.read())

with open('topic_labels.txt','r') as f:
    topics = le(f.read())

In [7]:
categories, topics

({'ABBR': 2, 'DESC': 0, 'ENTY': 1, 'HUM': 3, 'LOC': 5, 'NUM': 4},
 {'abb': 34,
  'animal': 2,
  'body': 22,
  'city': 21,
  'code': 39,
  'color': 19,
  'count': 13,
  'country': 18,
  'cremat': 1,
  'currency': 46,
  'date': 8,
  'def': 7,
  'desc': 12,
  'dismed': 23,
  'dist': 40,
  'event': 10,
  'exp': 3,
  'food': 17,
  'gr': 5,
  'ind': 4,
  'instru': 33,
  'lang': 37,
  'letter': 15,
  'manner': 0,
  'money': 25,
  'mount': 24,
  'ord': 43,
  'other': 14,
  'perc': 38,
  'period': 27,
  'plant': 30,
  'product': 26,
  'reason': 9,
  'religion': 16,
  'speed': 35,
  'sport': 29,
  'state': 11,
  'substance': 28,
  'symbol': 42,
  'techmeth': 31,
  'temp': 41,
  'termeq': 20,
  'title': 6,
  'veh': 44,
  'volsize': 32,
  'weight': 45,
  'word': 36})

In [8]:
def lookup(dictionary, value):
    
    """
        Get the key of a particular value in a dict.
        Input - Dictionary to map , Type : <dict>
        Output - key for the given value , Type : <str>
    """
    
    for k,v in dictionary.items():
        if v == value:
            return k
    
    return 'Not Found'

In [9]:
lookup(categories, 0)

'DESC'

In [21]:
def predict(text, category_model = tuned_category_model,
            topic_model = tuned_topic_model,
            vectorizer = vectorizer,
            categories = categories,
            topics = topics):
    
    #convert text to lower
    text = text.lower()
    
    #form feature vectors
    features = vectorizer.transform([text])
    
    #predict result category
    print('Using best category model : {}'.format(category_model))
    pred = category_model.predict(features)
    
    category = lookup(categories, pred[0])
    print('Category : {}'.format(category))
    
    #predict result topic
    print('\n\nUsing best topic model : {}'.format(topic_model))
    pred = topic_model.predict(features)
    
    topic = lookup(topics, pred[0])
    print('Topic : {}'.format(topic))
    
    return category, topic, category_model, topic_model

In [22]:
predict('Who is Aamir Syed?')

Using best category model : GridSearchCV(cv=None, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100, 1000]}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False, scoring=None, verbose=3)
Category : HUM


Using best topic model : GridSearchCV(cv=None, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
               

('HUM', 'exp', GridSearchCV(cv=None, error_score=nan,
              estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                  fit_intercept=True, intercept_scaling=1,
                                  loss='squared_hinge', max_iter=1000,
                                  multi_class='ovr', penalty='l2',
                                  random_state=None, tol=0.0001, verbose=0),
              iid='deprecated', n_jobs=None,
              param_grid={'C': [0.1, 1, 10, 100, 1000]}, pre_dispatch='2*n_jobs',
              refit=True, return_train_score=False, scoring=None, verbose=3), GridSearchCV(cv=None, error_score=nan,
              estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                  fit_intercept=True, intercept_scaling=1,
                                  loss='squared_hinge', max_iter=1000,
                                  multi_class='ovr', penalty='l2',
                                  random_state=None, tol=