In [1]:
import pickle
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import punkt
from nltk.corpus.reader import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [49]:
# Directory (relative to the notebook's working directory) holding the pickled models.
path_models = "Models/"

# Classifier consumed by predict_from_text via the module-level name `svc_model`.
# NOTE(review): the variable names say "SVM"/"svc" but the file is
# 'best_lrc.pickle' -- confirm which estimator this pickle actually contains.
path_svm = f"{path_models}best_lrc.pickle"
with open(path_svm, mode='rb') as data:
    # pickle.load can execute arbitrary code; only load artifacts you trust.
    svc_model = pickle.load(data)

In [50]:
# Fitted vectorizer consumed by create_features_from_text via the module-level name `tfidf`.
# NOTE(review): this is an absolute, machine-specific path, unlike the relative
# "Models/" directory used for the classifier -- the notebook will not run elsewhere
# until this points at a location that exists on the current machine.
path_tfidf = "D:/Projects/Blooms/Feature Engineering/Pickles/tfidf.pickle"
with open(path_tfidf, mode='rb') as data:
    # pickle.load can execute arbitrary code; only load artifacts you trust.
    tfidf = pickle.load(data)

In [51]:
# Maps each category label ('BT1' .. 'BT6') to the integer code 0 .. 5;
# get_category_name performs the reverse lookup on this dict.
category_codes = {f'BT{level}': level - 1 for level in range(1, 7)}

In [52]:
# Single-character punctuation marks deleted from the text by create_features_from_text.
punctuation_signs = ['?', ':', '!', '.', ',', ';']
# English stop-word list from the NLTK corpus, removed one word at a time
# (in this order) by create_features_from_text.
stop_words = [*stopwords.words('english')]

def create_features_from_text(text):
    """Clean one raw question string and vectorize it with the loaded ``tfidf``.

    Steps, in order: replace carriage returns, newlines and runs of four
    spaces with a single space; drop double quotes; lowercase; delete every
    character in ``punctuation_signs``; delete the substring ``'s``;
    lemmatize each space-separated token as a verb; blank out every word in
    ``stop_words``; pass the result to ``tfidf.transform``.

    Parameters
    ----------
    text : str
        Raw question text.

    Returns
    -------
    The result of ``tfidf.transform(...).toarray()`` for the single cleaned
    question (presumably a 1 x n_features dense array -- ``tfidf`` is loaded
    from a pickle, so confirm its type).
    """
    parsed = pd.Series([text], name='Questions')

    # `regex=` is passed explicitly on every replace: the default of
    # Series.str.replace differs between pandas releases, and these
    # patterns must always be treated as literal text ('.' and '?' included).
    for old, new in (("\r", " "), ("\n", " "), ("    ", " "), ('"', '')):
        parsed = parsed.str.replace(old, new, regex=False)
    parsed = parsed.str.lower()
    for punct_sign in punctuation_signs:
        parsed = parsed.str.replace(punct_sign, '', regex=False)
    parsed = parsed.str.replace("'s", "", regex=False)

    # Lemmatize token by token; splitting on a single space (not arbitrary
    # whitespace) keeps empty tokens, so the spacing of the text is preserved.
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized_text = " ".join(
        wordnet_lemmatizer.lemmatize(word, pos="v")
        for word in parsed.iloc[0].split(" ")
    )
    parsed = pd.Series([lemmatized_text], name='Questions_Parsed')

    # Stop words are whole-word regex matches, so regex=True is required;
    # without it the r"\b...\b" pattern is searched for literally and nothing
    # is removed. The words are removed one at a time, in list order, because
    # an earlier removal can change what a later pattern matches.
    for stop_word in stop_words:
        regex_stopword = r"\b" + stop_word + r"\b"
        parsed = parsed.str.replace(regex_stopword, '', regex=True)

    # TF-IDF
    return tfidf.transform(parsed).toarray()

In [53]:
def get_category_name(category_id):
    """Return the label in ``category_codes`` whose code equals ``category_id``.

    The first matching label in dict order is returned; if no code matches,
    the result is ``None``.
    """
    matching_labels = (
        label for label, code in category_codes.items() if code == category_id
    )
    return next(matching_labels, None)

In [54]:
def predict_from_text(text):
    """Classify one question and print the predicted label and its top probability.

    Parameters
    ----------
    text : str
        Raw question text; it is cleaned and vectorized by
        ``create_features_from_text``.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    # Build the features once and reuse them for both model calls; the
    # preprocessing does not need to run twice on the same input.
    features = create_features_from_text(text)

    # Predict using the input model
    prediction_svc = svc_model.predict(features)[0]
    prediction_svc_proba = svc_model.predict_proba(features)[0]

    # Map the predicted integer code back to its label.
    category_svc = get_category_name(prediction_svc)

    # NOTE(review): the message says "SVM", but the model is loaded from
    # 'best_lrc.pickle' -- confirm the wording matches the actual estimator.
    print("The predicted category using the SVM model is %s." %(category_svc) )
    # Highest class probability, scaled to a percentage.
    print("The conditional probability is: %a" %(prediction_svc_proba.max()*100))

In [55]:
# Sample question passed to predict_from_text in the next cell.
text = "what is your name"

In [56]:
# Classify the sample question; predict_from_text prints its result rather than returning it.
predict_from_text(text)

The predicted category using the SVM model is BT1.
The conditional probability is: 70.68340198186802
