In [23]:
import numpy as np
import seaborn as sns
import pandas as pd 
import pickle
import os 
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [26]:
class Train_Diagnosis():
    '''
    Trains the ailment classifier and its TF-IDF vectorizer and pickles both.

    This class only (re)fits and saves the vectorizer and the model;
    prediction is done elsewhere.

    ATTRIBUTES:
    data_path : CSV file the training data is read from
    vector_path : file the fitted vectorizer is pickled to
    model_path : file the fitted model is pickled to
    data : dataframe read from data_path; this class reads its
           'stemmed_phrase' column (training text) and its 'Prompt'
           column (ailment label)
    ailments_dict_keyname : {ailment label -> integer code}
    ailments_dict_keyint : {integer code -> ailment label}
    vectorizer : TfidfVectorizer, (re)fitted by get_training_x
    model : RandomForestClassifier, fitted by train_model
    '''
    def __init__(self, data_path='data/trial_data.csv',
                 vector_path='models/vectorizer.pkl',
                 model_path='models/model.pkl'):
        '''
        PARAMETERS (all optional, defaults are the previously hard-coded paths):
        data_path : CSV file to load the training data from
        vector_path : where to pickle the fitted vectorizer
        model_path : where to pickle the fitted model
        '''
        # Paths are stored first so every other method can rely on them.
        self.data_path = data_path
        self.vector_path = vector_path
        self.model_path = model_path
        self.data = self.get_latest_data()
        self.ailments_dict_keyname = self.get_ailments(0)
        self.ailments_dict_keyint = self.get_ailments(1)
        self.vectorizer = self.get_vectorizer()
        self.model = self.get_model()

    @staticmethod
    def _save_pickle(obj, path):
        '''Pickle obj to path, creating the parent directory if needed.'''
        folder = os.path.dirname(path)
        if folder:
            os.makedirs(folder, exist_ok=True)
        # The context manager closes the handle even if pickling fails.
        with open(path, 'wb') as handle:
            pickle.dump(obj, handle)

    def get_latest_data(self):
        '''Function to load in the latest dataframe
        that you have for the model training (read from self.data_path)'''
        return pd.read_csv(self.data_path)

    def get_ailments(self, type_of_dict):
        '''Function to build the unique ailment dictionary
        PARAMETERS: type_of_dict: 0 -> keys are the ailment labels and
                                       values their integer codes
                                  anything else -> keys are the integer
                                       codes and values the labels
        RETURNS : dictionary of the ailments; codes follow the order in
                  which the labels first appear in the 'Prompt' column'''
        ailments = self.data['Prompt'].unique()
        if type_of_dict == 0:
            # By name
            return {name: code for code, name in enumerate(ailments)}
        # By indexing
        return dict(enumerate(ailments))

    def get_vectorizer(self):
        '''Return a new, unfitted vectorizer to fit on the data '''
        return TfidfVectorizer(stop_words='english', ngram_range=(1, 3), max_df=0.7)

    def get_training_x(self):
        '''Fits the vectorizer on the 'stemmed_phrase' column, pickles it to
        self.vector_path and returns the transformed data as a dense array'''
        # fit_transform already leaves self.vectorizer fitted, so a second
        # fit() pass over the same column is unnecessary.
        X = self.vectorizer.fit_transform(self.data['stemmed_phrase'])
        X = X.toarray()
        # save the vectorizer at this point, after it has been fit
        self._save_pickle(self.vectorizer, self.vector_path)
        return X

    def get_training_y(self):
        '''Returns the 'Prompt' column encoded with ailments_dict_keyname'''
        return self.data['Prompt'].map(self.ailments_dict_keyname)

    def get_model(self):
        '''Returns a new, unfitted model for the data '''
        return RandomForestClassifier(n_estimators=36, min_samples_leaf=2)

    # METHODS
    # 1. Trains the model

    def train_model(self):
        '''Fits the model on the currently loaded data and pickles it to
        self.model_path (the vectorizer is re-fitted and pickled as well)'''
        X = self.get_training_x()
        Y = self.get_training_y()
        # validation is actually done on the query not the test data
        self.model.fit(X, Y)
        self._save_pickle(self.model, self.model_path)

In [30]:
class Predictions():
    '''Cleans and stems incoming queries and collects them, together with the
    predicted ailment, in an in-memory copy of the data set.

    ATTRIBUTES:
    model : the model object handed to the constructor (stored as given)
    data : dataframe read from data_path; append_query adds rows to it
    stemmer : English SnowballStemmer applied to every word of a query
    abbr_dict : contraction -> expansion lookup used by process_query
    '''
    # Column names used when a query is appended. 'Prompt' is the column the
    # training code reads the ailment label from.
    _TEXT_COLUMN = 'stemmed_phrase'
    _LABEL_COLUMN = 'Prompt'
    # Characters removed from every token. The apostrophe is kept until after
    # the contraction lookup so that keys such as "don't" can still match.
    _NON_APOSTROPHE_TABLE = str.maketrans('', '', '"?,.')
    _PUNCTUATION_TABLE = str.maketrans('', '', '"\'?,.')

    def __init__(self,model,data_path):
        '''
        PARAMETERS:
        model : model object to keep on the instance
        data_path : CSV file to load the current data set from
        '''
        self.model = model
        self.data = pd.read_csv(data_path)
        self.stemmer = SnowballStemmer('english')
        punctuation = '["\'?,\\.]'
        self.abbr_dict = {
            "what's":"what is",
            "what're":"what are",
            "where's":"where is",
            "where're":"where are",
            "i'm":"i am",
            "we're":"we are",
            "it's":"it is",
            "that's":"that is",
            "there's":"there is",
            "there're":"there are",
            "i've":"i have",
            "who've":"who have",
            "would've":"would have",
            "not've":"not have",
            "i'll":"i will",
            "it'll":"it will",
            "isn't":"is not",
            "wasn't":"was not",
            "aren't":"are not",
            "weren't":"were not",
            "can't":"can not",
            "couldn't":"could not",
            "don't":"do not",
            "didn't":"did not",
            "shouldn't":"should not",
            "wouldn't":"would not",
            "doesn't":"does not",
            "haven't":"have not",
            "hasn't":"has not",
            "hadn't":"had not",
            "won't":"will not",
            # The two entries below are regular-expression patterns. They are
            # kept so the dictionary content is unchanged, but an exact token
            # lookup never matches them; process_query removes punctuation
            # and collapses whitespace itself.
            punctuation:'',
            r'\s+':' ', # replace multi space with one single space
        }

    def process_query(self,query):
        '''Returns a processed and stemmed query

        The query is lower-cased and split on whitespace. For every token the
        characters " ? , . are removed, a known contraction is expanded, and
        remaining apostrophes are removed. The resulting words are stemmed and
        joined with single spaces.

        PARAMETERS: query : the raw query string
        RETURNS : the cleaned, stemmed query string
        '''
        pieces = []
        for token in query.lower().split():
            # Strip everything but apostrophes first so that "don't," is
            # still recognised as the contraction "don't".
            bare = token.translate(self._NON_APOSTROPHE_TABLE)
            expanded = self.abbr_dict.get(bare, bare)
            pieces.append(expanded.translate(self._PUNCTUATION_TABLE))

        # Expansions contain spaces and some tokens are now empty, so re-split
        # before stemming word by word.
        words = ' '.join(pieces).split()
        return ' '.join(self.stemmer.stem(word) for word in words)

    def append_query(self,query,ailment):
        '''Take the query and prediction and then append it to original data

        PARAMETERS: query : the processed (stemmed) query text
                    ailment : the ailment label predicted for it
        The new row is added to self.data in memory only; nothing is written
        to disk here.
        '''
        new_row = pd.DataFrame([{self._TEXT_COLUMN : query ,
                                 self._LABEL_COLUMN : ailment}])
        # concat returns a new frame, so the result has to be stored back.
        self.data = pd.concat([self.data, new_row], ignore_index = True)
        

- The whole dataset does not need to be re-stemmed every time.
- The stemmed data is stored once; whenever a query arrives, its stemmed form is appended to the end of the dataset after the prediction has been made.

In [28]:
# Build the trainer (this loads the data set) and fit the vectorizer and the
# model once; train_model also pickles both so they can be loaded later.
trainer = Train_Diagnosis()
trainer.train_model()

In [31]:
# get predictions needs to be a separate function as it just needs to get the 
# predictions 
from flask import Flask, request, jsonify
from flask_cors import CORS 

## 1. prediction 
## moreover, model and vectorizer need not be loaded again and again

# build 1 end point
# Create the Flask application the /process route is registered on and hand
# it to flask_cors.CORS.
app = Flask(__name__)
CORS(app)


def load_from_pickle(file):
    '''Load and return the object pickled in `file`.

    PARAMETERS: file : path of the pickle file to read
    RETURNS : the unpickled object
    '''
    # NOTE: unpickling can execute arbitrary code; only use this on files
    # written by this project.
    # The context manager closes the handle even if unpickling fails.
    with open(file,'rb') as handle:
        return pickle.load(handle)

# got the models     
# Load the fitted vectorizer and model once at start-up (vectorizer first,
# then model) instead of on every request.
vectorizer, model = (
    load_from_pickle(pickle_path)
    for pickle_path in ('models/vectorizer.pkl', 'models/model.pkl')
)

# The trainer supplies the code -> ailment lookup and is reused for
# retraining; the diagnoser pre-processes queries and collects them.
trainer = Train_Diagnosis()
ailments = trainer.get_ailments(1)
diagnoser = Predictions(model, 'data/trial_data.csv')
# parameter 1 -> 
    # processes the query given by the site and make prediction   
    
@app.route('/process',methods = ['GET'])
def get_diagnosis():
    '''Predict the most likely ailments for a query.

    QUERY PARAMETERS:
    query : the free-text description to diagnose (required)
    train : optional; 1 -> retrain on the collected data after predicting
            and reload the vectorizer and model, anything else or absent
            -> do not retrain

    RETURNS : a JSON list with up to three ailment names, most probable
              first, or a JSON error with status 400 when `query` is missing
    '''
    # These module-level names are rebound after retraining. Without this
    # declaration the assignments below would make them function-local and
    # the first use would raise UnboundLocalError.
    global vectorizer, model

    q = request.args.get('query')
    if not q:
        return jsonify({'error': "missing required parameter 'query'"}), 400
    processed = diagnoser.process_query(q)

    # now transform
    features = vectorizer.transform([processed])

    # and predict: column indices ordered by decreasing probability, top 3
    probabilities = model.predict_proba(features)[0]
    top_columns = np.argsort(probabilities)[::-1][:3]

    # gather predictions; predict_proba columns follow model.classes_, so the
    # column index is translated to the class code before the name lookup
    predictions = [ailments[int(model.classes_[col])] for col in top_columns]

    # append the processed text (not the vectorized matrix) with the top
    # prediction to the collected data
    diagnoser.append_query(processed, predictions[0])

    # 0 / absent / not an integer -> do not train
    # 1 -> train again
    train = request.args.get('train', default = 0, type = int)
    if train == 1:
        # retrain on the data collected so far, including this query
        trainer.data = diagnoser.data
        trainer.train_model()
        # at this point load the vectorizer and model again
        vectorizer = load_from_pickle(trainer.vector_path)
        model = load_from_pickle(trainer.model_path)
        diagnoser.model = model

    return jsonify(predictions)


# Start Flask's built-in server on port 5000 when this code runs as the main
# module.
# NOTE(review): debug = True is intended for development only - confirm it is
# switched off before this service is exposed to anyone else.
if __name__=='__main__':
    app.run(port = 5000, debug = True)

NameError: name 'punctuation' is not defined