In [1]:
import pandas as pd
import numpy as np
import pickle
import string

# Building the preprocessor 

In [2]:
from typing import *

In [3]:
import nltk
# Fetch the NLTK resources used further down: the English stop-word list
# (read via nltk.corpus.stopwords) and the WordNet data backing
# nltk.stem.WordNetLemmatizer. omw-1.4 is presumably a companion resource
# the WordNet reader needs -- TODO confirm for the installed NLTK version.
# Each call is a no-op apart from a log line when the package is up to date.
nltk.download("stopwords")
nltk.download('omw-1.4')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sathyakrishnansuresh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/sathyakrishnansuresh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sathyakrishnansuresh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [21]:
class TfIdfTokenizer:
    """Normalise question text and vectorise it with a pre-fitted vectorizer.

    Depending on the constructor flags, ``process`` lower-cases every
    question, drops English stop words and lemmatizes each word, then calls
    ``transform`` on the vectorizer unpickled from ``vectorizer_path``.
    Words are obtained by splitting on single spaces only, so punctuation
    stays attached to the neighbouring word.
    """

    STOPWORDS = nltk.corpus.stopwords.words("english")
    # Set view of STOPWORDS so the per-word membership test is O(1).
    # Built once at class creation; it does not track later edits to STOPWORDS.
    _STOPWORD_SET = frozenset(STOPWORDS)
    WN_LEMMATIZER = nltk.stem.WordNetLemmatizer()

    def __init__(self,
                 casefold: bool=True,
                 remove_stop_words: bool=True,
                 lemmatize: bool=True,
                 vectorizer_path: str="tokenizer/tfidf.sav") -> None:
        """Store the preprocessing flags and load the pickled vectorizer.

        Args:
            casefold: Lower-case each question before vectorising.
            remove_stop_words: Drop words found in ``STOPWORDS``.
            lemmatize: Lemmatize each word with ``WN_LEMMATIZER``.
            vectorizer_path: Pickle file holding the vectorizer; defaults to
                the path the notebook has always used.

        Raises:
            OSError: If ``vectorizer_path`` cannot be opened.
        """
        self.casefold = casefold
        self.remove_stop_words = remove_stop_words
        self.lemmatize = lemmatize
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # this at files you produced yourself.
        # The context manager closes the handle the original left open.
        with open(vectorizer_path, "rb") as vectorizer_file:
            # Presumably a fitted scikit-learn TfidfVectorizer -- TODO confirm.
            self.vectorizer = pickle.load(vectorizer_file)

    def __repr__(self) -> str:
        # Use the real class name; the previous literal named a class
        # ("TfIdfProcesser") that does not exist.
        return (f"{type(self).__name__}(casefold={self.casefold}"
                f",remove_stop_words={self.remove_stop_words},"
                f"lemmatize={self.lemmatize})")

    def __str__(self) -> str:
        return repr(self)

    def process(self,
                questions: Union[np.ndarray, pd.Series, pd.DataFrame],
                col_name: str=None) -> np.ndarray:
        """Preprocess ``questions`` and return their vectorised form.

        The cleaned text is also kept on the instance as ``self.df`` (a
        single-column frame named ``"questions"``).

        Args:
            questions: One-dimensional array, Series, or DataFrame of strings.
            col_name: Column holding the questions; required (and only used)
                when ``questions`` is a DataFrame.

        Returns:
            Whatever ``self.vectorizer.transform`` returns. The annotation
            says ``np.ndarray``, but the notebook calls ``.toarray()`` on the
            result, so it is presumably a SciPy sparse matrix -- TODO confirm.

        Raises:
            TypeError: If ``questions`` is not one of the supported types.
            ValueError: If an array is not one-dimensional, or a DataFrame is
                given without ``col_name``.
        """
        text = self._as_question_series(questions, col_name)

        if self.casefold:
            text = text.apply(self._casefold)
        if self.remove_stop_words:
            text = text.apply(self._remove_stopwords)
        if self.lemmatize:
            text = text.apply(self._lemmatize)

        self.df = pd.DataFrame({"questions": text})
        return self.vectorizer.transform(self.df["questions"])

    @staticmethod
    def _as_question_series(questions, col_name) -> pd.Series:
        """Validate the input and return it as a Series named ``"questions"``."""
        if not isinstance(questions, (np.ndarray, pd.Series, pd.DataFrame)):
            raise TypeError(f"'questions' must be of type numpy.ndarray or pandas.Series or pandas.DataFrame. But it is of type {type(questions)}")

        if isinstance(questions, np.ndarray):
            # ndim != 1 also rejects 0-d arrays, which the old
            # ``len(shape) > 1`` check let through to a less clear error.
            if questions.ndim != 1:
                raise ValueError(f"Expected one dimensional numpy array, but got array of shape: {questions.shape}")
            return pd.Series(questions, name="questions")

        if isinstance(questions, pd.Series):
            # Rename instead of pd.DataFrame(series, columns=[...]): the
            # latter reindexes a named Series and yields an all-NaN column
            # whenever its name is not already "questions".
            return questions.rename("questions")

        if col_name is None:
            raise ValueError("col_name must consist of the name of the column in the dataframe cosisting the questions, should not be None")
        return questions[col_name].rename("questions")

    def _remove_stopwords(self,
                          text: str) -> str:
        """Return ``text`` without the space-separated words in ``STOPWORDS``."""
        kept_words = [word for word in text.split(' ') if word not in self._STOPWORD_SET]
        return ' '.join(kept_words)

    def _lemmatize(self,
                   text: str) -> str:
        """Lemmatize every space-separated word of ``text`` with ``WN_LEMMATIZER``."""
        lemmas = [self.WN_LEMMATIZER.lemmatize(word) for word in text.split(' ')]
        return ' '.join(lemmas)

    def _casefold(self,
                  text: str) -> str:
        """Lower-case ``text`` (uses ``str.lower``, not ``str.casefold``)."""
        return text.lower()

In [22]:
# Smoke-test the preprocessor on three short questions using the default
# flags (lower-casing, stop-word removal and lemmatization all enabled).
tf = TfIdfTokenizer()
vect = tf.process(
    np.array([
        "Solve this",
        "Solve that given this",
        "Given an array of size n, find sum",
    ])
)

In [23]:
# Densify the vectorised questions and display the shape: one row per input
# question; the column count is fixed by the pickled vectorizer.
vect.toarray().shape

(3, 700)

# Building the tagger

In [34]:
import os
class DSTagger:
    """Tag questions with a data-structure label using a pickled classifier.

    ``tag`` vectorises the questions with a ``TfIdfTokenizer``, asks the
    loaded model for class probabilities and stores a per-question summary
    in ``self.df`` (the question, one probability column per entry of
    ``CLASSES`` and the winning label in ``ds_tag``).
    """

    # Maps a probability-column index to its label. Assumes column k of
    # ``predict_proba`` belongs to class id k -- TODO confirm against the
    # fitted model's class ordering.
    CLASSES = {0: "graph", 1: "array", 2: "string"}

    def __init__(self,
                 model: str='stacked',
                 models_dir: str="models/") -> None:
        """Load ``<models_dir>/<model>.sav`` and create the tokenizer.

        Args:
            model: File name (without ``.sav``) of the pickled model.
            models_dir: Directory searched for model files; defaults to the
                directory the notebook has always used.

        Raises:
            ValueError: If ``<model>.sav`` is not an entry of ``models_dir``.
            OSError: If ``models_dir`` or the model file cannot be read.
        """
        self.model_name = model

        if model+".sav" not in os.listdir(models_dir):
            raise ValueError(f"The passed model name: {self.model_name} is not available in the list of models")

        # NOTE(security): pickle.load can execute arbitrary code; only load
        # model files you produced yourself.
        # The context manager closes the handle the original left open.
        model_path = os.path.join(models_dir, f"{self.model_name}.sav")
        with open(model_path, "rb") as model_file:
            self.model = pickle.load(model_file)
        self.tokenizer = TfIdfTokenizer()

    def __repr__(self) -> str:
        return f"DSTagger(model={self.model_name})"

    def __str__(self) -> str:
        return repr(self)

    def tag(self,
            questions: Union[np.ndarray, pd.Series, pd.DataFrame],
            col_name: str=None) -> np.ndarray:
        """Predict class probabilities for every question.

        Args:
            questions: One-dimensional array, Series, or DataFrame of strings.
            col_name: Column holding the questions; required (and only used)
                when ``questions`` is a DataFrame.

        Returns:
            The result of ``self.model.predict_proba``: one row per question,
            with columns indexed by the class ids in ``CLASSES``.

        Raises:
            TypeError: If ``questions`` is not one of the supported types.
            ValueError: If an array is not one-dimensional, or a DataFrame is
                given without ``col_name``.
        """
        question_series = self._as_question_series(questions, col_name)
        self.df = pd.DataFrame({"questions": question_series})

        # Hand the tokenizer the already-extracted Series. The original
        # passed the raw input without ``col_name``, so every DataFrame
        # input was rejected by the tokenizer.
        self._features = self.tokenizer.process(question_series)
        self._make_predictions()
        return self.pred_proba

    @staticmethod
    def _as_question_series(questions, col_name) -> pd.Series:
        """Validate the input and return it as a Series named ``"questions"``."""
        if not isinstance(questions, (np.ndarray, pd.Series, pd.DataFrame)):
            raise TypeError(f"'questions' must be of type numpy.ndarray or pandas.Series or pandas.DataFrame. But it is of type {type(questions)}")

        if isinstance(questions, np.ndarray):
            # ndim != 1 also rejects 0-d arrays, which the old
            # ``len(shape) > 1`` check let through.
            if questions.ndim != 1:
                raise ValueError(f"Expected one dimensional numpy array, but got array of shape: {questions.shape}")
            return pd.Series(questions, name="questions")

        if isinstance(questions, pd.Series):
            # Rename instead of pd.DataFrame(series, columns=[...]): the
            # latter reindexes a named Series and yields an all-NaN column
            # whenever its name is not already "questions".
            return questions.rename("questions")

        if col_name is None:
            raise ValueError("col_name must consist of the name of the column in the dataframe cosisting the questions, should not be None")
        return questions[col_name].rename("questions")

    def int_to_label(self,
                     class_idx: int) -> str:
        """Return the label stored in ``CLASSES`` for ``class_idx``."""
        return self.CLASSES[class_idx]

    def _make_predictions(self) -> None:
        """Fill ``self.pred_proba`` and the probability/label columns of ``self.df``."""
        self.pred_proba = self.model.predict_proba(self._features)
        for class_idx, label in self.CLASSES.items():
            self.df[label] = self.pred_proba[:, class_idx]
        # The winning class is the most probable column of each row.
        winners = np.argmax(self.pred_proba, axis=-1)
        self.df["ds_tag"] = [self.int_to_label(int(idx)) for idx in winners]

In [35]:
# Smoke-test the tagger with the default model. Each row of the result
# holds the probabilities for one question, with columns indexed by the
# class ids in DSTagger.CLASSES.
tagger = DSTagger()
tagged = tagger.tag(
    np.array([
        "Solve this",
        "Solve that given this",
        "Given an array of size n, find sum",
    ])
)
tagged

array([[0.66296613, 0.02630655, 0.31072732],
       [0.66296613, 0.02630655, 0.31072732],
       [0.74204421, 0.03800439, 0.21995141]])