# Example Pipeline

an example of the way we could organize the code, so that each of us work on 
a certain class, and so that we might evaluate with the same metrics

It is purely for demonstrative purposes, and open to suggestion

---

In [1]:
import re
import sys


class Baseline_clean:
    """A toy cleaning method for demo
    
    
    """
    def __init__(self, corpus):
        self.corpus = corpus
        self.stop = ['the', 'i', 'is', 'am', 'are','linguistic ressourcesa', 'but', 'how', 'what']
    
    
    def clean_text(self, text):
        """A toy clean method using only regex and string libraries
        
        :return:
            text cleaned
        """
        stop = self.stop
        text = ' '.join(text.split('.'))
        # take out any non alphabetic characters 
        text = re.sub('[^a-zA-Z ]', '', text)
        # remove more than 2 rep of letters (not removing double letters because it's important)
        text = re.sub("(.)\\1{2,}", "\\1", text)
        # tokenization via split
        toks = text.split()
        # removal of stop words 
        toks = [tok for tok in toks if tok not in stop]
        return ' '.join(toks)
    
    
    def clean_corpus(self):
        """Takes a list of strings of text and returns them cleaned after processing
        
        :return:
            corpus cleaned
        """
        return [self.clean_text(x) for x in self.corpus]
        

In [2]:
def train_test_split(recommender_data):
    """Method to split data, for now just splicing list
    
    Should be worked on for our specific purposes
    
    """
    train = recommender_data[6:]
    X_train = train['text']
    y_train = train['parent_id']
    test = recommender_data[:6]
    X_test = test['text']
    y_test = test['parent_id']
    return X_train, X_test, y_train, y_test

In [3]:
import numpy as np
import sklearn

In [4]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

class Tfidf:
    """The class that returns the Tfidf model with adjusted parameters
    
    """
    def __init__(self, X_train):
        self.X_train = X_train
    
    
    def model(self):
        """Functionality which defines model and parameters
        """
        
        return TfidfVectorizer() #to define parameters
    
    
    def get_vector_space(self):
        """Train model and return document vector space
        """
        model = self.model()
        vector_space = model.fit_transform(self.X_train)
        return model, vector_space.todense()

In [5]:
import sklearn 
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np 
from scipy.stats import pearsonr

class HumanEval:
    """Generalized method to evaluate models comparitavely
    No need to index training data or be careful about datasplit
    
    :param model: the model to be evaluated
    :param human_eval_data: the dataframe for the data used to evaluate model
    """
    def __init__(self, model, human_eval_data):
        self.model = model
        self.human_eval_data = human_eval_data

    
    def open_human_eval(self):
        '''Open the small human evaluation dataset
        
        In real application, this dataset is comparing Protus 
        text to QJ texts with a binary groundTruth value 
        which has been manually validated
        
        :return:
            a list of list of document pairs
            the index to string dictionary
        '''
        X_test = []
        y_true = []
        for i, r in self.human_eval_data.iterrows():
            X_test.append(r['text_1'])
            X_test.append(r['text_2'])
            y_true.append(r['evaluation'])
        return X_test, y_true
        
        
    def predict(self):
        '''Predict method using cosine similarity
        '''
        X_test, y_true = self.open_human_eval()
        vectorized_test = self.transform_test(X_test)
        pairs = zip(*[iter(vectorized_test)]*2)
        y_pred = []
        for text_1, text_2 in pairs:
            y_pred.append(cosine_similarity(text_1, text_2)[0][0])
        return y_true, y_pred
        
    
    def transform_test(self, X_test):
        '''Transforms the test using the tfidf model
        '''
        if type(self.model) == sklearn.feature_extraction.text.TfidfVectorizer:
            return self.model.transform(X_test).todense()
    
    
    def pearsons_coeff(self, y_pred, y_true):
        '''Returns the pearsons coefficient score
        '''
        return pearsonr(y_pred, y_true)
        
        
    def print_eval(self):
        """Prints the eval scores of the human evaluation 
        """
        y_true, y_pred = self.predict()
        pearsons_scores = self.pearsons_coeff(np.array(y_pred), np.array(y_true))
        print('y_pred: ', y_pred)
        print('y_true: ', y_true)
        print('pearsons correlation coeff: ', pearsons_scores[0])
        print('pearsons p-value: ', pearsons_scores[1])
        
        


In [6]:
from sklearn.neighbors import NearestNeighbors

class RecommenderEval:
    """Method to evaluate recommendation system
    
    :param model: the model to be evaluated
    :param vector_space: the vector space after training of model
    :param X_train: the train data, list of strings
    :param X_test: the test data, list of strings
    :param y_train: the train labels, list of integers
    :param y_test: the test labels, list of integers
    :param k: the number of recommendations wanted
    
    """
    def __init__(self, model, vector_space, X_train, X_test, y_train, y_test, k = 3):
        self.model = model
        self.vector_space = vector_space
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.k = k
    
    
    def transform(self):
        '''General method for test transform

        :return:
            the transformed test set
        '''
#         print(self.X_test)
        if type(self.model) == sklearn.feature_extraction.text.TfidfVectorizer:
            return self.model.transform(self.X_test).todense()       
    
    
    def predict(self):
        """Prediction in the case of a recommender system is a nearest neighbor search
        
        :return:
            distances from the unseen doc vector to the top k nearest doc vectors
            indices from the training data of the top k nearest doc vectors
        
        """
        test_vector_space = self.transform()
        nbrs = NearestNeighbors(n_neighbors=self.k, algorithm = 'brute').fit(self.vector_space)
        distances, indices = nbrs.kneighbors(test_vector_space)
        return distances, indices
    
    
    def number_recommended_relevant(self, index, y_pred, similar_items):
        """Get the number of recommended items which are relevant by checking in similar items list
        
        The method to get number of relevant items is specific to the task that we are doing, 
        but the eval metric itself is general
        
        :return: 
            number of relevent items of a single prediction of k items
        """
        # for the dummy data, what is sort of like our report_id corresponds to index of test data +1, 
        # since the test is the first 6 elements, indexed 1-23, but this will be very different
        test_id = index
        number_relevant = 0
        for x in y_pred:
            if similar_items.values():
                for v in similar_items.values():
                    if test_id in v and x in v:
                        number_relevant += 1
        return number_relevant
    
    
    def get_similar(self):
        """Regroup documents in train which have similar labels
        
        :return:
            dictionary whose keys are the parent id and values are the
            indices of the documents of the train which have this same parent id
        """
        similar_items = {}
        for index, parent_id in enumerate(self.y_train):
            if parent_id not in similar_items:
                similar_items[parent_id] = [index]
            else:
                similar_items[parent_id].append(index)
        return similar_items
    
    
    def precision_at_k(self, index, y_pred, similar_items):
        """precision at K is the eval metric used in the previous POC, from what I understand
        
        :return:
            the precision at k for a single recommendation
        """
        number_relevant = self.number_recommended_relevant(index, y_pred, similar_items)
        return number_relevant/self.k
    
    
    def mean_average_precision(self):
        pass
    
    
    def mean_reciprocal_rank(self):
        pass
    
    
    def print_scores(self):
        """Prints the eval scores of the recommendation system
        
        """
        precs_at_k = []
        similar_items = self.get_similar()
        distances, indices = self.predict()
        # get the list of precision at k for all recommendations
        for index, (distance, indice) in enumerate(zip(distances, indices)):
            precs_at_k.append(self.precision_at_k(index, indice, similar_items))
        print('the precision at ', self.k, ' score : ', np.mean(precs_at_k))
        
    def save_scores(self):
        pass

In [7]:
# simulation of training and test data for the human evaluation set that we have for Protus2QJ
import pandas as pd
TRAINING_DATA = [
        'I have a dog', 
        'I have a cat',
        'My kitten likes dog food', 
        'Do you have a dog?', 
        'Do you have a cat?', 
        'Last week I went to the pet store', 
        'My feline really does not like the dog', 
        'My cat is eating cat food', 
        'I bought cat food at the pet store', 
        'Dogs do not like cat food',
        'My cat loves dog food', 
        'The cat does not like the dog',
        'My cat is fat',
        'My dog is hungry', 
        'My dog loves cats'
    ]
df_training_data = pd.DataFrame(data = [{"Text":sentence} for sentence in TRAINING_DATA])
df_training_data

human_eval = [
    {"text_1": "Do you happen to hava a feline?", "text_2": "Does your dog have a cat?", "evaluation": 0},
    {"text_1": "My dog loves a cat", "text_2": "My dog likes cats", "evaluation": 1},
    {"text_1": "Dogs love cat food", "text_2": "Dogs love dog food", "evaluation": 0},
    {"text_1": "My cat likes a dog", "text_2": "The cat loves a dog", "evaluation": 1}

]
HUMAN_EVAL_DF = pd.DataFrame(data = human_eval)
HUMAN_EVAL_DF


Unnamed: 0,text_1,text_2,evaluation
0,Do you happen to hava a feline?,Does your dog have a cat?,0
1,My dog loves a cat,My dog likes cats,1
2,Dogs love cat food,Dogs love dog food,0
3,My cat likes a dog,The cat loves a dog,1


In [8]:
# simulation of data to be split into train test
recommend_data = [
    {'id': 1, 'text': 'I have a dog', 'parent_id' : 1},
    {'id': 2, 'text': 'I have a cat', 'parent_id' : 2},
    {'id': 3, 'text': 'Last week I went to the pet store', 'parent_id' : 3},
    {'id': 4, 'text': 'My kitten likes dog food', 'parent_id' : 4},
    {'id': 5, 'text': 'My dog is hungry', 'parent_id' : 5},
    {'id': 6, 'text': 'My feline really does not like the dog', 'parent_id' : 6},
    {'id': 7, 'text': 'Do you have a dog?', 'parent_id' : 1},
    {'id': 8, 'text': 'Do you have a cat?', 'parent_id' : 2},
    {'id': 9, 'text': 'My cat is eating cat food', 'parent_id' : 4},
    {'id': 10, 'text': 'I bought cat food at the pet store', 'parent_id' : 3},
    {'id': 11, 'text': 'Dogs do not like cat food','parent_id' : 1},
    {'id': 12, 'text': 'My cat loves dog food', 'parent_id' : 4},
    {'id': 13, 'text': 'The cat does not like the dog','parent_id' : 6},
    {'id': 14, 'text': 'My cat is fat','parent_id' : 2},
    {'id': 15, 'text': 'My dog loves cats', 'parent_id' : 5},
    {'id': 16, "text": "Do you happen to hava a feline?", "parent_id":2},
    {'id': 17, "text": "Does your dog have a cat?", "parent_id": 2},
    {'id': 18, "text": "My dog loves a cat", "parent_id":1},
    {'id': 19, "text": "My dog likes cats", "parent_id": 1},
    {'id': 20, "text": "Dogs love cat food", "parent_id": 1},
    {'id': 21, "text": "Dogs love dog food", "parent_id": 4},
    {'id': 22, "text": "My cat likes a dog", "parent_id":6}, 
    {'id': 23, "text": "The cat loves a dog", "parent_id": 6}
    ]

RECOMMENDER_DF = pd.DataFrame(data = recommend_data)
RECOMMENDER_DF

Unnamed: 0,id,text,parent_id
0,1,I have a dog,1
1,2,I have a cat,2
2,3,Last week I went to the pet store,3
3,4,My kitten likes dog food,4
4,5,My dog is hungry,5
5,6,My feline really does not like the dog,6
6,7,Do you have a dog?,1
7,8,Do you have a cat?,2
8,9,My cat is eating cat food,4
9,10,I bought cat food at the pet store,3


###### example of what the implementation would look like...

In [13]:
def main():
    """
    Example of preprocessing for human eval vs. recommender eval.
    
    - Vector space not needed in input for human eval, because we only need 
    cosine similarity between two unseen sentences. Then we compare 
    to the groundTruth/label using pearsons correlation coefficient.
    
    - For Recommender system evaluation, we need to consider how we will
    train/test split, what algorithm we will use to find nearest neighbors,
    what evaluation metric we will use (Precision/recall @k, MAP, or MRR?)
    
    
    """
    STOP = "dbfs:/FileStore/tables/stop.csv"
    clean_data = Baseline_clean(TRAINING_DATA)
    X_train_human_eval = clean_data.clean_corpus()
    tfidf = Tfidf(X_train_human_eval)
    tfidf_model, vector_space = tfidf.get_vector_space()
    
    # Model evaluation for Human Evaluation data 
    human_evaluation = HumanEval(tfidf_model, HUMAN_EVAL_DF)
    human_evaluation.print_eval()
    print('\n'*3)
    # example of Recommender Evaluation using parent_id as label
    k = 3
    X_train_recommender_eval, X_test, y_train, y_test = train_test_split(RECOMMENDER_DF)
    clean_data = Baseline_clean(X_train_recommender_eval)
    X_train_recommender_eval = clean_data.clean_corpus()
    tfidf = Tfidf(X_train_recommender_eval)
    tfidf_model, vector_space = tfidf.get_vector_space()
    recommender_evaluation = RecommenderEval(tfidf_model, vector_space, X_train_recommender_eval, X_test, y_train, y_test, k)
    recommender_evaluation.print_scores()

In [14]:
# if __name__ == '__main__'():
main()
    


y_pred:  [0.0, 0.28024581989659025, 0.8437999077938347, 0.25669023267971686]
y_true:  [0, 1, 0, 1]
pearsons correlation coeff:  -0.2489591021895887
pearsons p-value:  0.7510408978104103




the precision at  3  score :  0.2222222222222222


---

In [15]:


# example of the nearest neighbors algo

import sys
from sklearn.neighbors import NearestNeighbors

X_train = [
        'I have a dog', 
        'I have a cat',
        'My kitten likes dog food', 
        'Do you have a dog?', 
        'Do you have a cat?', 
        'Last week I went to the pet store', 
        'My feline really does not like the dog', 
        'My cat is eating cat food', 
        'I bought cat food at the pet store', 
        'Dogs do not like cat food'
]
        
    
X_test = ['My cat loves dog food', 
        'The cat does not like the dog',
        'My cat is fat',
        'My dog is hungry', 
        'My dog loves cats'
    ]
model = TfidfVectorizer()
X_train_vector_space = model.fit_transform(X_train)
print(X_train_vector_space.shape)
X_test_vector_space = model.transform(X_test)
print(X_test_vector_space.shape)
# sys.exit()
# X_train_test = 
X_test_vector_space = X_test_vector_space.todense()
X_train_vector_space = X_train_vector_space.todense()

nbrs = NearestNeighbors(n_neighbors=3, algorithm = 'brute').fit(X_train_vector_space)
distances, indices = nbrs.kneighbors(X_test_vector_space)
distances, indices

(10, 26)
(5, 26)


(array([[0.88793166, 0.91877569, 1.13979778],
        [0.77673827, 1.09929736, 1.19528686],
        [0.66359456, 1.19380689, 1.25186061],
        [0.97490791, 1.11429916, 1.15636126],
        [0.96173883, 1.029728  , 1.08935316]]), array([[7, 2, 0],
        [6, 9, 8],
        [7, 1, 2],
        [7, 2, 0],
        [2, 0, 6]]))

In [16]:
data_df = []
for distance, idxs, sent in zip(distances, indices, X_test):
    for d, i in zip(distance, idxs):
        new_df = {"sentence_1":sent, "sentence_2":  X_train[i], "distance" : d}
        data_df.append(new_df)
df = pd.DataFrame(data= data_df)
df

Unnamed: 0,sentence_1,sentence_2,distance
0,My cat loves dog food,My cat is eating cat food,0.887932
1,My cat loves dog food,My kitten likes dog food,0.918776
2,My cat loves dog food,I have a dog,1.139798
3,The cat does not like the dog,My feline really does not like the dog,0.776738
4,The cat does not like the dog,Dogs do not like cat food,1.099297
5,The cat does not like the dog,I bought cat food at the pet store,1.195287
6,My cat is fat,My cat is eating cat food,0.663595
7,My cat is fat,I have a cat,1.193807
8,My cat is fat,My kitten likes dog food,1.251861
9,My dog is hungry,My cat is eating cat food,0.974908
