In [1]:
# !pip install spacy
# !python -m spacy download en_core_web_lg
# !pip install bert_score
# !pip install pytorch-pretrained-bert
# !pip install spacy ftfy==4.4.3
# !pip install pytorch_transformers
# !pip install torch==1.3.0

# Load Libraries

In [1]:
from functions import *
from Sentence_ import Sentence_
from Ticket_ import Ticket_
from Ticket_Pair_ import Ticket_Pair_
import pandas as pd

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from bert_score import score

In [3]:
from nltk.translate.meteor_score import meteor_score
import nltk
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/cloud_user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/cloud_user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load Dataset

In [4]:
final_to_consider = pd.read_pickle('"-- Set Respective Path HERE --" ')

# Split Train/Test Data

In [5]:
sorted_idx = final_to_consider.ticket_obj.map(lambda x: x.date).sort_values().index
final_to_consider = final_to_consider.loc[sorted_idx]

train_percentage = 0.80
test_percentage = 0.20
train_break = int(final_to_consider.shape[0]*train_percentage)

train_df = final_to_consider[:train_break]
test_df = final_to_consider[train_break:]

# Embeddings Using GENSEN


In [6]:
import requests
import json

""" Run A Local Gensen API to call this function """

def get_gensen_embeddings(sentences):
    """
    Param1 - List of sentences or phrases
    
    Return - List of Embeddings (1, 2048) for each Phrase 
    """
    vec = []
    try:
        response = requests.post("http://0.0.0.0:5000/get_embeddings/", json={"sentences_list":sentences})
        vecs=np.array(json.loads(response.text)["vectors"])
    
    except Exception as e:
        print ('Exception in get_embeddings() - ', e)
        return []
        
    return vecs

In [7]:
train_emb = train_df.ticket_obj.map(lambda x: get_gensen_embeddings([x.content.text])[0])
test_emb = test_df.ticket_obj.map(lambda x: get_gensen_embeddings([x.content.text])[0])

In [8]:
train_df.loc[:,'gensen_embs'] = train_emb
test_df.loc[:,'gensen_embs'] = test_emb

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [9]:
content_test_vecs = test_df.gensen_embs.values
content_train_vecs = train_df.gensen_embs.values

# Apply KNN

In [10]:
def get_neighbors_from_emb(test_vecs, train_vecs, k=5, threshold=0.8):
    
    if (len(test_vecs)==0) or (len(train_vecs)==0):
        return pd.DataFrame(columns=['neighbors'])
    
    test_emb = np.matrix(test_vecs.tolist())
    train_emb = np.matrix(train_vecs.tolist())
    
    similarity = cosine_similarity(test_emb, train_emb)
    neighbors = []
    similarity = np.round(similarity, 2)
    
    for i in range(similarity.shape[0]): 
        states = (similarity[i] >= threshold) & (similarity[0] <= 1)
        indices = np.where(states)[0]
        indices = indices[np.argsort(similarity[i][indices])[::-1][:k]]
        sim_scores = np.round(similarity[i][indices], 4)
        neighbors.append(list(tuple(zip(indices, sim_scores))))
    
    return pd.DataFrame([neighbors], index=['neighbors']).T

In [11]:
def map_best_actions(actual_actions, predicted_actions, res_vectorizer=None):
    
    if (len(actual_actions)==0) or (len(predicted_actions)==0):
        return pd.DataFrame()
    
    actual_actions_vecs = get_gensen_embeddings(actual_actions)
    predicted_actions_vecs = get_gensen_embeddings(predicted_actions)
    mapping_ = get_neighbors_from_emb(actual_actions_vecs, predicted_actions_vecs, k=1, threshold=0.0)
    
    if mapping_.shape[0] < 1:
        return pd.DataFrame()
    
    mapping_.index = actual_actions
    mapping_ = mapping_[mapping_.neighbors.map(len)>0]
    
    try:
        mapping_.neighbors = mapping_.neighbors.map(lambda x: (x[0][0], predicted_actions[x[0][0]], x[0][1]))
    except:
#         print (mapping_)
        raise Exception
    
    return mapping_

In [12]:
test_neighbors = get_neighbors_from_emb(content_test_vecs, content_train_vecs)

In [13]:
""" Map Index of Neighbor """
test_neighbors.neighbors = test_neighbors.neighbors.map(lambda x: [(train_df.iloc[[a[0]]].index[0], a[1]) for a in x])
test_df.loc[:,'all_neighbors'] = test_neighbors.neighbors.values.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


# Evaluation of Results

In [14]:
compute_meteor_score = lambda actual, prediction: round(meteor_score([actual], prediction), 4)
compute_bert_scores = lambda actual_list, pred_list : np.round((score(pred_list, actual_list, lang='en', model_type='bert-base-uncased', nthreads=32,rescale_with_baseline=True)[-1]).tolist(),4)


In [15]:
def get_mapped_data(result_df, inc='None'):
    
    if result_df.shape[0] < 1:
        return []
    
    incs = [inc]*result_df.shape[0]
    actual_labels = result_df.index.tolist()
    predicted_labels = result_df.neighbors.map(lambda x: x[1])
    return list(zip(incs, actual_labels, predicted_labels))

# Compute Meteor Score in Bunch 

In [16]:
def generate_best_meteor_scores(df_):
    """ Input : df_['best_actions'] """
    
    df_.loc[:,'best_meteor_scores'] = df_.mapped_actions.apply(lambda r: [compute_meteor_score(res[1],res[2]) for res in get_mapped_data(r)])
    return df_

# Compute BertScore in a Bunch

In [17]:
def generate_best_bert_scores(df_):
    """ Input : df_['best_actions'] """
    
    bert_score_data = df_.apply(lambda x: get_mapped_data(x['mapped_actions'], x.ticket_obj.number) , axis=1)
    bert_score_data = pd.DataFrame(list(itertools.chain(*bert_score_data.values.tolist())), columns=['number','actual','predicted'])
    bert_score_data.loc[:,'bert_score'] = compute_bert_scores(bert_score_data['actual'].values.tolist(), bert_score_data['predicted'].values.tolist())
    bert_score_data.loc[bert_score_data.bert_score<0, 'bert_score'] = 0.0
    bert_score_data.index = bert_score_data.number.values.tolist()
    df_.loc[:,'best_bert_scores'] = df_.ticket_obj.map(lambda x: bert_score_data.bert_score.loc[[x.number]].values.tolist() if x.number in bert_score_data.index else [])
    return df_
    


In [18]:
def evaluate_on_k(test_df, k=5):

    test_df.loc[:,'neighbors'] = test_df.all_neighbors.map(lambda x: x[:k])
    test_df.loc[:,'predicted_actions'] = test_df.neighbors.apply(lambda x: set(itertools.chain(*[train_df.res_actions.loc[a[0]] for a in x]))).map(list)
    test_df.loc[:,'mapped_actions'] = test_df.apply(lambda x: map_best_actions(x['res_actions'],x['predicted_actions'], None), axis=1)
    test_df = generate_best_meteor_scores(test_df)
    test_df = generate_best_bert_scores(test_df)
    test_df.loc[:,'scores_sum'] = test_df.apply(lambda x: (np.mean([x['best_meteor_scores'], x['best_bert_scores']], axis=0).sum()), axis=1) 
    test_df.loc[:, 'recall'] = test_df.apply(lambda x: x['scores_sum']/len(x['res_actions']), axis=1).values.tolist()
    test_df.loc[:, 'precision'] = test_df.apply(lambda x: x['scores_sum']/len(x['predicted_actions']) if len(x['predicted_actions'])!=0 else 0, axis=1).values.tolist()
    test_df.loc[:, 'precision'] = test_df.precision.map(lambda x: 1.0 if x > 1.0 else x)
    test_df.loc[:, 'recall'] = test_df.recall.map(lambda x: 1.0 if x > 1.0 else x)
    beta = 1
    test_df.loc[:, 'f1_score'] = (((1+beta**2)*test_df['precision']*test_df['recall'])/((beta**2)*test_df['precision']+test_df['recall'])).fillna(0).values.tolist()


    return test_df[['precision','recall','f1_score']].mean()
    

In [19]:
results = pd.DataFrame(columns=['K','Average Precision','Average Recall','Average F1-Score'])
for k in range(1,11):
    r = evaluate_on_k(test_df, k=k)
    results.loc[k] = [k]+r.values.round(4).tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

Se

In [20]:
results.to_pickle('../data/gensen_results.pkl')

In [21]:
results

Unnamed: 0,K,Average Precision,Average Recall,Average F1-Score
1,1.0,0.4308,0.4307,0.4208
2,2.0,0.4058,0.4879,0.4258
3,3.0,0.3937,0.498,0.4187
4,4.0,0.384,0.511,0.4126
5,5.0,0.3757,0.5204,0.406
6,6.0,0.3757,0.5204,0.406
7,7.0,0.3757,0.5204,0.406
8,8.0,0.3757,0.5204,0.406
9,9.0,0.3757,0.5204,0.406
10,10.0,0.3757,0.5204,0.406
