In [None]:
# !pip install spacy
# !python -m spacy download en_core_web_lg
# !pip install bert_score
# !pip install fasttext==0.9.1

# Load Libraries

In [1]:
from functions import *
from Sentence_ import Sentence_
from Ticket_ import Ticket_
from Ticket_Pair_ import Ticket_Pair_
import pandas as pd

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from bert_score import score

In [4]:
from nltk.translate.meteor_score import meteor_score
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/cloud_user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Load Dataset

In [5]:
final_to_consider = pd.read_pickle('"-- Set Respective Path HERE --" ')

# Embeddings Using Fasttext

In [6]:
import unidecode
import fasttext

In [None]:
contents = final_to_consider.ticket_obj.apply(lambda x: [a.text.lower().strip() for a in x.content_sents])
resolutions = final_to_consider.ticket_obj.apply(lambda x: [a.text.lower().strip() for a in x.resolution_sents])
contents = list(itertools.chain(*contents.values.tolist()))
resolutions = list(itertools.chain(*resolutions.values.tolist()))
all_sents = contents+resolutions

training_data_path = '../data/fasttext_unsup_data.txt'
with open(training_data_path, 'w') as f:
    f.write(unidecode.unidecode('\n'.join(all_sents)))

In [None]:
model = fasttext.train_unsupervised(training_data_path,'skipgram', lr=0.025,epoch=25,word_ngrams=2,bucket=200000,dim=300)

In [None]:
model.save_model('./custom_models/fasttext_skipgram.bin')

In [7]:
# skipgram = fasttext.load_model('./custom_models/fasttext_skipgram.bin')
# cbow = fasttext.load_model('./custom_models/fasttext_cbow.bin')
pretrained = fasttext.load_model('./custom_models/fasttext_simple_wiki.bin')
model = pretrained





In [8]:
get_fasttext_sent_emb = lambda sent : model.get_sentence_vector(unidecode.unidecode(sent.lower().replace('\n',' ')))
get_fasttext_mean_emb = lambda sent : np.sum([model.get_word_vector(word.lower()) for word in sent.split()], axis=0)


In [9]:
final_to_consider.loc[:,'fasttext_embs'] = final_to_consider.ticket_obj.map(lambda x: get_fasttext_sent_emb(x.content.text))


# Split Train/Test Data

In [10]:
sorted_idx = final_to_consider.ticket_obj.map(lambda x: x.date).sort_values().index
final_to_consider = final_to_consider.loc[sorted_idx]


In [11]:

train_percentage = 0.80
test_percentage = 0.20
train_break = int(final_to_consider.shape[0]*train_percentage)

train_df = final_to_consider[:train_break]
test_df = final_to_consider[train_break:]

In [12]:
content_test_vecs = test_df.fasttext_embs.values
content_train_vecs = train_df.fasttext_embs.values

# Apply KNN using BoW vectors

In [13]:
def get_neighbors_from_emb(test_vecs, train_vecs, k=5, threshold=0.8):
    
    if (len(test_vecs)==0) or (len(train_vecs)==0):
        return pd.DataFrame(columns=['neighbors'])
    
    test_emb = np.matrix(test_vecs.tolist())
    train_emb = np.matrix(train_vecs.tolist())
    
    similarity = cosine_similarity(test_emb, train_emb)
    neighbors = []
    similarity = np.round(similarity, 2)
    
    for i in range(similarity.shape[0]): 
        states = (similarity[i] >= threshold) & (similarity[0] <= 1)
        indices = np.where(states)[0]
        indices = indices[np.argsort(similarity[i][indices])[::-1][:k]]
        sim_scores = np.round(similarity[i][indices], 4)
        neighbors.append(list(tuple(zip(indices, sim_scores))))
    
    return pd.DataFrame([neighbors], index=['neighbors']).T

In [14]:
def map_best_actions(actual_actions, predicted_actions, res_vectorizer=None):
    
    if (len(actual_actions)==0) or (len(predicted_actions)==0):
        return pd.DataFrame()
    
    actual_actions_vecs = np.asarray([get_fasttext_sent_emb(action) for action in actual_actions])
    predicted_actions_vecs = np.asarray([get_fasttext_sent_emb(action) for action in predicted_actions])
    mapping_ = get_neighbors_from_emb(actual_actions_vecs, predicted_actions_vecs, k=1, threshold=0.0)
    
    if mapping_.shape[0] < 1:
        return pd.DataFrame()
    
    mapping_.index = actual_actions
    mapping_ = mapping_[mapping_.neighbors.map(len)>0]
    
    try:
        mapping_.neighbors = mapping_.neighbors.map(lambda x: (x[0][0], predicted_actions[x[0][0]], x[0][1]))
    except:
#         print (mapping_)
        raise Exception
    
    return mapping_

In [15]:
test_neighbors = get_neighbors_from_emb(content_test_vecs, content_train_vecs, k=10)

In [16]:
""" Map Index of Neighbor """
test_neighbors.neighbors = test_neighbors.neighbors.map(lambda x: [(train_df.iloc[[a[0]]].index[0], a[1]) for a in x])
test_df.loc[:,'all_neighbors'] = test_neighbors.neighbors.values.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


# Evaluation of Results

In [17]:
compute_meteor_score = lambda actual, prediction: round(meteor_score([actual], prediction), 4)
compute_bert_scores = lambda actual_list, pred_list : np.round((score(pred_list, actual_list, lang='en', model_type='bert-base-uncased', nthreads=32,rescale_with_baseline=True)[-1]).tolist(),4)


In [18]:
def get_mapped_data(result_df, inc='None'):
    
    if result_df.shape[0] < 1:
        return []
    
    incs = [inc]*result_df.shape[0]
    actual_labels = result_df.index.tolist()
    predicted_labels = result_df.neighbors.map(lambda x: x[1])
    return list(zip(incs, actual_labels, predicted_labels))

# Compute Meteor Score in Bunch 

In [19]:
def generate_best_meteor_scores(df_):
    """ Input : df_['best_actions'] """
    df_.loc[:,'best_meteor_scores'] = df_.mapped_actions.apply(lambda r: [compute_meteor_score(res[1],res[2]) for res in get_mapped_data(r)])
    return df_

# Compute BertScore in a Bunch

In [20]:
def generate_best_bert_scores(df_):
    """ Input : df_['best_actions'] """
    
    bert_score_data = df_.apply(lambda x: get_mapped_data(x['mapped_actions'], x.ticket_obj.number) , axis=1)
    bert_score_data = pd.DataFrame(list(itertools.chain(*bert_score_data.values.tolist())), columns=['number','actual','predicted'])
    bert_score_data.loc[:,'bert_score'] = compute_bert_scores(bert_score_data['actual'].values.tolist(), bert_score_data['predicted'].values.tolist())
    bert_score_data.loc[bert_score_data.bert_score<0, 'bert_score'] = 0.0
    bert_score_data.index = bert_score_data.number.values.tolist()
    df_.loc[:,'best_bert_scores'] = df_.ticket_obj.map(lambda x: bert_score_data.bert_score.loc[[x.number]].values.tolist() if x.number in bert_score_data.index else [])
    return df_
    


In [21]:
def evaluate_on_k(test_df, k=5):

    test_df.loc[:,'neighbors'] = test_df.all_neighbors.map(lambda x: x[:k])
    test_df.loc[:,'predicted_actions'] = test_df.neighbors.apply(lambda x: set(itertools.chain(*[train_df.res_actions.loc[a[0]] for a in x]))).map(list)
    test_df.loc[:,'mapped_actions'] = test_df.apply(lambda x: map_best_actions(x['res_actions'],x['predicted_actions'], None), axis=1)
    test_df = generate_best_meteor_scores(test_df)
    test_df = generate_best_bert_scores(test_df)
    test_df.loc[:,'scores_sum'] = test_df.apply(lambda x: (np.mean([x['best_meteor_scores'], x['best_bert_scores']], axis=0).sum()), axis=1) 
    test_df.loc[:, 'recall'] = test_df.apply(lambda x: x['scores_sum']/len(x['res_actions']), axis=1).values.tolist()
    test_df.loc[:, 'precision'] = test_df.apply(lambda x: x['scores_sum']/len(x['predicted_actions']) if len(x['predicted_actions'])!=0 else 0, axis=1).values.tolist()
    test_df.loc[:, 'precision'] = test_df.precision.map(lambda x: 1.0 if x > 1.0 else x)
    test_df.loc[:, 'recall'] = test_df.recall.map(lambda x: 1.0 if x > 1.0 else x)
    beta = 1
    test_df.loc[:, 'f1_score'] = (((1+beta**2)*test_df['precision']*test_df['recall'])/((beta**2)*test_df['precision']+test_df['recall'])).fillna(0).values.tolist()


    return test_df[['precision','recall','f1_score']].mean()

In [22]:
results = pd.DataFrame(columns=['K','Average Precision','Average Recall','Average F1-Score'])
for k in range(1,11):
    r = evaluate_on_k(test_df, k=k)
    results.loc[k] = [k]+r.values.round(4).tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

Se

In [23]:
results

Unnamed: 0,K,Average Precision,Average Recall,Average F1-Score
1,1.0,0.4574,0.4547,0.4429
2,2.0,0.4301,0.5391,0.4563
3,3.0,0.4113,0.5753,0.4487
4,4.0,0.3923,0.5882,0.4325
5,5.0,0.3793,0.6004,0.4197
6,6.0,0.3723,0.6107,0.4123
7,7.0,0.3656,0.6182,0.4049
8,8.0,0.3611,0.6228,0.3994
9,9.0,0.3554,0.6279,0.3924
10,10.0,0.3518,0.632,0.3879


In [24]:
results.to_pickle('../data/fasttext_pretrained_results.pkl')