### Workflow

1. Per la prima stringa si suggeriscono solo le skill che iniziano per la stringa inserita fino a quel momento.\
TODO: decidere quali motrare, quelle più lunghe? a caso?


2. Per le successive stringe si considerano le skill inserite fino a quel momento: partendo dalla stringa inserita si ricavano le skill che contengono quella stringa, come in 1, poi per ognuna di queste si calcola la similarità del coseno con le skill già inserite e si suggerisce la skill con la media delle similarità più altra\
TODO: decidere quante skill mostrare. Usare la gerarchia delle skill come altra metrica di suggerimento, ad esempio in base alla distanza dal primo antenato in comune


3. suggerire anche una skill che non contenga la stringa inserita fino a quel momento ma che sia semplicemente la più simile con quelle inserite precedentemente, calcolata come per 2


4. Gestire gli errori di typo: individuarli e suggerire un'alternativa

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from bisect import bisect_left
import ipywidgets as widgets
import difflib

In [41]:
class SuggestsTextArea:
    
    def __init__(self):
        self.vectors = self.load_vectors()
        self.batch = np.array(list(self.vectors.values()))[0:100]
        self.cos_sim_matrix = pd.DataFrame(cosine_similarity(self.batch), 
                                           columns=list(self.vectors.keys())[0:100],
                                           index=list(self.vectors.keys())[0:100])

        self.layout = widgets.Layout(flex='0 1 auto', height='100px', min_height='100px', width='auto')
        self.suggest_buttons = list()
        self.textArea = widgets.Textarea(
            value='',
            placeholder='Type something',
            description='Skill:',
            disabled=False,
            tooltip='Enter the name of the Text field',
            height='90px', 
            layout=self.layout
        )
        
    
    def load_vectors(self):
        '''
        Load embeddings vectors, return dict skill: vector
        '''
        with open("ft_vectors_cbow_50_10_0_05.vec", "r") as f:
            res = dict()
            next(f)
            for line in f:
                skill = line.split(' ', 1)[0]
                vec = np.array(line.split(' ', 1)[1].split(' ')[:-1]).astype(np.float)
                #vec = np.array(list(line.split(' ', 1)[1].splitlines()))
                res[skill] = vec
            return res
    
    
    def find_sub_sting(self, word, words_list):
        '''
        Find all skill that contain the new string insert by user
        '''
        return [w for w in sorted(words_list) if w.lower().startswith(word.lower())]
    
    
    def get_best_similarity_skill(self, context, new_input):
        '''
        Get the four most similarity skills respect of the user's input
        '''
        find_suggests = self.find_sub_sting(new_input, list(self.vectors.keys())[0:100])[:4]
        candidate_skill = self.cos_sim_matrix.loc[context, find_suggests]
        return candidate_skill.mean(axis = 0).sort_values(ascending=False)
        #return candidate_skill.mean(axis = 0).idxmax(), candidate_skill.mean(axis = 0).max()
        
    
    def show_textarea(self):
        '''
        Show the text area widget
        '''
        display(self.textArea)
        self.textArea.observe(self.suggests_manager, names='value')
        
        
    def clear_skill(self, skill):
        '''
        Replace the _ whith space
        '''
        return skill.replace('_', ' ')
    
    
    def suggests_manager(self, widget):
        '''
        Main function of the class, manage the suggestions in different case
        '''
        self.close_widgets()
        old_input = widget['old']
        new_input = widget['new'].split(' ')[-1]
        context = old_input.split(', ')[:-1]
        if context != []:
            sug = self.get_best_similarity_skill(context, new_input)
            for index, value in sug.items():
                b = widgets.Button(
                    description=f'{index} - {round(value*100, 2)}%', # fix negative similarity: e.g. -35%
                    disabled=False,
                    button_style='success', # 'success', 'info', 'warning', 'danger' or ''
                    tooltip='Click me'
                )

                b.on_click(self.my_event_handler)
                self.suggest_buttons.append(b)
            
        else:
            if len(new_input)>3:
                find_suggests = self.find_sub_sting(new_input, list(self.vectors.keys())[0:100])[:4]
                grammatical_suggests = [self.clear_skill(s) for s in find_suggests]
                    
                    
                for sub in grammatical_suggests:

                    b = widgets.Button(
                        description=sub,
                        disabled=False,
                        button_style='', # 'success', 'info', 'warning', 'danger' or ''
                        tooltip='Click me'
                    )

                    b.on_click(self.my_event_handler)
                    self.suggest_buttons.append(b)
        display(widgets.HBox(self.suggest_buttons))


    def my_event_handler(self, btn_object):
        '''
        Button click handler, add the skill pressed to the text area
        '''
        skill_list = self.textArea.value.split(' ')
        skill_list[-1] = btn_object.description.split(' ')[0]
        new_contest = ' '.join(skill_list) + ', '
        self.textArea.value = new_contest
        self.close_widgets()
    
    
    def close_widgets(self):# fix button go down on every input
        '''
        Remove button of old suggestions
        '''
        for w in self.suggest_buttons:
            w.close()
        self.suggest_buttons = list()

    
    

In [39]:
t = SuggestsTextArea()
t.show_textarea()

Textarea(value='', description='Skill:', layout=Layout(flex='0 1 auto', height='100px', min_height='100px', wi…

HBox()

HBox()

HBox()

HBox(children=(Button(description='manage', style=ButtonStyle(), tooltip='Click me'), Button(description='mana…

HBox()

HBox(children=(Button(button_style='success', description='technical - 44.0%', style=ButtonStyle(), tooltip='C…

HBox(children=(Button(button_style='success', description='technical - 44.0%', style=ButtonStyle(), tooltip='C…

HBox()

HBox(children=(Button(button_style='success', description='technical - 44.0%', style=ButtonStyle(), tooltip='C…

HBox(children=(Button(button_style='success', description='technical - 44.0%', style=ButtonStyle(), tooltip='C…

HBox(children=(Button(button_style='success', description='technical - 44.0%', style=ButtonStyle(), tooltip='C…

HBox(children=(Button(button_style='success', description='technical - 44.0%', style=ButtonStyle(), tooltip='C…

HBox(children=(Button(button_style='success', description='technical - 44.0%', style=ButtonStyle(), tooltip='C…

HBox()

HBox(children=(Button(button_style='success', description='technical - 44.0%', style=ButtonStyle(), tooltip='C…

HBox(children=(Button(button_style='success', description='technical - 44.0%', style=ButtonStyle(), tooltip='C…

HBox(children=(Button(button_style='success', description='technical - 44.0%', style=ButtonStyle(), tooltip='C…

HBox(children=(Button(button_style='success', description='teams - 41.85%', style=ButtonStyle(), tooltip='Clic…

HBox(children=(Button(button_style='success', description='teams - 41.85%', style=ButtonStyle(), tooltip='Clic…

HBox(children=(Button(button_style='success', description='ability - 44.74%', style=ButtonStyle(), tooltip='Cl…

HBox(children=(Button(button_style='success', description='ability - 29.79%', style=ButtonStyle(), tooltip='Cl…

HBox(children=(Button(button_style='success', description='ability - 29.79%', style=ButtonStyle(), tooltip='Cl…

In [87]:
list(res.keys())[0:100]

['team',
 'development',
 'business',
 '</s>',
 'support',
 'knowledge',
 'software',
 'data',
 'technical',
 'management',
 'client',
 'systems',
 'based',
 'solutions',
 'including',
 'design',
 'projects',
 'services',
 'technology',
 'environment',
 'engineer',
 'clients',
 'project',
 'ensure',
 'provide',
 'developer',
 'manager',
 'understanding',
 'technologies',
 'service',
 'delivery',
 'ability',
 'develop',
 'engineering',
 'security',
 'leading',
 'teams',
 'quality',
 'training',
 'customers',
 'product',
 'essential',
 'experienced',
 'system',
 'deliver',
 'responsible',
 'help',
 'products',
 'process',
 'global',
 'communication',
 'customer',
 'organisation',
 'developing',
 'tools',
 'able',
 'use',
 'manage',
 'infrastructure',
 'testing',
 'processes',
 'digital',
 'responsibilities',
 'build',
 'agile',
 'analysis',
 'industry',
 'offer',
 'providing',
 'java',
 'network',
 'office',
 'managing',
 'analyst',
 'supporting',
 'performance',
 'test',
 'staff',
 'pla

In [113]:
df = pd.DataFrame({"A":[12, 4, 5, 44, 1], 
                   "B":[50, 2, 54, 3, 2],  
                   "C":[20, 16, 7, 3, 8], 
                   "D":[14, 3, 17, 2, 6]}) 

In [114]:
df.mean(axis = 0).idxmax()

'B'

In [5]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine

In [4]:
res = dict()
with open("ft_vectors_cbow_50_10_0_05.vec", "r") as f:
    res = dict()
    next(f)
    for line in f:
        skill = line.split(' ', 1)[0]
        vec = np.array(line.split(' ', 1)[1].split(' ')[:-1]).astype(np.float)
        res[skill] = vec

In [35]:
batch = np.array(list(res.values())[0:100])

cos_sim_matrix = cosine_similarity(batch)

In [47]:
df = pd.DataFrame(cos_sim_matrix, 
                  columns=list(res.keys())[0:100],
                  index=list(res.keys())[0:100])

In [91]:
df.loc[['team', 'support'], ['team']]

Unnamed: 0,team
team,1.0
support,0.458356


In [51]:
df

Unnamed: 0,team,development,business,</s>,support,knowledge,software,data,technical,management,...,cloud,complex,implementation,range,developers,delivering,needs,architecture,stakeholders,professional
team,1.000000,0.546168,0.407871,-0.367190,0.458356,-0.211330,0.362891,-0.064269,0.403233,0.107772,...,-0.011570,0.141126,0.213786,0.286247,0.753220,0.459311,0.211625,0.156031,0.319186,0.422196
development,0.546168,1.000000,0.313898,-0.421346,0.348559,0.409568,0.762928,-0.003895,0.595459,0.317950,...,0.138863,0.392033,0.614209,0.427812,0.682206,0.534964,0.175480,0.566459,0.253751,0.336354
business,0.407871,0.313898,1.000000,-0.272169,0.415405,-0.044261,0.172432,0.327413,0.520879,0.405341,...,0.058435,0.476450,0.527861,0.322919,0.313967,0.572787,0.631644,0.353716,0.711306,0.306167
</s>,-0.367190,-0.421346,-0.272169,1.000000,-0.324000,-0.182697,-0.357393,-0.157324,-0.366929,-0.312721,...,-0.054036,-0.466842,-0.385026,-0.343838,-0.324776,-0.528286,-0.275094,-0.335376,-0.232023,-0.207505
support,0.458356,0.348559,0.415405,-0.324000,1.000000,0.039474,0.233995,0.043679,0.536830,0.505280,...,-0.022496,0.208792,0.569804,0.383999,0.174114,0.425402,0.379612,0.085279,0.294910,0.401248
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
delivering,0.459311,0.534964,0.572787,-0.528286,0.425402,0.161763,0.365145,0.091258,0.537797,0.475968,...,0.154979,0.586657,0.590884,0.480128,0.333837,1.000000,0.427695,0.384868,0.499087,0.423507
needs,0.211625,0.175480,0.631644,-0.275094,0.379612,-0.084129,-0.003881,0.141243,0.439825,0.125760,...,-0.063518,0.416024,0.346387,0.307621,0.203530,0.427695,1.000000,0.261225,0.576213,0.304516
architecture,0.156031,0.566459,0.353716,-0.335376,0.085279,0.364180,0.484540,0.296331,0.595392,0.318008,...,0.575766,0.505697,0.647815,0.121020,0.327489,0.384868,0.261225,1.000000,0.354468,-0.016229
stakeholders,0.319186,0.253751,0.711306,-0.232023,0.294910,0.081870,0.032649,0.245904,0.538241,0.486596,...,-0.077411,0.513087,0.497280,0.221552,0.253490,0.499087,0.576213,0.354468,1.000000,0.248894


In [27]:
cos_sim_matrix

array([[ 1.        ,  0.54616796,  0.40787082, ...,  0.15603149,
         0.31918593,  0.42219576],
       [ 0.54616796,  1.        ,  0.31389837, ...,  0.56645873,
         0.25375117,  0.33635385],
       [ 0.40787082,  0.31389837,  1.        , ...,  0.35371594,
         0.71130618,  0.30616668],
       ...,
       [ 0.15603149,  0.56645873,  0.35371594, ...,  1.        ,
         0.35446754, -0.01622902],
       [ 0.31918593,  0.25375117,  0.71130618, ...,  0.35446754,
         1.        ,  0.24889377],
       [ 0.42219576,  0.33635385,  0.30616668, ..., -0.01622902,
         0.24889377,  1.        ]])

In [12]:
e[0:2]

array([[ 3.6812e+00,  8.0038e+00, -1.1202e+01, -5.5041e+00, -9.2364e-01,
        -3.0491e+00,  3.9270e+00, -1.1008e+00,  2.2651e+00,  1.7978e+00,
        -2.5573e+00, -2.8215e+00,  5.7468e+00,  5.0150e+00, -1.2353e+00,
        -1.2950e+00, -4.9979e+00, -1.8071e+00, -3.0039e+00, -1.1048e+00,
         1.1357e+01, -2.4231e+00, -4.2181e+00,  3.7512e+00, -1.0883e+01,
        -5.0247e+00, -3.0746e-01,  3.2239e+00,  2.9669e+00, -2.9056e+00,
         2.5248e+00,  2.3988e+00, -5.9446e+00, -1.3846e+00,  6.4368e+00,
         2.0415e+00, -1.7573e+00, -4.9802e+00, -3.3233e+00,  6.0821e+00,
         2.4876e+00,  3.3052e+00,  1.0224e+00, -5.2584e+00, -5.8962e+00,
         4.2960e+00,  5.1563e+00,  7.1303e-01, -3.6906e+00,  4.7729e+00],
       [ 3.7171e+00,  7.4125e-01, -2.3379e+00, -1.3539e-01, -3.1659e+00,
        -2.2374e+00,  1.6443e+00, -4.9635e-01, -2.6406e+00,  2.0127e+00,
        -4.4795e-01,  3.8531e-01,  1.0746e+00, -1.5886e+00, -1.8618e+00,
        -3.8247e-01, -7.1371e-01, -2.3662e+00, -3.

In [21]:
len(e)

113656

In [49]:
mat = np.concatenate( e, axis=0 )

In [51]:
mat

array([  3.6812,   8.0038, -11.202 , ...,  -6.333 ,  -2.4687,   1.0271])

In [20]:
len(e)

113656

In [19]:
dist_out = cosine_similarity(e,e, dense_output=False)

TypeError: paired_cosine_distances() missing 1 required positional argument: 'Y'

In [18]:
dist_out[100000]

0.0