In [17]:
import pandas as pd
import os
import jsonpickle

from typing import Tuple, Dict, Union, List
import pandas as pd
import numpy as np
import pickle
from sklearn.manifold import TSNE
import plotly.express as px
import plotly
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import csr_matrix
import implicit
from implicit.nearest_neighbours import bm25_weight, tfidf_weight
from scipy.spatial.distance import cdist

import umap
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from scipy.sparse.linalg import svds
from sklearn.decomposition import PCA


In [18]:
import warnings
warnings.filterwarnings("ignore")

In [19]:
RANDOM_STATE = 42

In [20]:
def load(config_path : str) -> object:
    """load config from file
    exsabple of using: 
        self.config : ParserConfig = config.load(config_path) or ParserConfig()
        """

    jsonpickle.set_preferred_backend('json')
    jsonpickle.set_encoder_options('json', ensure_ascii=False)

    if config_path is not None and os.path.isfile(config_path):
        with open(config_path, 'r', encoding='utf-8') as f:
            return jsonpickle.decode(f.read())
    else:
        return None


In [21]:
# load reference pairs
skill_pairs : List[List[str]] = load("../cnf/skill_pairs.json")

# remove duplicates
print('all pairs count =', len(skill_pairs))
skill_pairs = [x for x in skill_pairs if [x[1], x[0]] not in skill_pairs]
print('unique pairs count =', len(skill_pairs))

all pairs count = 215
unique pairs count = 165


In [22]:
## Load data from APP DATA folder

with open('../data/features/matrix.pkl', 'rb') as f:
#with open('../data/features/matrix_name_tfidf.pkl', 'rb') as f:
#with open('../data/features/matrix_description_name_tfidf.pkl', 'rb') as f:
    matrix = pickle.load(f)

with open('../data/features/prof_index_to_prof_name.pkl', 'rb') as f:
    prof_index_to_prof_name = pickle.load(f)

# with open('../data/features/quety_to_prof_index.pkl', 'rb') as f:
#     quety_to_prof_index = pickle.load(f)

with open('../data/features/skill_index_to_corrected.pkl', 'rb') as f:
    skill_index_to_corrected = pickle.load(f)
    
with open('../data/features/skill_original_to_index.pkl', 'rb') as f:
    skill_original_to_index = pickle.load(f)
    
skill_df = pd.read_csv('../data/features/skills.csv')
prof_df = pd.read_csv('../data/features/prof.csv')

In [23]:
def prepare_plot_df(skill_df: pd.DataFrame,
                    matrix: np.array,
                    skill_index_to_corrected: Dict[int, str],
                    prof_index_to_corrected: Dict[int, str], 
                    top_n_skill_per_profession: int = 200, 
                    salary: Union[Tuple[float, float], None] = None,
                    norm_type: str = 'none',
                    factor_alg_type: str = 'none',
                    perplexity: int = 30,
                    early_exaggeration=12,
                    learning_rate=200,
                    dim: int = 5,
                    use_tsne: bool = True
                    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Prepare datafame for plotting
    
        Parameters
        ----------
        skill_df: pd.DataFrame
            Data frame of skills
        matrix: np.array
            Skill to Profession matrix (skill in rows, professions in columns)
        skill_index_to_corrected: Dict[int, str]
            Dictionary from index to skill name
        prof_index_to_corrected: Dict[int, str]
            Dictionary from index to profession name
        top_n_skill_per_profession: int
            maximum skills for each profession
        salary: Tuple[float, float]:
            filter by salary, from and to, or None
            !!! do not set because too low vacancies whith salary
        norm_type: str
            How normalize skill-prof matrix ('none', 'skill', 'prof', 'bm', 'tfidf')
        factor_alg_type: str
            Type of algorithm for skill map ('tsne', 'als', 'svd', 'pca', 'sim_euclid', 'sim_cos')

        Returns
        -------
        Tuple[pd.DataFrame, pd.DataFrame] : 
            - DataFrame for skill map plot
            - DataFrame for profession map plot
    
    
    """

    def apply_filter(df, top_n_skill_per_profession, salary):
        if salary is not None:
            df = df[(df.salary_q75 > salary[0]) & (df.salary_q25 <= salary[1])]
        if top_n_skill_per_profession is not None:
            df = df.head(top_n_skill_per_profession)
        return df

    # skill_filter_indexes - skills for plot
    skill_filter_indexes = []
    for p in prof_index_to_corrected.values():
        skills_per_profession = skill_df.sort_values(p, ascending=False)
        skills_per_profession = apply_filter(skills_per_profession, top_n_skill_per_profession, salary)

        skill_filter_indexes = skill_filter_indexes + \
            [x for x in skills_per_profession.skill_id if x not in skill_filter_indexes]

    # Normalization
    m = matrix[skill_filter_indexes, :].T

    if norm_type == 'skill':
        # norm for earch skill
        m = m / np.sum(m, axis=0, keepdims=True)
    elif norm_type == 'prof':
        # norm for each prof
        m = m / np.sum(m, axis=1, keepdims=True)
    elif norm_type == 'bm':
        m = np.asarray(bm25_weight(m).todense())
    elif norm_type == 'tfidf':
        m = np.asarray(tfidf_weight(m).todense())
    elif norm_type != 'none':
        raise ValueError("norm_type parametr must be ('none', 'skill', 'prof')")

    # als is needed for professions because T-NSE doesn't work with few objects
    csr = csr_matrix(m)
    model = implicit.als.AlternatingLeastSquares(factors=2, regularization=0.0, 
        iterations=20, alpha=10, random_state=RANDOM_STATE)
    model.fit(csr, show_progress=False)
    profs_xy = model.user_factors

    use_tsne = False

    if factor_alg_type == 'svd':
        skills_xy, _, _ = svds(csr.T, k=dim, random_state=RANDOM_STATE)
    elif factor_alg_type == 'pca':
        pca = PCA(n_components=dim)
        skills_xy = pca.fit_transform(m.T)
    elif factor_alg_type == 'tsne':
        #skills_xy = TSNE(n_components=2, random_state=RANDOM_STATE).fit_transform(m.T)
        skills_xy = m.T.copy()
        use_tsne = True
    elif factor_alg_type == 'als':
        skills_xy = model.item_factors
        use_tsne = True
    elif factor_alg_type == 'sim_euclid':
        skills_xy = euclidean_distances(m.T)
        use_tsne = True
    elif factor_alg_type == 'sim_cos':     
        skills_xy = cosine_similarity(m.T)
        use_tsne = True
    elif factor_alg_type == 'umap':
#         skills_xy = cosine_similarity(m.T)
        skills_xy = umap.UMAP(n_neighbors=10,
                              transform_seed=RANDOM_STATE,
                              random_state=RANDOM_STATE).fit_transform(m.T)
    else:
        raise ValueError("skills_xy parametr must be ('tsne', 'als', 'svd', 'pca', 'sim_euclid', 'sim_cos')")
    
    if use_tsne:
        skills_xy = TSNE(n_components=2, 
                         perplexity=perplexity, 
                         early_exaggeration=early_exaggeration,
                         learning_rate=learning_rate,
                         random_state=RANDOM_STATE).fit_transform(skills_xy)


    df = pd.DataFrame(skills_xy).rename(columns={0:'x',1:'y'})
    df['skill_id'] = list(skill_filter_indexes)
    df['Навык'] = df.skill_id.apply(lambda x: skill_index_to_corrected[x])

    # be ensure that for every profession has skill point 
    # (one skill may be in severall professions)
    df_plot_skill = None
    for p in prof_index_to_corrected.values():

        skills_per_profession = skill_df.sort_values(p, ascending=False)
        skills_per_profession = apply_filter(skills_per_profession, top_n_skill_per_profession, salary)
        ids = skills_per_profession.skill_id.to_numpy()

        df_for_prof = df[df.skill_id.isin(ids)]
        df_for_prof['Профессия'] = p

        min_f = skills_per_profession[p].min()
        max_f = skills_per_profession[p].max()
        if max_f - min_f < 1e-10:
            max_f += 1
        size_series = skills_per_profession[p].apply(lambda x: (x - min_f) / (max_f - min_f))
        size_dict = pd.Series(size_series.values, index=skills_per_profession.skill_id).to_dict()
        df_for_prof['size'] = df_for_prof.skill_id.apply(lambda x: 15 * size_dict[x] + 0.8)

        if df_plot_skill is None:
            df_plot_skill = df_for_prof
        else:
            df_plot_skill = pd.concat([df_plot_skill, df_for_prof])

    if profs_xy.shape[0] < 1000:
        df_plot_prof = pd.DataFrame(profs_xy).rename(columns={0:'x',1:'y'})
        df_plot_prof['Профессия'] = pd.Series(df_plot_prof.index).apply(lambda x: prof_index_to_corrected[x])
    else:
        # это потроение по th-idf признакам, а не профессиям -> нельзя построить карту профессий
        df_plot_prof = None

    return df_plot_skill, df_plot_prof

In [24]:
prof_index_to_prof_name.values()

dict_values(['Аналитик', 'Аналитик BI', 'Big Data', 'NLP', 'Data Scientist', 'Администратор баз данных', 'Продуктовый аналитик', 'Инженер данных', 'Аналитик данных', 'Computer Vision', 'Системный аналитик', 'ML инженер', 'Бизнес-аналитик'])

In [25]:
def plot_skill_map(df: pd.DataFrame, width=1000, height=600) -> plotly.graph_objs.Figure:



    color_list = [
        '#F8A19F', '#AA0DFE', '#3283FE', '#1CBE4F', '#C4451C', '#F6222E', 
        '#FE00FA', '#325A9B', '#FEAF16', 
        '#90AD1C', '#2ED9FF', '#B10DA1',
         '#909090', '#FBE426',
        '#FA0087', '#C075A6', '#FC1CBF'
    ]


    
    # custom visualization order
    # for better reproducability
    prof_order = [
        'Data Scientist', 'ML инженер', 'Computer Vision', 'NLP',
        'Инженер данных', 'Big Data', 'Администратор баз данных', 'Аналитик данных',
        'Аналитик', 'Бизнес-аналитик', 'Продуктовый аналитик', 'Аналитик BI',
        'Системный аналитик' ]

    fig = px.scatter(df, x='x', y='y',
                    color='Профессия', hover_name='Навык', 
                    hover_data= {'x':False, 'y':False, 'size':False, 'Профессия': False},
                    size='size', category_orders={'Профессия': prof_order},
                    #color_discrete_sequence=px.colors.qualitative.Plotly,
                    color_discrete_sequence=color_list,
                    title = None, width=width, height=height)

    fig.update_traces(marker=dict(opacity=0.7, line=dict(width=0.5, color='DarkSlateGrey')), 
                  selector=dict(mode='markers'))

    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)

    return fig

In [26]:
df_plot_skill, _ = prepare_plot_df(skill_df, matrix, skill_index_to_corrected,
                                              prof_index_to_prof_name, 
                                              top_n_skill_per_profession=100,
                                              norm_type='none', factor_alg_type='tsne')



In [27]:
df_plot_skill.drop_duplicates(subset=['x', 'y', 'skill_id', 'Навык'])

Unnamed: 0,x,y,skill_id,Навык,Профессия,size
0,15.656795,21.311247,44,Аналитическое мышление,Аналитик,15.800000
1,16.955860,22.553305,475,SQL,Аналитик,13.047110
2,16.089966,21.407011,435,Английский,Аналитик,12.093353
3,16.613667,21.205374,120,Анализ данных,Аналитик,11.182948
4,16.525524,19.591145,38,Анализ рынка,Аналитик,9.448844
...,...,...,...,...,...,...
423,12.228292,0.103422,1447,Системный подход,Бизнес-аналитик,0.869444
424,10.595517,0.796272,87,стрессоустойчивость,Бизнес-аналитик,0.800000
425,12.451846,-1.350383,1559,E-Commerce,Бизнес-аналитик,0.800000
426,11.786855,-0.947426,992,MS Project,Бизнес-аналитик,0.800000


In [28]:
df = df_plot_skill.drop_duplicates(subset=['x', 'y', 'skill_id'])
df.shape

(428, 6)

In [29]:
df.groupby(['x', 'y']).filter(lambda x: len(x) > 1)

Unnamed: 0,x,y,skill_id,Навык,Профессия,size
258,-5.855479,-0.457575,1339,Information Technology,Data Scientist,0.845181
260,-5.855479,-0.457575,1609,PD,Data Scientist,0.845181
264,-6.265494,-1.167121,1878,EAD,Data Scientist,0.8
265,-6.265494,-1.167121,4,ПВР,Data Scientist,0.8
266,-6.265494,-1.167121,1738,Search ranking,Data Scientist,0.8
284,-25.470648,-7.963351,1394,Oracle RAC,Администратор баз данных,0.938889
287,-25.470648,-7.963351,2050,barman,Администратор баз данных,0.938889


Это очень странно (выше, что разные навыки получили одну и ту же точку)

In [30]:
matrix.shape

(2160, 13)

In [31]:
np.unique(matrix, axis=0).shape

(709, 13)

Думаю объяснение выше: есть множество навыков, которые имеют абсолютно одиниковое предстваление в профессиях

Mean reciprocal rank https://en.wikipedia.org/wiki/Mean_reciprocal_rank

In [32]:
# df_plot_skill, _ = prepare_plot_df(skill_df, matrix, skill_index_to_corrected,
#                                               prof_index_to_prof_name, 
#                                               top_n_skill_per_profession=30,
#                                               norm_type='skill', alg_type='tnse')

In [33]:
def get_mean_reciprocal_rank(df: pd.DataFrame, pairs: List[List[str]]) -> Tuple[float, int]:
    """Mean reciprocal rank https://en.wikipedia.org/wiki/Mean_reciprocal_rank
        
        Parameters
        ----------
        df: pd.DataFrame
            Data frame with columns ['x', 'y', 'Навык']
        pairs: List[List[str]]
            Target nearest skills pairs
   
        Returns
        -------
        Tuple[float, int] : 
            - mean_reciprocal_rank
            - pairs count
    """

    df = df.drop_duplicates(subset=['x', 'y', 'Навык'])

    # берем только те строки, которые есть в тестовых парах
    # берем только те пары, навыки которых есть в данных

    skills_test_set = set([])
    for p in skill_pairs:
        skills_test_set = skills_test_set.union(set(p))

    df = df[df['Навык'].apply(lambda x: x in skills_test_set)]
    df = df.reset_index(drop=True)

    su = df['Навык'].unique()
    pairs = [x for x in pairs if x[0] in su and x[1] in su]

    # считаем расстояния в наших данных
    x = df[['x', 'y']].to_numpy()
    dist = cdist(x, x, 'sqeuclidean')

    cs = 0
    for p in pairs:
        ind_p0 = df[df['Навык'] == p[0]].index[0]
        ind_p1 = df[df['Навык'] == p[1]].index[0]

        nn = np.argsort(dist[ind_p0, :])
        rank = np.where(nn==ind_p1)[0][0]
        cs += 1/(rank+1)

        nn = np.argsort(dist[ind_p1, :])
        rank = np.where(nn==ind_p0)[0][0]
        cs += 1/(rank+1)

    mean_reciprocal_rank = cs / (2 * len(pairs))
    return mean_reciprocal_rank, len(pairs)

get_mean_reciprocal_rank(df_plot_skill, skill_pairs)

(0.09023473759231868, 106)

Метрика **mean_reciprocal_rank@pairs=50**. Стремимся увеличить метрику, но сохранив количество пар, которые встречаются в данных, на уровне не менее 50




In [34]:
# оптимизация
k = 50
best_norm_type = ''
best_alg_type = ''
best_top_n_skill_per_profession = -1
best_mean_reciprocal_rank = 0
best_dim = 0
best_use_tsne = False
stat = []
for norm_type in ['none', 'skill', 'prof', 'bm', 'tfidf']:
    for alg_type, dim, use_tsne in \
        [('tsne', 5, True),
         ('als', 5, True),
         ('svd', 5, True),
         ('svd', 2, False),
         ('pca', 5, True),
         ('pca', 2, False),
         ('sim_euclid', 5, True),
         ('sim_cos', 5, True),
         ('umap', 5, True),
         ('umap', 2, False), ]:

        for top_n_skill_per_profession in range(20, 51, 1):
            df, _ = prepare_plot_df(skill_df, matrix, skill_index_to_corrected,
                                    prof_index_to_prof_name, 
                                    top_n_skill_per_profession=top_n_skill_per_profession,
                                    norm_type=norm_type,
                                    factor_alg_type=alg_type)
            mrr, pairs_count = get_mean_reciprocal_rank(df, skill_pairs)

            print()
            print(norm_type, alg_type, dim, use_tsne, top_n_skill_per_profession)
            print(f'mrr = {mrr}, pairs_count = {pairs_count}')
            stat.append((norm_type, alg_type, dim, use_tsne, top_n_skill_per_profession, mrr, pairs_count))

            if pairs_count > k and mrr > best_mean_reciprocal_rank:
                best_mean_reciprocal_rank = mrr
                best_norm_type = norm_type
                best_alg_type = alg_type
                best_top_n_skill_per_profession = top_n_skill_per_profession
                best_dim = dim
                best_use_tsne = use_tsne

print()
print('best_mean_reciprocal_rank =', best_mean_reciprocal_rank)
print('best_norm_type =', best_norm_type)
print('best_alg_type =', best_alg_type)
print('best_dim =', best_dim)
print('best_use_tsne =', best_use_tsne)
print('best_top_n_skill_per_profession =', best_top_n_skill_per_profession)

df, _ = prepare_plot_df(skill_df, matrix, skill_index_to_corrected,
                                    prof_index_to_prof_name, 
                                    top_n_skill_per_profession=best_top_n_skill_per_profession,
                                    norm_type=best_norm_type,
                                    factor_alg_type=best_alg_type,
                                    dim=best_dim,
                                    use_tsne=best_use_tsne)

plot_skill_map(df).show()

##!!! Возможно нужно иметь топ-н алгоритмов, потому что есть алгоритмы с одинаковой метрикой или близкой...


none tsne 5 True 20
mrr = 0.15449722368222132, pairs_count = 29

none tsne 5 True 21
mrr = 0.16442567680822862, pairs_count = 29

none tsne 5 True 22
mrr = 0.16656083112861528, pairs_count = 29

none tsne 5 True 23
mrr = 0.16374826945765963, pairs_count = 33

none tsne 5 True 24
mrr = 0.15399815723123988, pairs_count = 35

none tsne 5 True 25
mrr = 0.15226638829903297, pairs_count = 36

none tsne 5 True 26
mrr = 0.14170962241174556, pairs_count = 37

none tsne 5 True 27
mrr = 0.15457838685657838, pairs_count = 38

none tsne 5 True 28
mrr = 0.14750190919833714, pairs_count = 39

none tsne 5 True 29
mrr = 0.14534031464728203, pairs_count = 39

none tsne 5 True 30
mrr = 0.14767879977003176, pairs_count = 41

none tsne 5 True 31
mrr = 0.1481827837506516, pairs_count = 42

none tsne 5 True 32
mrr = 0.1379521666721242, pairs_count = 43

none tsne 5 True 33
mrr = 0.12241419254353796, pairs_count = 47

none tsne 5 True 34
mrr = 0.12193851249727228, pairs_count = 47

none tsne 5 True 35
mrr = 

In [35]:
# Сравнить построение графика

# norm_type skill это плохо?

# очень интересно про косинусное растояние

In [36]:
# plot_skill_map(df_plot_skill).show()

In [37]:
df, _ = prepare_plot_df(skill_df, matrix, skill_index_to_corrected,
                                    prof_index_to_prof_name, 
                                    top_n_skill_per_profession=best_top_n_skill_per_profession,
                                    norm_type=best_norm_type,
                                    factor_alg_type=best_alg_type,
                                    dim=best_dim,
                                    use_tsne=best_use_tsne)

plot_skill_map(df).show()

In [38]:
df.to_csv(f'../notebooks/docker/front/data/best.csv')