In [37]:
from typing import Tuple, Dict
import pandas as pd
import numpy as np
import pickle
from sklearn.manifold import TSNE
import plotly.express as px
import plotly
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import csr_matrix
import implicit
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

In [38]:
import warnings
warnings.filterwarnings("ignore")

In [39]:
## Load data from APP DATA folder
# NOTE: pickle.load can execute arbitrary code while deserializing; only point
# these paths at artifacts produced by a trusted pipeline.

def _load_pickle(path):
    """Return the single object stored in the pickle file at ``path``."""
    with open(path, 'rb') as f:
        return pickle.load(f)


matrix = _load_pickle('../data/features/matrix.pkl')
prof_index_to_prof_name = _load_pickle('../data/features/prof_index_to_prof_name.pkl')

# Left disabled, exactly as in the original cell:
# quety_to_prof_index = _load_pickle('../data/features/quety_to_prof_index.pkl')

skill_index_to_corrected = _load_pickle('../data/features/skill_index_to_corrected.pkl')
skill_original_to_index = _load_pickle('../data/features/skill_original_to_index.pkl')

# Tabular features are read from CSV.
skill_df = pd.read_csv('../data/features/skills.csv')
prof_df = pd.read_csv('../data/features/prof.csv')

In [40]:
# Display the profession index -> profession name mapping loaded above.
prof_index_to_prof_name

{0: 'Системный аналитик',
 1: 'Администратор баз данных',
 2: 'NLP',
 3: 'Data Scientist',
 4: 'Аналитик данных',
 5: 'Аналитик BI',
 6: 'Computer Vision',
 7: 'Big Data',
 8: 'Инженер данных',
 9: 'ML инженер',
 10: 'Аналитик',
 11: 'Бизнес-аналитик',
 12: 'Продуктовый аналитик'}

План

- [x] Skill t-SNE
- [x] Skill ALS + t-SNE (нормализация, альфа...)
- [x] Prof ALS + t-SNE
- [x] можно срезы/фильтры
    - [x] наиболее популярных навыков (как считать популярность?)
    - [x] бины по зп (перестраивать матрицу?)
    - [x] отдельные профессии (можно кликать в легенде)
- [x] раскрасить можно по профессии (где навык наиболее востребован)
- [x] почитать что там я хотел сделать по раскраске
    - [x] цвет по популярной профессии
    - [x] размер по взвешенной популярности
- [x] а действительно, нужна ли кластеризация?

In [41]:
def prepare_plot_df(skill_df: pd.DataFrame,
                    matrix: np.array,
                    skill_index_to_corrected: Dict[int, str],
                    prof_index_to_corrected: Dict[int, str],
                    top_n_skill_per_profession: int = 200,
                    salary: Tuple[float, float] = None,
                    norm_type: str = 'none',
                    alg_type: str = 'tnse',
                    random_state: int = None
                    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Prepare the data frames used to plot the skill map and the profession map.

    Parameters
    ----------
    skill_df: pd.DataFrame
        Data frame of skills. It must contain a ``skill_id`` column and one
        column per profession name (the values of ``prof_index_to_corrected``);
        those columns are used to rank skills and to size the markers.
        ``salary_q25`` / ``salary_q75`` columns are required only when
        ``salary`` is given.
    matrix: np.array
        Skill to Profession matrix (skills in rows, professions in columns).
    skill_index_to_corrected: Dict[int, str]
        Dictionary from skill index to skill name.
    prof_index_to_corrected: Dict[int, str]
        Dictionary from profession index to profession name.
    top_n_skill_per_profession: int
        Maximum number of skills kept for each profession (``None`` keeps all).
    salary: Tuple[float, float]
        Optional ``(from, to)`` salary filter: a skill is kept when
        ``salary_q75 > from`` and ``salary_q25 <= to``.
        The original author advises leaving it unset because too few
        vacancies carry a salary.
    norm_type: str
        How to normalize the skill-profession matrix: ``'none'``, ``'skill'``
        (each skill sums to 1 over professions) or ``'prof'`` (each profession
        sums to 1 over skills).
    alg_type: str
        Algorithm for the skill map: ``'tnse'`` (t-SNE on the matrix columns;
        the historical spelling is kept for compatibility) or ``'als'``
        (t-SNE on the ALS item factors).
    random_state: int
        Optional seed forwarded to both ALS and t-SNE. The default ``None``
        keeps the previous unseeded behaviour.

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame] :
        - DataFrame for the skill map plot (columns ``x``, ``y``, ``skill_id``,
          ``Навык``, ``Профессия``, ``size``); a skill appears once per
          profession that selected it. ``None`` when there are no professions.
        - DataFrame for the profession map plot (columns ``x``, ``y``,
          ``Профессия``).

    Raises
    ------
    ValueError
        If ``norm_type`` or ``alg_type`` is not one of the supported values.
    """
    # Validate up front so a typo does not cost a full ALS fit first.
    if norm_type not in ('none', 'skill', 'prof'):
        raise ValueError("norm_type parametr must be ('none', 'skill', 'prof')")
    if alg_type not in ('tnse', 'als'):
        raise ValueError("alg_type parametr must be ('tnse', 'als')")

    def apply_filter(df, top_n_skill_per_profession, salary):
        if salary is not None:
            df = df[(df.salary_q75 > salary[0]) & (df.salary_q25 <= salary[1])]
        if top_n_skill_per_profession is not None:
            df = df.head(top_n_skill_per_profession)
        return df

    # Rank and filter the skills once per profession; the result is reused
    # below both for the matrix slice and for the marker sizes.
    top_skills_per_prof = [
        (p, apply_filter(skill_df.sort_values(p, ascending=False),
                         top_n_skill_per_profession, salary))
        for p in prof_index_to_corrected.values()
    ]

    # Union of the selected skill ids, first-seen order, without duplicates.
    skill_filter_indexes = list(dict.fromkeys(
        skill_id
        for _, top_skills in top_skills_per_prof
        for skill_id in top_skills.skill_id.tolist()
    ))

    # Normalization (m has professions in rows and skills in columns).
    m = matrix[skill_filter_indexes, :].T
    if norm_type != 'none':
        axis = 0 if norm_type == 'skill' else 1
        totals = np.sum(m, axis=axis, keepdims=True)
        # An all-zero row/column would turn into NaN; leave it as zeros instead.
        m = m / np.where(totals == 0, 1, totals)

    # als is needed for professions because t-SNE doesn't work with few objects
    csr = csr_matrix(m)
    model = implicit.als.AlternatingLeastSquares(
        factors=2, regularization=0.0, iterations=20, alpha=10,
        random_state=random_state)
    model.fit(csr)
    profs_xy = model.user_factors

    tsne = TSNE(n_components=2, random_state=random_state)
    if alg_type == 'tnse':
        skills_xy = tsne.fit_transform(m.T)
    else:  # 'als'
        skills_xy = tsne.fit_transform(model.item_factors)

    df = pd.DataFrame(skills_xy).rename(columns={0: 'x', 1: 'y'})
    df['skill_id'] = skill_filter_indexes
    df['Навык'] = [skill_index_to_corrected[i] for i in skill_filter_indexes]

    # Make sure every profession gets its own point for each of its skills
    # (one skill may belong to several professions).
    frames = []
    for p, top_skills in top_skills_per_prof:
        # .copy() so the column assignments below never write into a view of df.
        df_for_prof = df[df.skill_id.isin(top_skills.skill_id)].copy()
        df_for_prof['Профессия'] = p

        # Marker size: min-max scaled value of the profession column.
        freq = top_skills[p]
        min_f = freq.min()
        max_f = freq.max()
        if max_f - min_f < 1e-10:
            # Constant column: avoid dividing by zero, every size becomes 0.8.
            max_f += 1
        sizes = 15 * ((freq - min_f) / (max_f - min_f)) + 0.8
        size_dict = dict(zip(top_skills.skill_id, sizes))
        df_for_prof['size'] = df_for_prof.skill_id.map(size_dict)

        frames.append(df_for_prof)

    df_plot_skill = pd.concat(frames) if frames else None

    df_plot_prof = pd.DataFrame(profs_xy).rename(columns={0: 'x', 1: 'y'})
    df_plot_prof['Профессия'] = [prof_index_to_corrected[i] for i in df_plot_prof.index]

    return df_plot_skill, df_plot_prof

In [42]:
# Build the plot frames for the baseline setup:
# top-100 skills per profession, no normalization, t-SNE on the raw matrix.
df_plot_skill, df_plot_prof = prepare_plot_df(
    skill_df,
    matrix,
    skill_index_to_corrected,
    prof_index_to_prof_name,
    top_n_skill_per_profession=100,
    norm_type='none',
    alg_type='tnse',
)



  0%|          | 0/20 [00:00<?, ?it/s]

In [43]:
def plot_skill_map(df: pd.DataFrame, width=1000, height=600) -> plotly.graph_objs.Figure:
    """Build the skill-map scatter figure.

    Parameters
    ----------
    df: pd.DataFrame
        Frame with the columns 'x', 'y', 'size', 'Профессия' and 'Навык'.
        Points are coloured by 'Профессия', sized by 'size', and the hover
        label shows only 'Навык'.
    width, height:
        Figure dimensions forwarded to plotly.

    Returns
    -------
    plotly.graph_objs.Figure
        Scatter figure with both axes hidden.
    """
    # Every regular hover field is switched off; only hover_name remains.
    hidden_hover_fields = {'x': False, 'y': False, 'size': False, 'Профессия': False}
    marker_style = dict(opacity=0.7, line=dict(width=0.5, color='DarkSlateGrey'))

    fig = px.scatter(
        df,
        x='x',
        y='y',
        color='Профессия',
        size='size',
        hover_name='Навык',
        hover_data=hidden_hover_fields,
        color_discrete_sequence=px.colors.qualitative.Plotly,
        title=None,
        width=width,
        height=height,
    )
    fig.update_traces(marker=marker_style, selector=dict(mode='markers'))

    # The embedding coordinates carry no meaning on their own, so hide the axes.
    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)
    return fig

# Render the skill map for the frame currently held in df_plot_skill.
plot_skill_map(df_plot_skill).show()


In [44]:
def plot_prof_map(df: pd.DataFrame, width=1000, height=600) -> plotly.graph_objs.Figure:
    """Build the profession-map scatter figure.

    Parameters
    ----------
    df: pd.DataFrame
        Frame with the columns 'x', 'y' and 'Профессия'; points are coloured
        by 'Профессия', which is also listed in the hover data.
    width, height:
        Figure dimensions forwarded to plotly.

    Returns
    -------
    plotly.graph_objs.Figure
        Scatter figure with fixed-size markers and both axes hidden.
    """
    marker_style = dict(size=20, line=dict(width=0.5, color='DarkSlateGrey'))

    fig = px.scatter(
        df,
        x='x',
        y='y',
        color='Профессия',
        hover_data=['Профессия'],
        title=None,
        width=width,
        height=height,
    )
    fig.update_traces(marker=marker_style, selector=dict(mode='markers'))

    # The embedding coordinates carry no meaning on their own, so hide the axes.
    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)
    return fig

# Render the profession map for the frame currently held in df_plot_prof.
plot_prof_map(df_plot_prof).show()

In [45]:
# Compare skill maps over a grid: skills kept per profession x
# (normalization, algorithm) pairs.
for top in (50, 100, 200):
    for n, a in (('none', 'tnse'), ('prof', 'tnse'), ('skill', 'als')):
        df_plot_skill, df_plot_prof = prepare_plot_df(
            skill_df,
            matrix,
            skill_index_to_corrected,
            prof_index_to_prof_name,
            top_n_skill_per_profession=top,
            norm_type=n,
            alg_type=a,
        )

        # Label the figure that follows in the cell output.
        print(f'{n}-{a} {top}')
        plot_skill_map(df_plot_skill).show()

        # Optional extras, left disabled as in the original cell:
        #plot_prof_map(df_plot_prof).show()
        #df_plot_skill.to_csv(f'data/plot_df/{a}-{n}-{top}-skill.csv')
        #df_plot_prof.to_csv(f'data/plot_df/{a}-{n}-{top}-prof.csv')




  0%|          | 0/20 [00:00<?, ?it/s]

none-tnse 50


  0%|          | 0/20 [00:00<?, ?it/s]

prof-tnse 50


  0%|          | 0/20 [00:00<?, ?it/s]

skill-als 50


  0%|          | 0/20 [00:00<?, ?it/s]

none-tnse 100


  0%|          | 0/20 [00:00<?, ?it/s]

prof-tnse 100


  0%|          | 0/20 [00:00<?, ?it/s]

skill-als 100


  0%|          | 0/20 [00:00<?, ?it/s]

none-tnse 200


  0%|          | 0/20 [00:00<?, ?it/s]

prof-tnse 200


  0%|          | 0/20 [00:00<?, ?it/s]

skill-als 200


In [46]:
# Same comparison with only the top 15 skills per profession (t-SNE variants only).
for top in (15,):
    for n, a in (('none', 'tnse'), ('prof', 'tnse')):
        df_plot_skill, df_plot_prof = prepare_plot_df(
            skill_df,
            matrix,
            skill_index_to_corrected,
            prof_index_to_prof_name,
            top_n_skill_per_profession=top,
            norm_type=n,
            alg_type=a,
        )

        # Label the figure that follows in the cell output.
        print(f'{n}-{a} {top}')
        plot_skill_map(df_plot_skill).show()

        # Optional extras, left disabled as in the original cell:
        #plot_prof_map(df_plot_prof).show()
        #df_plot_skill.to_csv(f'data/plot_df/{a}-{n}-{top}-skill.csv')
        #df_plot_prof.to_csv(f'data/plot_df/{a}-{n}-{top}-prof.csv')




  0%|          | 0/20 [00:00<?, ?it/s]

none-tnse 15


  0%|          | 0/20 [00:00<?, ?it/s]

prof-tnse 15
