### Recommend skills from job title

In [1]:
import pandas as pd

In [2]:
def load_data():
    """Build the job-title -> skills lookup from ``top_skills.csv``.

    Reads the CSV from the current working directory and collects, for each
    distinct ``job_title``, the values of ``recommended_skills_tfidf`` in the
    order the rows appear in the file.

    Returns:
        dict: job title mapped to the list of its recommended skills.
    """
    title_rows = pd.read_csv("top_skills.csv")
    skills_by_title = title_rows.groupby("job_title")["recommended_skills_tfidf"]
    # Iterating a grouped column yields (group key, Series of that group's values).
    return {title: list(skills) for title, skills in skills_by_title}

In [3]:
# Load the title -> skills lookup once; find_skills reads this module-level dict.
title_skills_dict = load_data()

In [4]:
def find_skills(input_title, top_n=5):
    """Return the first ``top_n`` skills stored for a job title.

    The lookup is an exact-match key lookup in the module-level
    ``title_skills_dict``.

    Args:
        input_title: Job title to look up.
        top_n: Maximum number of skills to return (default 5).

    Returns:
        A list with at most ``top_n`` skills, or ``None`` when the title is
        not a key of ``title_skills_dict``.
    """
    # Guard clause: unknown titles yield None rather than raising.
    if input_title not in title_skills_dict:
        return None
    return title_skills_dict[input_title][:top_n]

In [5]:
# Example: first 5 skills (default top_n) stored for the title "data scientist".
find_skills("data scientist")

['Data Science', 'Python', 'Machine Learning', 'Algorithm', 'Statistics']

In [6]:
# Example: first 5 skills (default top_n) stored for the title "python developer".
find_skills("python developer")

['Python',
 'Django',
 '(Structured Query Language) SQL',
 'Amazon Web Services (AWS)',
 'Software Development']

In [7]:
# Example: first 5 skills (default top_n) stored for the title "business developer".
find_skills("business developer")

['Business Development Techniques',
 'English',
 'Prospecting',
 'B2B',
 'Customer Relationship Management (CRM)']

### Recommend related skills from input skills

In [8]:
def load_skill_matrix():
    """Load the skill correlation matrix from ``skill_matrix.csv``.

    Rows are grouped by ``skill_id_1``. For each group, every row except the
    first is turned into a ``(skill_id_2, rescaled_factorized_correlation)``
    tuple.

    NOTE(review): the first row of every group is skipped. Presumably that
    row is the skill's correlation with itself -- confirm against the CSV.

    Returns:
        The result of the groupby-apply: a pandas object indexed by
        ``skill_id_1`` whose values are lists of tuples (not a plain dict,
        despite the name used by callers).
    """
    matrix = pd.read_csv("skill_matrix.csv")
    pair_columns = ["skill_id_2", "rescaled_factorized_correlation"]

    def _pairs_after_first(group):
        # Each row is materialised as a Series before becoming a tuple, so
        # both tuple members share the row's common dtype.
        return [tuple(row) for _, row in group.iloc[1:].iterrows()]

    return matrix.groupby("skill_id_1")[pair_columns].apply(_pairs_after_first)

In [9]:
def load_id_skill_dict():
    """Load the id <-> skill-name lookups from ``skill_matrix_production.xlsx``.

    Returns:
        tuple: ``(id_skill_dict, skill_id_dict)`` where the first maps each
        ``skill_id`` to its ``skill_name_en`` and the second maps each
        ``skill_name_en`` back to its ``skill_id``. When a key occurs more
        than once in the sheet, the last row wins.
    """
    sheet = pd.read_excel("skill_matrix_production.xlsx")
    name_by_id = {}
    id_by_name = {}
    # Fill both directions in a single pass over the paired columns.
    for skill_id, skill_name in zip(sheet["skill_id"], sheet["skill_name_en"]):
        name_by_id[skill_id] = skill_name
        id_by_name[skill_name] = skill_id
    return name_by_id, id_by_name

In [20]:
def find_skills(input_skills, top_n=10):
    """Recommend skills correlated with the given skills.

    Each input name is translated to its id via ``skill_id_dict``, the
    ``(id, score)`` pairs stored in ``skill_dict`` for those ids are pooled,
    ranked with ``sort_skills``, and translated back to names via
    ``id_skill_dict``. The input skills themselves are left out of the result.

    NOTE: this definition reuses the name of the earlier title-based
    ``find_skills`` in this notebook and therefore replaces it once executed.

    Args:
        input_skills: Iterable of skill names; each must be a key of
            ``skill_id_dict`` (an unknown name raises ``KeyError``).
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        list: up to ``top_n`` skill names, best ranked first.
    """
    requested_ids = [skill_id_dict[name] for name in input_skills]

    # Pool the correlated (id, score) pairs of every requested skill.
    candidate_pairs = []
    for requested in requested_ids:
        candidate_pairs.extend(skill_dict[requested])

    recommendations = []
    for candidate_id in sort_skills(candidate_pairs):
        if candidate_id in requested_ids:
            continue  # never recommend a skill the caller already supplied
        recommendations.append(id_skill_dict[candidate_id])
    return recommendations[:top_n]

In [16]:
def sort_skills(input_skill_score):
    """Rank skill ids by frequency, then by mean correlation score.

    Args:
        input_skill_score: Sequence of ``(id, score)`` pairs; an id may
            appear several times.

    Returns:
        list: the distinct ids as ints, ordered by how often each id occurs
        (most frequent first) and, within equal frequency, by the mean of
        its scores (highest first).
    """
    pairs = pd.DataFrame(input_skill_score, columns=["id", "score"])
    # One row per id holding its mean score; the occurrence count is aligned on id.
    per_id = pairs.groupby("id").mean().assign(freq=pairs["id"].value_counts())
    ranked = per_id.sort_values(by=["freq", "score"], ascending=False)
    return ranked.index.astype(int).tolist()

In [12]:
# Correlated-skill lookup keyed by skill_id_1; read by the skill-based find_skills.
skill_dict = load_skill_matrix()

In [13]:
# Two lookups over the same sheet: skill id -> name, and name -> skill id.
id_skill_dict, skill_id_dict = load_id_skill_dict()

In [21]:
# Example: up to 10 (default top_n) skills correlated with the inputs; the inputs themselves are excluded.
find_skills(["Python", "Java", "Django"])

['Programming',
 'C++',
 'HTML5',
 'Java - Hibernate',
 'XML - REST',
 'Perl',
 'Representational state transfe (REST)',
 'XML',
 'Java - Web Application',
 'Ruby ']