# Designing a path for career conversion
### Prathamesh Karve

## Import necessary libraries

In [66]:
import os
import json
import pandas as pd

## Load the scraped profiles in pandas dataframe

In [78]:
path_to_json = '.'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
jsons_data = pd.DataFrame(columns=['name','is_intern','positions','educations','skills'])

### Data cleaning

In [79]:
# Function to detect acronym in skill name and discard the full name
def keepShorter(skill):
    inner = None
    if '(' in skill:
        inner  = skill.split('(', 1)[1].split(')')[0]
        before = skill.split('(')[0]
    
    if(inner):
        return before if len(before) < len(inner) else inner
    else:
        return skill
    
    print(skill)
    regPar = r'\(([^)]+)'
    match  = re.search('\(([^)]+)', skill)
    if(match):
        outer  = match.group(0)
        inner  = match.group(1)
        third  = match.group(2)
        print(inner, outer, third)
        
    else:
        return skill

for index, js in enumerate(json_files):
    with open(os.path.join(path_to_json, js)) as json_file:
        json_text   = json.load(json_file)
        
        name        = json_text['profile']['name']
        
        # Scan the headline to determine if the person is Intern or not
        is_intern   =  1 if ('Intern ' in json_text['profile']['headline']) else 0
        
        positions   = json_text['positions'][0]
        educations  = json_text['educations']
        
        # Process the skills json object to retain only a dictionary of skills
        skills_dict = {}
        for skill in json_text['skills']:
            if('count' in skill):
                skills_dict[skill['title']] = int(skill['count']) + 1
            else:
                skills_dict[skill['title']] = 1
                
        # Convert all the skills to lowercase (NodeJS == nodejs)
        skills_dict = {k.lower() : v for k,v in skills_dict.items()}
         
        # Remove all whitespaces (node js == nodejs)
        skills_dict = {k.translate({32:None}) : v for k,v in skills_dict.items()}
        
        # Remove all hyphens (tcp-ip == tcpip)
        skills_dict = {k.translate({45:None}) : v for k,v in skills_dict.items()}
        
        # Remove all dots (node.js == nodejs)
        skills_dict = {k.translate({46:None}) : v for k,v in skills_dict.items()}
        
        # Keep only acronyms (amazonwebservices(aws) === aws)
        skills_dict = {keepShorter(k) : (v + (skills_dict[keepShorter(k)] if keepShorter(k) in skills_dict else 0)) for k,v in skills_dict.items()}
        
        # Add the processed skills to data frame
        skills = skills_dict
                
        jsons_data.loc[index] = [name, is_intern, positions, educations, skills]
        
print(jsons_data)

                       name is_intern  \
0              Saumya          1   
1               Sreya          0   
2             Maithili          0   
3               Vaidehi          0   
4              Amruta          1   
5            Sweety          1   
6                 Kriti          1   
7                Riti          1   
8         Apoorv          0   
9               Pooja          0   
10       Anusha          0   
11            Rajas          1   
12            Nikita          0   
13             Varsha          0   
14          Akanksha          0   
15      Chandni          0   
16            Poonam          0   
17          Kunal          1   
18              Mayur          1   
19          Nandini          1   
20                Yash          1   
21              Neha          1   
22       Bhaumik          0   
23           Diksha          0   
24        Nayana          1   
25      Deepika          0   
26          Rachel          0   
27             Supriya          1

### Data frame preparation - Keep profiles that are interns

In [161]:
# Filtering the dataframe to retain only the target position - 'Intern'
intern_df     = jsons_data[(jsons_data['is_intern'] == 1)]

# Adding the targetuser to dataframe
target_df     = jsons_data[(jsons_data['name'] == "Prathamesh Karve")]

prediction_df = pd.concat([intern_df, target_df])

skills_df     = prediction_df['skills'].apply(pd.Series)
skills_df['name'] = jsons_data['name']

## Analysis - Collaborative filtering

### Get similar profiles

In [163]:
N = 10
s1 = skills_df.rank(axis=1, method='max', ascending=False)
s2 = skills_df.mask(skills_df.rank(axis=1, method='max', ascending=False) > N, 0)
s2 = s2.fillna(0)
s2 = s2.loc[:, (s2 != 0).any(axis=0)]

for col in s2.columns:
    if len(s2[col].unique()) == 2:
        s2.drop(col,inplace=True, axis=1)
    elif s2[col][57] == 0:
        s2.drop(col,inplace=True, axis=1)
        
import numpy as np
s2 = s2.loc[np.sum(s2.iloc[:,:-1]>0, axis=1)[lambda x: x>=3].index]
s2['name'] = jsons_data['name']

similar_profiles = s2['name'].tolist()

print("Profiles similar to target user Prathamesh Karve are")
for name in similar_profiles:
    print("\n", name)

Profiles similar to target user Prathamesh Karve are

 Saumya 

 Amruta 

 Sweety 

 Riti 

 Neha 

 Nayana 

 Priyanka 

 Jacky C.

 Saket 

 Anoop 

 Ashish 

 Prathamesh 


### Recommend skills based on similar profiles

In [164]:
s3 = skills_df[~skills_df.index.isin(s2.index)]
for col in s3.columns:
    if col in s2.columns:
        s3.drop(col,inplace=True,axis=1)

s4 = s3.sum()

answer = s4.nlargest(n=10)
answer

datastructures     52.0
nodejs             48.0
javascript         46.0
microsoftoffice    44.0
corejava           42.0
mysql              38.0
reactjs            38.0
sql                32.0
mongodb            30.0
linux              28.0
dtype: float64