In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Inject CSS that sets elements with the `.container` class to 90% width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style> .container {width:90% !important}</style>"))

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import tensorflow
import tensorflow_hub as hub
import copy
import pickle

In [None]:
df = pd.read_csv('naukri_datasets/naukri_com-job_sample.csv')
print('Shape of the dataframe:', df.shape, '\n')
print('Columns:', df.columns, '\n')

Shape of the dataframe: (22000, 14) 

Columns: Index(['company', 'education', 'experience', 'industry', 'jobdescription',
       'jobid', 'joblocation_address', 'jobtitle', 'numberofpositions',
       'payrate', 'postdate', 'site_name', 'skills', 'uniq_id'],
      dtype='object') 



In [None]:
print('Head of the data:')
df.head()

Head of the data:


Unnamed: 0,company,education,experience,industry,jobdescription,jobid,joblocation_address,jobtitle,numberofpositions,payrate,postdate,site_name,skills,uniq_id
0,MM Media Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,0 - 1 yrs,Media / Entertainment / Internet,Job Description Send me Jobs like this Quali...,210516002263,Chennai,Walkin Data Entry Operator (night Shift),,"1,50,000 - 2,25,000 P.A",2016-05-21 19:30:00 +0000,,ITES,43b19632647068535437c774b6ca6cf8
1,find live infotech,UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...,0 - 0 yrs,Advertising / PR / MR / Event Management,Job Description Send me Jobs like this Quali...,210516002391,Chennai,Work Based Onhome Based Part Time.,60.0,"1,50,000 - 2,50,000 P.A. 20000",2016-05-21 19:30:00 +0000,,Marketing,d4c72325e57f89f364812b5ed5a795f0
2,Softtech Career Infosystem Pvt. Ltd,UG: Any Graduate - Any Specialization PG:Any P...,4 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this - as ...,101016900534,Bengaluru,Pl/sql Developer - SQL,,Not Disclosed by Recruiter,2016-10-13 16:20:55 +0000,,IT Software - Application Programming,c47df6f4cfdf5b46f1fd713ba61b9eba
3,Onboard HRServices LLP,UG: Any Graduate - Any Specialization PG:CA Do...,11 - 15 yrs,Banking / Financial Services / Broking,Job Description Send me Jobs like this - Inv...,81016900536,"Mumbai, Bengaluru, Kolkata, Chennai, Coimbator...",Manager/ad/partner - Indirect Tax - CA,,Not Disclosed by Recruiter,2016-10-13 16:20:55 +0000,,Accounts,115d28f140f694dd1cc61c53d03c66ae
4,Spire Technologies and Solutions Pvt. Ltd.,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,6 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this Pleas...,120916002122,Bengaluru,JAVA Technical Lead (6-8 yrs) -,4.0,Not Disclosed by Recruiter,2016-10-13 16:20:55 +0000,,IT Software - Application Programming,a12553fc03bc7bcced8b1bb8963f97b4


### Check for null values

In [None]:
print('Null values check...')
df.info()

Null values check...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22000 entries, 0 to 21999
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   company              21996 non-null  object 
 1   education            20004 non-null  object 
 2   experience           21996 non-null  object 
 3   industry             21995 non-null  object 
 4   jobdescription       21996 non-null  object 
 5   jobid                22000 non-null  int64  
 6   joblocation_address  21499 non-null  object 
 7   jobtitle             22000 non-null  object 
 8   numberofpositions    4464 non-null   float64
 9   payrate              21903 non-null  object 
 10  postdate             21977 non-null  object 
 11  site_name            3987 non-null   object 
 12  skills               21472 non-null  object 
 13  uniq_id              22000 non-null  object 
dtypes: float64(1), int64(1), object(12)
memory usage: 2.3+ MB


### Note:
    - We will consider only jobtitle, jobdescription, industry, and skills columns.
    - Since we are doing recommendation (not real time), we will display only basic and important things on the UI
    
### Observation:
    - jobdescription, industry and skills have null values.
    - jobdescription, however, is an important factor here, so we will drop the rows where jobdescription is null.
    - Null values in industry and skills will be filled with NA.
    
    - Another option is to drop every row that has a null value in skills, industry or jobdescription.
    - If the data loss is small (say, under 5%), dropping those rows is preferable to filling them with NA.

In [None]:
# Keep only the rows where all three text columns are present. `dropna`
# (without `inplace`) returns a new frame, so the raw `df` stays untouched.
required_cols = ['jobdescription', 'skills', 'industry']
dfc = df.dropna(subset=required_cols)

dfc.shape

(21472, 14)

### Renaming skills to sub_domain, as the values look more like domains than skills

In [None]:
dfc.rename(columns={'skills':'sub_domain'}, inplace=True)

In [None]:
dfc = dfc[['jobtitle', 'jobdescription', 'sub_domain', 'industry']]
dfc.head()

Unnamed: 0,jobtitle,jobdescription,sub_domain,industry
0,Walkin Data Entry Operator (night Shift),Job Description Send me Jobs like this Quali...,ITES,Media / Entertainment / Internet
1,Work Based Onhome Based Part Time.,Job Description Send me Jobs like this Quali...,Marketing,Advertising / PR / MR / Event Management
2,Pl/sql Developer - SQL,Job Description Send me Jobs like this - as ...,IT Software - Application Programming,IT-Software / Software Services
3,Manager/ad/partner - Indirect Tax - CA,Job Description Send me Jobs like this - Inv...,Accounts,Banking / Financial Services / Broking
4,JAVA Technical Lead (6-8 yrs) -,Job Description Send me Jobs like this Pleas...,IT Software - Application Programming,IT-Software / Software Services


### Value counts for skills and industry

In [None]:
dfc['sub_domain'].value_counts()

IT Software - Application Programming      5989
Sales                                      2893
ITES                                       1640
Teaching                                   1091
HR                                          928
Marketing                                   868
Accounts                                    860
Production                                  667
Medical                                     418
Financial Services                          413
IT Software - Other                         407
Engineering Design                          382
IT Software - Network Administration        372
IT Software - ERP                           354
IT Software - QA & Testing                  342
IT Software - eCommerce                     336
Site Engineering                            322
IT Software - DBA                           273
IT Software - Embedded                      262
IT Hardware                                 253
Journalism                              

### Observation:
    - There are around 45 different sub_domain values.
    - We will keep the top 5-6 sub_domain values as is and group the remaining ones under an 'Other' section.
    - All 'IT Software - ...' sub_domain values will be kept under a single 'IT - Software' label instead of separate types of IT Software sub_domain.

### Creating domains column
    - We will consider all IT - Software sub domains as a single domain IT - Software
    - Then keep Sales, ITES, Teaching, HR, Marketing, Accounts, Production, Medical, Financial Services as is.
    - Remaining will be under other section

In [None]:
# Domains that keep their own label; every other sub_domain becomes 'Other'.
specific_domain = ['IT - Software', 'Sales', 'ITES', 'Teaching', 'HR', 'Marketing', 'Accounts', 'Production', 'Medical', 'Financial Services']

# take all IT Software ... sub domains in to the single 'IT - Software' category.
# Vectorised string matching replaces the row-by-row `apply(lambda ...)` passes;
# `regex=False` keeps this a plain substring test, like the original `in` check.
is_it_software = dfc['sub_domain'].str.contains('IT Software', regex=False)
domain = dfc['sub_domain'].mask(is_it_software, 'IT - Software')

# keep specific domains as is and rest will be as other
# (single assignment, so re-running the cell always starts from `sub_domain`)
dfc['domain'] = domain.where(domain.isin(specific_domain), 'Other')

# now check the value counts
dfc['domain'].value_counts()

IT - Software         8971
Sales                 2893
Other                 2723
ITES                  1640
Teaching              1091
HR                     928
Marketing              868
Accounts               860
Production             667
Medical                418
Financial Services     413
Name: domain, dtype: int64

# Final dataframe

In [None]:
# Select the columns shown to the user and give them display-friendly names.
# The steps are chained without `inplace=True`, so `df_final` is a fresh frame
# rather than an in-place-modified column subset of `dfc` (the pattern pandas
# may flag with SettingWithCopyWarning, which this notebook silences).
rename_cols = {'jobtitle':'Job Title', 'jobdescription':'Job Description', 'domain':'Job Domain', 'industry':'Job Industry'}
df_final = (
    dfc[['jobtitle', 'jobdescription', 'domain', 'industry']]
    .rename(columns=rename_cols)
    .reset_index(drop=True)
)

df_final.head()

Unnamed: 0,Job Title,Job Description,Job Domain,Job Industry
0,Walkin Data Entry Operator (night Shift),Job Description Send me Jobs like this Quali...,ITES,Media / Entertainment / Internet
1,Work Based Onhome Based Part Time.,Job Description Send me Jobs like this Quali...,Marketing,Advertising / PR / MR / Event Management
2,Pl/sql Developer - SQL,Job Description Send me Jobs like this - as ...,IT - Software,IT-Software / Software Services
3,Manager/ad/partner - Indirect Tax - CA,Job Description Send me Jobs like this - Inv...,Accounts,Banking / Financial Services / Broking
4,JAVA Technical Lead (6-8 yrs) -,Job Description Send me Jobs like this Pleas...,IT - Software,IT-Software / Software Services


In [None]:
# combine title and description
# A single space separates the two fields so the last word of the title is not
# fused with the first word of the description.

df_final['title_desc'] = df_final['Job Title'] + ' ' + df_final['Job Description']

In [None]:
# Reload the prepared frame from the zip-compressed CSV on disk.
# NOTE(review): the `to_csv` call that writes this file is commented out below,
# so on a fresh run this cell only works if the file already exists at this
# path; re-enable that line to regenerate it.
# df_final.to_csv('naukri_datasets/naukri_jobs_final.csv', index=False, compression='zip')
df_final = pd.read_csv('naukri_datasets/naukri_jobs_final.csv', compression='zip')
df_final.shape

(21472, 5)

# Embedding with Universal Sentence Encoder (batch wise)
    - Embedding with version 5 of universal sentence encoder
    - Embedding batch-wise because RAM crashes if the whole dataset is embedded at once.
    - Batch size could be anything (make sure batch size doesn't crash RAM). I took a batch size of 1.

In [None]:
# Loading universal sentence encoder model
# `hub.load` loads the module at the given TF-Hub URL. The active line uses the
# "large" encoder, version 5; the commented-out line below is the non-large
# version 4 module, kept as an alternative.
use_model = hub.load('https://tfhub.dev/google/universal-sentence-encoder-large/5')
# use_model = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
print('Model loaded...')
# Printing the loaded object just confirms that something was returned.
print(use_model)

Model loaded...
<tensorflow.python.saved_model.load.Loader._recreate_base_user_object.<locals>._UserObject object at 0x7fa581f6a090>


In [None]:
def embed(text):
    """Encode text with the loaded Universal Sentence Encoder.

    Parameters
    ----------
    text : list of str
        Strings to embed; callers in this notebook pass a one-element list.

    Returns
    -------
    numpy.ndarray
        The model output converted to a NumPy array. Callers index it with
        ``[0]`` to get the vector for the first input string.
    """
    encoded = use_model(text)
    return np.array(encoded)

In [None]:
df_final = pd.read_csv('/content/drive/MyDrive/job rec/naukri_jobs_final.csv', compression='zip')
df_final.shape

(21472, 5)

In [None]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21472 entries, 0 to 21471
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Job Title        21472 non-null  object
 1   Job Description  21472 non-null  object
 2   Job Domain       21472 non-null  object
 3   Job Industry     21472 non-null  object
 4   title_desc       21472 non-null  object
dtypes: object(5)
memory usage: 838.9+ KB


In [None]:
# Rebuild a clean positional index, then (re)compute the text that gets
# embedded: job title and job description joined by a single space.
df_final = df_final.reset_index(drop=True).assign(
    title_desc=lambda frame: frame['Job Title'] + ' ' + frame['Job Description']
)

In [None]:
from tqdm import tqdm

In [None]:
# Embed one text per model call (batch size 1), as described in the notes above.
# Iterating over the column directly avoids building a whole-row Series with
# `df_final.iloc[i]` on every iteration just to read a single field.
vector_lis = []
for text in tqdm(df_final['title_desc']):
    vec = embed([text])[0]
    vector_lis.append(vec)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

 88%|████████▊ | 18973/21472 [1:49:58<13:03,  3.19it/s][A[A

 88%|████████▊ | 18974/21472 [1:49:58<13:15,  3.14it/s][A[A

 88%|████████▊ | 18975/21472 [1:49:59<12:18,  3.38it/s][A[A

 88%|████████▊ | 18976/21472 [1:49:59<14:53,  2.79it/s][A[A

 88%|████████▊ | 18977/21472 [1:49:59<13:03,  3.18it/s][A[A

 88%|████████▊ | 18978/21472 [1:50:00<11:58,  3.47it/s][A[A

 88%|████████▊ | 18979/21472 [1:50:00<17:23,  2.39it/s][A[A

 88%|████████▊ | 18980/21472 [1:50:01<19:18,  2.15it/s][A[A

 88%|████████▊ | 18981/21472 [1:50:01<19:11,  2.16it/s][A[A

 88%|████████▊ | 18982/21472 [1:50:02<18:43,  2.22it/s][A[A

 88%|████████▊ | 18983/21472 [1:50:02<16:25,  2.52it/s][A[A

 88%|████████▊ | 18984/21472 [1:50:02<14:08,  2.93it/s][A[A

 88%|████████▊ | 18985/21472 [1:50:02<12:14,  3.38it/s][A[A

 88%|████████▊ | 18986/21472 [1:50:03<13:10,  3.14it/s][A[A

 88%|████████▊ | 18987/21472 [1:50:03<17:07,  2.42it

In [None]:
np.array(vector_lis).shape

(21472, 512)

## Saving vectors data (numpy array in csv file with compression zip)

In [None]:
# create dataframe for job vector
job_vector = pd.DataFrame(np.array(vector_lis))

# saving job vector dataframe and compress it
job_vector.to_csv("/content/drive/MyDrive/job rec/naukri_jobs_vector_v5.csv", header=None, index=None, compression='zip')

In [None]:
job_vector.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511
0,-0.045472,0.07146,-0.030187,-0.04141,0.010985,0.051456,0.069333,0.020661,-0.028116,0.070179,-0.104905,0.002822,0.056707,0.072289,0.034572,-0.083928,0.065649,-0.017233,0.026539,0.080006,-0.057031,-0.064252,-0.030726,0.03235,0.020321,0.091298,0.025289,-0.032306,-0.065613,0.013495,-0.038623,-0.001063,0.046724,0.039486,0.000725,-0.049093,0.041541,-0.026583,-0.058358,-0.042819,...,0.031564,0.014133,0.010509,-0.009359,0.052586,0.033609,0.004692,-0.071564,0.012792,-0.010708,-0.066096,0.024503,-0.034346,0.015208,-0.025565,0.034517,0.032887,-0.046277,-0.066495,-0.00118,-0.020082,-0.045521,-0.008466,-0.065473,-0.047744,-0.030062,0.006022,0.048359,0.007163,-0.089128,-0.031014,0.03785,-0.016142,0.029279,-0.03735,-0.034755,-0.024487,-0.063876,-0.003787,0.010696
1,-0.066249,0.048454,-0.036347,-0.050451,0.005488,0.094829,0.090995,-0.003602,-0.029313,0.048251,-0.106683,0.012071,0.052212,0.072189,0.041739,-0.072966,0.03857,-0.029835,0.039568,0.078571,-0.050963,-0.060248,-0.036994,-0.00027,0.029798,0.098069,-0.004679,-0.040748,-0.049903,0.0092,0.003562,0.017524,0.056128,0.032073,0.019196,-0.006784,0.057913,-0.029252,-0.0354,-0.009253,...,0.041241,-0.000802,-0.01814,-0.03006,0.070185,0.026579,-0.004865,-0.052849,0.022699,-0.010966,-0.06519,0.001666,-0.056327,-0.023744,-0.023202,0.013561,0.002605,-0.023109,-0.036951,-0.017538,-0.019113,-0.003385,-0.036645,-0.069568,-0.015937,-0.032916,0.016556,0.040085,-0.005618,-0.078581,-0.056853,0.048021,-0.019401,0.015078,-0.012775,-0.063182,-0.022239,-0.03175,-0.027886,0.061154
2,-0.017534,0.051128,-0.035889,-0.074988,0.052055,0.084544,0.036064,-0.024822,-0.010642,-0.045348,-0.089839,0.037066,0.060913,0.08836,0.013292,-0.022454,0.000293,-0.024308,0.088728,0.061341,-0.087928,-0.014508,-0.073385,-0.002046,-0.014006,0.061084,0.053846,-0.047498,-0.043743,-0.007631,-0.026759,-0.00499,0.014679,0.044846,-0.007908,-0.026647,0.064466,0.003481,-0.042474,0.033352,...,-0.000802,-0.025376,-0.009469,0.012632,0.055098,0.040305,0.005352,-0.015264,0.069732,-0.003097,-0.050885,-0.012859,0.02685,0.017609,-0.03805,0.040156,0.037243,-0.014652,-0.043909,-0.020636,0.035222,-0.023503,-0.004933,-0.061331,-0.029161,-0.015408,-0.016606,0.065969,-0.019323,-0.045194,-0.080842,-0.078529,-0.041243,0.007653,-0.060213,-0.066841,-0.01641,-0.043206,-0.021493,-0.070218
3,-0.036433,0.052869,-0.007762,-0.070404,0.0294,0.053589,0.010687,-0.064494,-0.047376,0.013945,-0.108124,0.063534,0.052771,0.070192,0.007651,-0.055036,0.058187,-0.032723,-0.036185,0.080419,-0.041873,-0.086401,-0.030435,0.073672,-0.050593,0.060426,0.090007,0.009259,0.01058,0.000205,-0.057469,0.016458,0.033458,0.06172,0.053738,-0.135689,0.059339,0.005411,-0.017738,-0.044591,...,0.018084,-0.022495,-0.062157,-0.078316,0.107284,-0.005163,0.039681,0.012308,0.02666,0.028668,-0.050162,0.025921,0.056229,-0.008191,-0.014741,-0.039638,0.000278,-0.007098,-0.087758,0.01481,0.013771,-0.028846,0.025622,-0.040258,-0.066899,0.022535,0.012395,0.064987,0.055937,-0.081261,-0.051371,-0.031467,0.036373,0.015546,-0.050442,-0.052462,-0.03141,-0.052107,-0.024851,0.007191
4,0.003604,0.033161,-0.017206,-0.024959,0.014022,0.088364,0.082227,-0.045984,0.101115,0.022834,-0.038688,0.001427,0.068914,0.085607,0.002116,-0.008997,0.012808,-0.070114,0.060437,0.010981,-0.071729,-0.026732,-0.08861,-0.013257,-0.032335,0.026026,0.00458,-0.026604,-0.032197,0.021774,-0.007931,0.019682,0.06308,0.056936,-0.022026,-0.029571,0.076589,0.003787,-0.021513,-0.048511,...,0.018231,0.01281,0.016405,0.008941,0.039311,0.059305,0.044622,0.003505,0.046721,-0.043482,-0.106932,-0.064669,-0.045977,0.002331,-0.009228,0.03365,0.05104,0.040971,-0.062948,-0.031599,0.013823,0.012226,0.043889,-0.066501,-0.02108,-0.020106,-0.033303,0.080787,0.021315,-0.035483,-0.064646,0.00931,-0.009706,0.012543,-0.084817,-0.090825,0.003943,-0.034387,-0.026795,-0.017972


## Saving jobs data (csv file with compression zip)
    - and remove unnecessary column title_desc

In [None]:
df_fin = df_final[['Job Title', 'Job Description', 'Job Domain', 'Job Industry']]
df_fin.head()

Unnamed: 0,Job Title,Job Description,Job Domain,Job Industry
0,Walkin Data Entry Operator (night Shift),Job Description Send me Jobs like this Quali...,ITES,Media / Entertainment / Internet
1,Work Based Onhome Based Part Time.,Job Description Send me Jobs like this Quali...,Marketing,Advertising / PR / MR / Event Management
2,Pl/sql Developer - SQL,Job Description Send me Jobs like this - as ...,IT - Software,IT-Software / Software Services
3,Manager/ad/partner - Indirect Tax - CA,Job Description Send me Jobs like this - Inv...,Accounts,Banking / Financial Services / Broking
4,JAVA Technical Lead (6-8 yrs) -,Job Description Send me Jobs like this Pleas...,IT - Software,IT-Software / Software Services


In [None]:
df_fin.to_csv('/content/drive/MyDrive/job rec/naukri_jobs_data.csv', index=None, compression='zip')

# Recommendation

In [None]:
# load jobs vector
jobs_vect_df = pd.read_csv('/content/drive/MyDrive/job rec/naukri_jobs_vector_v5.csv', header=None, compression='zip')

# convert jobs vector dataframe to numpy array
jobs_vect = np.array(jobs_vect_df.to_numpy().tolist())
# jobs_vect = np.round(jobs_vect, 4)
jobs_vect.shape

(21472, 512)

In [None]:
jobs_vect[:2]

array([[-0.04547227,  0.0714604 , -0.03018687, ..., -0.06387586,
        -0.00378734,  0.01069614],
       [-0.066249  ,  0.04845431, -0.03634712, ..., -0.03175002,
        -0.02788561,  0.06115414]])

In [None]:
# load jobs data
jobs_df = pd.read_csv('/content/drive/MyDrive/job rec/naukri_jobs_data.csv', compression='zip')
jobs_df.shape

(21472, 4)

In [None]:
jobs_df.head()

Unnamed: 0,Job Title,Job Description,Job Domain,Job Industry
0,Walkin Data Entry Operator (night Shift),Job Description Send me Jobs like this Quali...,ITES,Media / Entertainment / Internet
1,Work Based Onhome Based Part Time.,Job Description Send me Jobs like this Quali...,Marketing,Advertising / PR / MR / Event Management
2,Pl/sql Developer - SQL,Job Description Send me Jobs like this - as ...,IT - Software,IT-Software / Software Services
3,Manager/ad/partner - Indirect Tax - CA,Job Description Send me Jobs like this - Inv...,Accounts,Banking / Financial Services / Broking
4,JAVA Technical Lead (6-8 yrs) -,Job Description Send me Jobs like this Pleas...,IT - Software,IT-Software / Software Services


In [None]:
def recommend_jobs(dfr, vector, top=5, query=None):
    """Print the jobs whose embeddings best match a free-text query.

    Parameters
    ----------
    dfr : pandas.DataFrame
        Job table with 'Job Title', 'Job Domain', 'Job Industry' and
        'Job Description' columns. Rows are looked up by position
        (``.iloc``), so they are assumed to be in the same order as the
        rows of ``vector``.
    vector : numpy.ndarray
        Job embeddings, assumed to hold one row per job.
    top : int, default 5
        Maximum number of results to print. It is clamped to the number of
        available jobs.
    query : str, optional
        Search text. When omitted, the user is prompted through ``input()``
        (the original interactive behaviour).

    Returns
    -------
    None
        The results are printed.
    """
    if query is None:
        print('Please enter your job query...')
        query = input()
    print('-'*30, '\n')

    if not query.strip():
        print('No query entered; nothing to recommend.')
        return

    # One query string is embedded; taking row 0 of the inner product leaves
    # one similarity score per row of `vector`.
    query_vect = embed([query])
    similarity_scr = np.inner(query_vect, vector)[0]

    # Job positions ordered from highest to lowest similarity.
    idx_list = np.argsort(similarity_scr)[::-1]

    # Never ask for more rows than exist; otherwise indexing idx_list raises
    # IndexError.
    top = max(0, min(top, len(idx_list)))

    print('Showing top {} results'.format(top))
    print('-'*25, '\n')
    for job_idx in idx_list[:top]:
        print('Title:', dfr['Job Title'].iloc[job_idx])
        print('Domain:', dfr['Job Domain'].iloc[job_idx])
        print('Industry:', dfr['Job Industry'].iloc[job_idx])
        print('Description:', dfr['Job Description'].iloc[job_idx])
        # Replaces the old debug print of the raw score matrix.
        print('Score: {:.4f}'.format(similarity_scr[job_idx]))
        print('\n','-'*50, '\n')

In [None]:
recommend_jobs(jobs_df, jobs_vect, 5)

Please enter your job query...
php sql java account
------------------------------ 

[[0.11981332 0.12408762 0.30439122 ... 0.23272319 0.16072418 0.02555727]]
Showing top 5 results
------------------------- 

Title: PHP / My SQL - Junior Programmer
Domain: IT - Software
Industry: Gems / Jewellery
Description: Job Description   Send me Jobs like this Qualifications : PHP 5.x  MySQL 4+ XHTML / HTML 5 CSS / JQuery/Java script / AJAX Proficiency in writing cross-browser coding standards  An in-depth understanding of Object Oriented concepts, data structures and algorithms Responsibilities:  Business logic & requirement Understanding  Excellent coding knowledge with practicing the best in industry coding standards  Basic Testing / trouble shooting skills  Identifying the scope for improvisations / optimizations & implementing  Preferred :   Prior work experience in e commerce portal development is added advantage  Must be an excellent team player  Salary:INR As per Industry standard Industr

In [None]:
jobs_vect

array([[-0.0608, -0.0617,  0.0548, ..., -0.026 , -0.0575, -0.0606],
       [-0.0615, -0.0638,  0.0519, ..., -0.0187, -0.0622, -0.0631],
       [ 0.0314, -0.0552,  0.0571, ...,  0.0127, -0.0528, -0.0543],
       ...,
       [-0.0052, -0.0537,  0.0526, ...,  0.0538, -0.0494, -0.0512],
       [-0.0504, -0.0586,  0.0583, ...,  0.014 , -0.0552, -0.0459],
       [-0.0191, -0.0396, -0.0168, ...,  0.0176, -0.0571, -0.057 ]])