In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
sns.set(style="whitegrid")

In [None]:
# Load the job-listings CSV; its first column is used as the row index.
DATA_PATH = '/content/naukari_analysis_data.csv'
df = pd.read_csv(DATA_PATH, index_col=0)
df.head()

Unnamed: 0,ratings,ReviewsCount,companyName,job_post_history,URL,jobId,experience,salary,location,tagsAndSkills,title
0,3.7,2578.0,Walmart,5 Days Ago,https://www.naukri.com/job-listings-data-scientist-iii-walmart-labs-bengaluru-1-to-7-years-160425504018,160425504018,1-7 Yrs,Not disclosed,Bengaluru,"Publishing,Networking,data science,Artificial Intelligence,Machine learning,SEZ,Forecasting,Information technology",DATA SCIENTIST III
1,3.7,44273.0,Capgemini,4 Days Ago,https://www.naukri.com/job-listings-data-scientist-capgemini-technology-services-india-limited-bengaluru-4-to-7-years-180425912334,180425912334,4-7 Yrs,Not disclosed,Bengaluru,"python,software development,software design,javascript,java,c++,project management,c",Data Scientist
2,3.4,11401.0,EY,5 Days Ago,https://www.naukri.com/job-listings-data-management-data-scientist-ernst-young-bengaluru-3-to-4-years-160425507171,160425507171,3-4 Yrs,Not disclosed,Bengaluru,"Computer science,Data analysis,ERP,Assurance,Data management,Analytical,Consulting,Machine learning",Data Management - Data Scientist
3,4.2,704.0,Target,5 Days Ago,https://www.naukri.com/job-listings-sr-data-scientist-advanced-machine-learning-target-corporation-india-pvt-ltd-bengaluru-3-to-4-years-160425503989,160425503989,3-4 Yrs,Not disclosed,Bengaluru,"Supply chain,Product engineering,Data analysis,Analytical,Machine learning,Data structures,Troubleshooting,SQL",Sr Data Scientist - Advanced Machine Learning
4,3.7,55416.0,Wipro,6 Days Ago,https://www.naukri.com/job-listings-data-scientist-l3-wipro-limited-bengaluru-3-to-6-years-160425914243,160425914243,3-6 Yrs,Not disclosed,Bengaluru,"data analysis,machine learning,deep learning,data science,ml,python,natural language processing,scikit-learn",Data Scientist - L3


In [None]:
# Give the tags/skills column a shorter name; rebinding keeps the cell free of in-place mutation.
df = df.rename(columns={"tagsAndSkills": "skills"})

In [None]:
# Peek at the postings whose experience band is exactly "0-1 Yrs".
is_entry_level = df["experience"].eq('0-1 Yrs')
df.loc[is_entry_level, "experience"]

Unnamed: 0,experience
221,0-1 Yrs
226,0-1 Yrs
227,0-1 Yrs
231,0-1 Yrs
241,0-1 Yrs
...,...
4931,0-1 Yrs
4932,0-1 Yrs
4961,0-1 Yrs
4963,0-1 Yrs


In [None]:
# Column dtypes and non-null counts; columns with fewer non-null entries than rows need imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5184 entries, 0 to 5183
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ratings           3955 non-null   float64
 1   ReviewsCount      3955 non-null   float64
 2   companyName       5183 non-null   object 
 3   job_post_history  5184 non-null   object 
 4   URL               5184 non-null   object 
 5   jobId             5184 non-null   int64  
 6   experience        5184 non-null   object 
 7   salary            5184 non-null   object 
 8   location          5184 non-null   object 
 9   skills            5113 non-null   object 
 10  title             5184 non-null   object 
dtypes: float64(2), int64(1), object(8)
memory usage: 486.0+ KB


In [None]:
# Summary statistics for the numeric columns.
# NOTE(review): jobId looks like an identifier, so its statistics are presumably not meaningful.
df.describe()

Unnamed: 0,ratings,ReviewsCount,jobId
count,3955.0,3955.0,5184.0
mean,3.714943,7117.995702,156852900000.0
std,0.548582,16180.694659,79081010000.0
min,1.0,2.0,10217500000.0
25%,3.5,21.0,90425920000.0
50%,3.8,264.0,160425500000.0
75%,4.0,2578.0,220425000000.0
max,5.0,94217.0,311224500000.0


## Pre-processing

In [None]:
# Count rows that are exact duplicates of an earlier row across all columns.
df.duplicated().sum()

np.int64(0)

As you can see, there are no duplicate rows in the dataset.

In [None]:
# Per-column null counts and their percentage of all rows, most-affected columns first.
null_counts = df.isnull().sum()
missing_values = pd.DataFrame(
    {
        'columns': df.columns,
        "missing_values": null_counts,
        "%_0f_missingvalues": (null_counts / len(df)) * 100,
    }
).reset_index(drop=True)
missing_values.sort_values(by="%_0f_missingvalues", ascending=False)

Unnamed: 0,columns,missing_values,%_0f_missingvalues
0,ratings,1229,23.707562
1,ReviewsCount,1229,23.707562
9,skills,71,1.369599
2,companyName,1,0.01929
3,job_post_history,0,0.0
5,jobId,0,0.0
4,URL,0,0.0
6,experience,0,0.0
7,salary,0,0.0
8,location,0,0.0


Approximately 24% of the values are missing in the `ratings` and `ReviewsCount` columns, while the `skills` (originally `tagsAndSkills`) and `companyName` columns have very few missing values.

In [None]:
# The missing-value table shows a single row without a company name; it is filled with a manually looked-up value.
# Caution: fillna applies this name to every missing entry, so re-check if the input file changes.
df["companyName"]=df["companyName"].fillna("Morepen Laboratories")

The single missing company name was imputed manually, using the company information extracted from the job URL (e.g., via the logo or HTML tags on the listing page).

In [None]:
# Fill the remaining gaps in one pass, with a column-specific replacement for each.
fill_values = {
    "ratings": df["ratings"].median(),  # typical rating for companies without one
    "ReviewsCount": 0,                  # no reviews recorded
    "skills": "unknown",                # skills not specified
}
df = df.fillna(fill_values)

Missing values in `ratings` are replaced with the column median, missing values in `ReviewsCount` are replaced with 0 to indicate no reviews, and missing values in `skills` are replaced with "unknown" to mark unspecified skills. This keeps the dataset complete for analysis.

## Content-based recommender systems


One popular technique of recommendation systems is content-based filtering. Content here refers to the content or attributes of the products you like. So, the idea in content-based filtering is to tag products using certain keywords, understand what the user likes, look up those keywords in the database and recommend different products with the same attributes.

Now, let's combine the columns `companyName`, `experience`, `salary`, `skills` and `title` into a corpus and perform vectorization.

Before that we need to perform some preprocessing.

In [None]:
# Work on a copy so the list-valued columns built below leave df (used later for the results) unchanged.
df1=df.copy()

In [None]:
# Turn each text column into a list of comma-separated pieces so the columns can be concatenated later.
for column in ("skills", "title", "experience", "salary", "companyName"):
    df1[column] = df1[column].map(lambda text: text.split(","))


In [None]:
# Show full cell contents (no truncation) so the token lists are readable.
# Note: this changes the pandas display option for the rest of the session.
pd.set_option('display.max_colwidth', None)
df1[["skills","title","experience","salary","companyName"]]

Unnamed: 0,skills,title,experience,salary,companyName
0,"[Publishing, Networking, data science, Artificial Intelligence, Machine learning, SEZ, Forecasting, Information technology]",[DATA SCIENTIST III],[1-7 Yrs],[Not disclosed],[Walmart]
1,"[python, software development, software design, javascript, java, c++, project management, c]",[Data Scientist],[4-7 Yrs],[Not disclosed],[Capgemini]
2,"[Computer science, Data analysis, ERP, Assurance, Data management, Analytical, Consulting, Machine learning]",[Data Management - Data Scientist],[3-4 Yrs],[Not disclosed],[EY]
3,"[Supply chain, Product engineering, Data analysis, Analytical, Machine learning, Data structures, Troubleshooting, SQL]",[Sr Data Scientist - Advanced Machine Learning],[3-4 Yrs],[Not disclosed],[Target]
4,"[data analysis, machine learning, deep learning, data science, ml, python, natural language processing, scikit-learn]",[Data Scientist - L3],[3-6 Yrs],[Not disclosed],[Wipro]
...,...,...,...,...,...
5179,"[deep learning, continuous integration, NoSQL, GCP, Machine learning, Cloud, Data processing, Natural language processing]",[ML & AI Engineer - GJT],[1-4 Yrs],[Not disclosed],[Getinz Techno Services]
5180,"[python, data analysis, hypothesis testing, natural language processing, scikit-learn, presentation skills, machine learning, artificial intelligence]",[AI-ML Lead Engineer],[3-8 Yrs],[Not disclosed],[Eximietas Design]
5181,"[python, data analysis, hypothesis testing, natural language processing, scikit-learn, presentation skills, machine learning, artificial intelligence]",[AI-ML Engineer],[2-5 Yrs],[Not disclosed],[Eximietas Design]
5182,"[Training, deep learning, NoSQL, Machine learning, power bi, Deployment, Oracle, Analytics]",[AI / ML Engineer],[2-5 Yrs],[Not disclosed],[Highpoints Technologies India]


In [None]:
# Collapse multi-word entries into single tokens (e.g. "data science" -> "datascience")
# so each skill, experience band and salary band is treated as one term.
for column in ("skills", "experience", "salary"):
    df1[column] = df1[column].map(
        lambda pieces: [piece.replace(" ", "") for piece in pieces]
    )

In [None]:
# Concatenate the per-column token lists into one list per posting.
# NOTE(review): `location` is not included here, so a location given to the
# recommender only matches when the place name also appears in one of these columns.
df1["corpus"] = (
    df1["companyName"]
    + df1["experience"]
    + df1["salary"]
    + df1["skills"]
    + df1["title"]
)

# Take an explicit copy: later cells assign to new_df["corpus"], and assigning into a
# column subset of df1 raises pandas' SettingWithCopyWarning (currently hidden only by
# the global warnings filter).
new_df = df1[["jobId", "title", "corpus"]].copy()
new_df.head()

Unnamed: 0,jobId,title,corpus
0,160425504018,[DATA SCIENTIST III],"[Walmart, 1-7Yrs, Notdisclosed, Publishing, Networking, datascience, ArtificialIntelligence, Machinelearning, SEZ, Forecasting, Informationtechnology, DATA SCIENTIST III]"
1,180425912334,[Data Scientist],"[Capgemini, 4-7Yrs, Notdisclosed, python, softwaredevelopment, softwaredesign, javascript, java, c++, projectmanagement, c, Data Scientist]"
2,160425507171,[Data Management - Data Scientist],"[EY, 3-4Yrs, Notdisclosed, Computerscience, Dataanalysis, ERP, Assurance, Datamanagement, Analytical, Consulting, Machinelearning, Data Management - Data Scientist]"
3,160425503989,[Sr Data Scientist - Advanced Machine Learning],"[Target, 3-4Yrs, Notdisclosed, Supplychain, Productengineering, Dataanalysis, Analytical, Machinelearning, Datastructures, Troubleshooting, SQL, Sr Data Scientist - Advanced Machine Learning]"
4,160425914243,[Data Scientist - L3],"[Wipro, 3-6Yrs, Notdisclosed, dataanalysis, machinelearning, deeplearning, datascience, ml, python, naturallanguageprocessing, scikit-learn, Data Scientist - L3]"


Convert each corpus token list into a single space-separated string.

In [None]:
# Join each posting's token list with single spaces.
new_df["corpus"] = df1["corpus"].map(" ".join)

In [None]:
# Check that corpus now holds one string per posting.
new_df.head()

Unnamed: 0,jobId,title,corpus
0,160425504018,[DATA SCIENTIST III],Walmart 1-7Yrs Notdisclosed Publishing Networking datascience ArtificialIntelligence Machinelearning SEZ Forecasting Informationtechnology DATA SCIENTIST III
1,180425912334,[Data Scientist],Capgemini 4-7Yrs Notdisclosed python softwaredevelopment softwaredesign javascript java c++ projectmanagement c Data Scientist
2,160425507171,[Data Management - Data Scientist],EY 3-4Yrs Notdisclosed Computerscience Dataanalysis ERP Assurance Datamanagement Analytical Consulting Machinelearning Data Management - Data Scientist
3,160425503989,[Sr Data Scientist - Advanced Machine Learning],Target 3-4Yrs Notdisclosed Supplychain Productengineering Dataanalysis Analytical Machinelearning Datastructures Troubleshooting SQL Sr Data Scientist - Advanced Machine Learning
4,160425914243,[Data Scientist - L3],Wipro 3-6Yrs Notdisclosed dataanalysis machinelearning deeplearning datascience ml python naturallanguageprocessing scikit-learn Data Scientist - L3


## Tokenization



In [None]:
!pip install nltk
import nltk
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
from nltk.tokenize import word_tokenize



In [None]:
def tokenize_text(text):
    """Split `text` into tokens with NLTK's `word_tokenize` and return the result."""
    return word_tokenize(text)

In [None]:
# Replace each corpus string with its list of NLTK word tokens, then preview the result.
tokenized_corpus = new_df["corpus"].map(word_tokenize)
new_df["corpus"] = tokenized_corpus
new_df["corpus"].head()

Unnamed: 0,corpus
0,"[Walmart, 1-7Yrs, Notdisclosed, Publishing, Networking, datascience, ArtificialIntelligence, Machinelearning, SEZ, Forecasting, Informationtechnology, DATA, SCIENTIST, III]"
1,"[Capgemini, 4-7Yrs, Notdisclosed, python, softwaredevelopment, softwaredesign, javascript, java, c++, projectmanagement, c, Data, Scientist]"
2,"[EY, 3-4Yrs, Notdisclosed, Computerscience, Dataanalysis, ERP, Assurance, Datamanagement, Analytical, Consulting, Machinelearning, Data, Management, -, Data, Scientist]"
3,"[Target, 3-4Yrs, Notdisclosed, Supplychain, Productengineering, Dataanalysis, Analytical, Machinelearning, Datastructures, Troubleshooting, SQL, Sr, Data, Scientist, -, Advanced, Machine, Learning]"
4,"[Wipro, 3-6Yrs, Notdisclosed, dataanalysis, machinelearning, deeplearning, datascience, ml, python, naturallanguageprocessing, scikit-learn, Data, Scientist, -, L3]"


In [None]:
# Glue the tokens back into one space-separated string per posting for the vectorizer.
new_df["corpus"] = new_df["corpus"].map(" ".join)

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Configure TF-IDF: lower-case the text, strip accents, drop English stop words
# and keep at most 2500 terms.
# NOTE(review): the vectorizer's default token pattern keeps only runs of two or more
# word characters, so pieces such as "c", "c++" or the "0-" of "0-1Yrs" presumably never
# become features (the vocabulary shows '0yrs', '000pa') — confirm this is intended.
tfidf_vectorizer = TfidfVectorizer(
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    max_features=2500,
)

# Learn the vocabulary and encode every posting in a single step.
tfidf_matrix = tfidf_vectorizer.fit_transform(new_df['corpus'])

In [None]:
# (number of postings, number of TF-IDF terms); the second value is capped by max_features.
tfidf_matrix.shape

(5184, 2500)

In [None]:
# Vocabulary terms learned by the vectorizer, one per column of tfidf_matrix.
feature_names = tfidf_vectorizer.get_feature_names_out()
feature_names

array(['000', '000pa', '0yrs', ..., 'zessta', 'zeta', 'zetwerk'],
      dtype=object)

### Cosine_Similarity

In [None]:

from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all postings (one row and one column per posting).
cosine_sim = cosine_similarity(X=tfidf_matrix)

# Similarities of the first posting to every posting; entry 0 is its similarity to itself.
cosine_sim[0, :]

array([1.        , 0.09929749, 0.07375612, ..., 0.05909898, 0.02542797,
       0.02270232])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Ensure you have the same TF-IDF vectorizer and matrix used during training
vectorizer = tfidf_vectorizer  # TF-IDF vectorizer used earlier

def _compact_field(text):
    """Normalise a free-text field the same way the corpus columns were normalised.

    When the corpus was built, `skills`, `experience` and `salary` were split on
    commas and the spaces inside every piece were removed (e.g. "machine learning"
    became "machinelearning" and "0-1 Yrs" became "0-1Yrs"). Applying the same
    steps to the query lets it match those tokens. `None` is treated as empty.
    """
    return " ".join(piece.replace(" ", "") for piece in str(text or "").split(","))


def get_recommendations(title="", location="", experience="", skills="", salary="", top_n=5):
    """Return up to `top_n` job listings ranked by similarity to the given preferences.

    The inputs are merged into one query string, encoded with the fitted TF-IDF
    vectorizer and compared with every posting by cosine similarity.

    Parameters
    ----------
    title, location : str
        Used as typed (titles keep their spaces in the corpus).
    experience, skills, salary : str
        Comma-separated text; spaces inside each piece are removed so that
        "0-1 Yrs" or "machine learning" match the corpus tokens.
    top_n : int
        Maximum number of listings to return. 0 yields an empty frame.

    Returns
    -------
    pandas.DataFrame
        Rows of `df` in descending order of `similarity_score`. Listings whose
        score is 0 share no term with the query and are left out, so fewer than
        `top_n` rows (or none) may be returned.

    Raises
    ------
    ValueError
        If `top_n` is negative.

    NOTE(review): `location` is not one of the columns combined into the corpus,
    so it only affects the score when the place name also appears in a title,
    company name or skill tag.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    output_columns = [
        'title', 'companyName', 'ratings', 'experience', 'salary', 'location',
        'job_post_history', 'URL', 'skills', 'similarity_score',
    ]

    # Build the query with the same normalisation that was applied to the corpus.
    query_parts = [
        title or "",
        location or "",
        _compact_field(experience),
        _compact_field(skills),
        _compact_field(salary),
    ]
    input_combined = " ".join(query_parts).lower().strip()

    # Vectorize the query with the already-fitted TF-IDF vectorizer.
    input_vec = vectorizer.transform([input_combined])

    # Cosine similarity between the query and every posting.
    similarity_scores = cosine_similarity(input_vec, tfidf_matrix).flatten()

    ranked = similarity_scores.argsort()
    # `ranked[-0:]` would select every row, so zero is handled explicitly.
    top_indices = ranked[-top_n:][::-1] if top_n > 0 else ranked[:0]

    # A score of 0 means no shared vocabulary; such rows are not recommendations.
    top_indices = top_indices[similarity_scores[top_indices] > 0]

    result_df = df.iloc[top_indices].copy()
    result_df['similarity_score'] = similarity_scores[top_indices]

    return result_df[output_columns]


In [None]:
# Example query: entry-level data-analyst roles in Hyderabad, no skill or salary filter.
recommendations = get_recommendations(
    title="data analyst",
    location="hyderabad",
    experience="0-1Yrs",
    skills="",
    salary="",
    top_n=5,
)
recommendations

Unnamed: 0,title,companyName,ratings,experience,salary,location,job_post_history,URL,skills,similarity_score
2288,Business Analyst,Radicalstart Infolab,3.9,0-1 Yrs,Not disclosed,Madurai,30+ Days Ago,https://www.naukri.com/job-listings-business-analyst-radicalstart-infolab-madurai-0-to-1-years-040325502761,"Business Analyst,Business analysis",0.37907
2268,Business Analyst,Aliengena Captive,2.7,0-1 Yrs,Not disclosed,Pune,30+ Days Ago,https://www.naukri.com/job-listings-business-analyst-aliengena-captive-private-limited-pune-0-to-1-years-190624501048,"Business Analyst,Business analysis",0.37907
612,Data Scientist,Ancla Consultancy Services India,3.5,6-12 Yrs,Not disclosed,Hyderabad,30+ Days Ago,https://www.naukri.com/job-listings-data-scientist-ancla-consultancy-services-india-pvt-ltd-hyderabad-6-to-12-years-080420500680,"Location: Hyderabad,Data Scientist,Experience: 6-12 yrs",0.338903
4020,Software Engineer,Icare Academy Hyderabad,1.7,0-2 Yrs,Not disclosed,Hyderabad,30+ Days Ago,https://www.naukri.com/job-listings-software-engineer-icare-academy-hyderabad-0-to-2-years-031224506471,"Telecom,Automation,Software QA,Software development,Focus,Agile,Python,Software engineering",0.31143
1255,Data Analyst - Hyderabad - F2F - 5th April - Interview,People Staffing,3.8,8-13 Yrs,8-18 Lacs PA,Hyderabad,19 Days Ago,https://www.naukri.com/job-listings-data-analyst-hyderabad-f2f-5th-april-interview-people-staffing-hyderabad-8-to-13-years-030425005834,"Data Analysis,Data Visualization,Advance Sql,Data Analytics,Python,Business Intelligence,Google Analytics,Power Bi",0.293579


In [None]:
# Check how many of the returned postings are actually in the requested location.
recommendations["location"].value_counts().head(10)

Unnamed: 0_level_0,count
location,Unnamed: 1_level_1
Hyderabad,3
Madurai,1
Pune,1


In [None]:
# Optional export of the fitted artefacts for use outside this notebook.
# Disabled by default so that running the notebook writes no files; set to True to export.
SAVE_ARTIFACTS = False

if SAVE_ARTIFACTS:
    import pickle

    # Note: cosine_sim is a dense postings-by-postings matrix, so its pickle is large.
    artifacts = {
        'df_dict.pkl': df.to_dict(),
        'cosine_sim.pkl': cosine_sim,
        'tfidf_vectorizer.pkl': tfidf_vectorizer,
        'tfidf_matrix.pkl': tfidf_matrix,
    }
    for file_name, artifact in artifacts.items():
        # `with` closes each file handle even if pickling fails part-way.
        with open(file_name, 'wb') as handle:
            pickle.dump(artifact, handle)
