<a href="https://colab.research.google.com/github/Prince125047/job_recommendation/blob/main/dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Load dataset (if not loaded already)
df = pd.read_csv("jobs.csv")

# Display basic info
df.info()  # Check column names, data types, and missing values
df.head()  # Show first few rows
df.describe()  # Summary statistics


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27010 entries, 0 to 27009
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               27010 non-null  int64 
 1   Job Salary               27010 non-null  object
 2   Job Experience Required  27010 non-null  object
 3   Key Skills               27010 non-null  object
 4   Role Category            27010 non-null  object
 5   Functional Area          27010 non-null  object
 6   Industry                 27010 non-null  object
 7   Job Title                27010 non-null  object
dtypes: int64(1), object(7)
memory usage: 1.6+ MB


Unnamed: 0.1,Unnamed: 0
count,27010.0
mean,14973.19726
std,8661.925267
min,0.0
25%,7474.25
50%,14913.5
75%,22476.5
max,29999.0


In [2]:
print(df.isnull().sum())  # Double-check missing values
print(df.head())  # Preview data to see inconsistencies

Unnamed: 0                 0
Job Salary                 0
Job Experience Required    0
Key Skills                 0
Role Category              0
Functional Area            0
Industry                   0
Job Title                  0
dtype: int64
   Unnamed: 0                    Job Salary Job Experience Required  \
0           0   Not Disclosed by Recruiter               5 - 10 yrs   
1           1   Not Disclosed by Recruiter                2 - 5 yrs   
2           2   Not Disclosed by Recruiter                0 - 1 yrs   
3           3       2,00,000 - 4,00,000 PA.               0 - 5 yrs   
4           4   Not Disclosed by Recruiter                2 - 5 yrs   

                                          Key Skills  \
0                      Media Planning| Digital Media   
1   pre sales| closing| software knowledge| clien...   
2   Computer science| Fabrication| Quality check|...   
3                                  Technical Support   
4   manual testing| test engineering| test cases

In [3]:
import re

def extract_salary(salary):
    match = re.findall(r"\d+", salary)  # Extract numeric values
    return (int(match[0]) + int(match[-1])) / 2 if match else None  # Take avg if range given

df["Job Salary"] = df["Job Salary"].apply(lambda x: extract_salary(str(x)))
df = df.dropna(subset=["Job Salary"])  # Drop rows where salary extraction failed


In [4]:
df["Key Skills"] = df["Key Skills"].apply(lambda x: x.split("|") if isinstance(x, str) else [])

In [5]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["Role Category Encoded"] = le.fit_transform(df["Role Category"])
df["Functional Area Encoded"] = le.fit_transform(df["Functional Area"])
df["Industry Encoded"] = le.fit_transform(df["Industry"])


In [6]:
from collections import Counter

all_skills = [skill for sublist in df["Key Skills"] for skill in sublist]
skill_counts = Counter(all_skills)
print(skill_counts.most_common(10))  # Top 10 most required skills


[(' sales', 331), (' bpo', 327), (' Sales', 293), (' international bpo', 287), (' customer service', 277), (' voice process', 268), (' fresher', 245), (' business development', 242), (' Communication Skills', 211), (' Javascript', 199)]


In [7]:
print(df["Role Category"].value_counts())
print(df["Industry"].value_counts())

Role Category
Programming & Design                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

In [12]:
import pandas as pd

# Load dataset
df = pd.read_csv("jobs.csv")

# Creating Resume Parsing Dataset
resume_data = df[['Job Title', 'Key Skills']]
resume_data.to_csv('resume_data.csv', index=False)

# Creating Job Recommendation Dataset
job_recommendation_data = df[['Job Title', 'Role Category', 'Functional Area', 'Industry']]
job_recommendation_data.to_csv('job_recommendation_data.csv', index=False)

# Creating Skill Gap Analysis Dataset
skill_gap_data = df[['Job Title', 'Key Skills', 'Job Experience Required']]
skill_gap_data.to_csv('skill_gap_data.csv', index=False)

print("Datasets saved successfully! 🎯")

Datasets saved successfully! 🎯


In [13]:
import os
print(os.listdir())  # Lists all files in the current directory

['.config', '.ipynb_checkpoints', 'jobs.csv', 'skill_gap_data.csv', 'job_recommendation_data.csv', 'resume_data.csv', 'sample_data']


In [14]:
from google.colab import files
files.download("resume_data.csv")
files.download("job_recommendation_data.csv")
files.download("skill_gap_data.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>