In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the pre-cleaned CSV into the working frame; the path is relative to the
# current working directory, so run the notebook from the project root.
dataset = pd.read_csv("Data/cleaned_df.csv")

In [4]:
# Schema overview: column names, non-null counts and dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1610462 entries, 0 to 1610461
Data columns (total 20 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   Qualifications      1610462 non-null  object 
 1   location            1610462 non-null  object 
 2   Country             1610462 non-null  object 
 3   Work Type           1610462 non-null  object 
 4   Company Size        1610462 non-null  int64  
 5   Job Posting Date    1610462 non-null  object 
 6   Preference          1610462 non-null  object 
 7   Job Title           1610462 non-null  object 
 8   Role                1610462 non-null  object 
 9   Job Description     1610462 non-null  object 
 10  skills              1610462 non-null  object 
 11  Responsibilities    1610462 non-null  object 
 12  Company             1610462 non-null  object 
 13  Company Profile     1610462 non-null  object 
 14  min_experience      1610462 non-null  int64  
 15  max_experience 

In [5]:
# Print the number of distinct (non-null) values for each object-dtype column.
for col in dataset.columns:
    column = dataset[col]
    if column.dtype != "object":
        continue
    distinct_count = column.nunique()
    print(f"{col}: {distinct_count}")

Qualifications: 10
location: 214
Country: 216
Work Type: 5
Job Posting Date: 731
Preference: 3
Job Title: 147
Role: 376
Job Description: 376
skills: 376
Responsibilities: 375
Company: 885
Company Profile: 884


In [6]:
# List the distinct degree codes present in the data.
dataset["Qualifications"].unique()

array(['M.Tech', 'BCA', 'PhD', 'MBA', 'MCA', 'M.Com', 'BBA', 'B.Tech',
       'B.Com', 'BA'], dtype=object)

In [7]:
# Degree codes grouped by academic level.
bachelor = ["BCA", "BBA", "B.Tech", "B.Com", "BA"]
masters = ["MBA", "MCA", "M.Com", "M.Tech"]
phd = ["PhD"]

def map_qualification_to_level(qualifications):
    """Map a single degree code to its academic level.

    Parameters
    ----------
    qualifications : str
        One degree code, e.g. "B.Tech".

    Returns
    -------
    str
        "Bachelor", "Master" or "PhD".

    Raises
    ------
    ValueError
        If the code appears in none of the `bachelor`, `masters` or `phd`
        lists. Unknown or missing values used to be labelled "PhD" silently.
    """
    if qualifications in bachelor:
        return "Bachelor"
    if qualifications in masters:
        return "Master"
    if qualifications in phd:
        return "PhD"
    # Fail loudly rather than inventing a level for an unexpected code.
    raise ValueError(f"Unrecognised qualification: {qualifications!r}")

In [8]:
# Derive the academic level of each row from its degree code.
qualification_level = dataset["Qualifications"].apply(map_qualification_to_level)
dataset["Level"] = qualification_level

# Show the new column next to its source to sanity-check the mapping.
dataset[["Level", "Qualifications"]].head()

Unnamed: 0,Level,Qualifications
0,Master,M.Tech
1,Bachelor,BCA
2,PhD,PhD
3,PhD,PhD
4,Master,MBA


In [9]:
# Bucket edges for "Company Size". The last edge is open-ended so that values
# above the previous hard-coded cap of 10,000,000 fall into 'enterprise'
# instead of becoming NaN.
bins = [0, 10000, 50000, 100000, np.inf]
labels = ['small', 'medium', 'large', 'enterprise']

# right=True with include_lowest=True gives [0, 10000], (10000, 50000],
# (50000, 100000], (100000, inf). Note the new column is "Company_size"
# (underscore); the numeric source column "Company Size" is kept.
dataset["Company_size"] = pd.cut(
    dataset["Company Size"],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

In [10]:
# Re-check the schema now that "Level" and "Company_size" have been added.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1610462 entries, 0 to 1610461
Data columns (total 22 columns):
 #   Column              Non-Null Count    Dtype   
---  ------              --------------    -----   
 0   Qualifications      1610462 non-null  object  
 1   location            1610462 non-null  object  
 2   Country             1610462 non-null  object  
 3   Work Type           1610462 non-null  object  
 4   Company Size        1610462 non-null  int64   
 5   Job Posting Date    1610462 non-null  object  
 6   Preference          1610462 non-null  object  
 7   Job Title           1610462 non-null  object  
 8   Role                1610462 non-null  object  
 9   Job Description     1610462 non-null  object  
 10  skills              1610462 non-null  object  
 11  Responsibilities    1610462 non-null  object  
 12  Company             1610462 non-null  object  
 13  Company Profile     1610462 non-null  object  
 14  min_experience      1610462 non-null  int64   
 15

In [11]:
# Full column list, including the two derived columns.
dataset.columns

Index(['Qualifications', 'location', 'Country', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Job Title', 'Role',
       'Job Description', 'skills', 'Responsibilities', 'Company',
       'Company Profile', 'min_experience', 'max_experience',
       'average_experience', 'min_salary', 'max_salary', 'average_salary',
       'Level', 'Company_size'],
      dtype='object')

In [12]:
# Preview the first eleven columns of the frame.
dataset[['Qualifications', 'location', 'Country', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Job Title', 'Role',
       'Job Description', 'skills']].head()

Unnamed: 0,Qualifications,location,Country,Work Type,Company Size,Job Posting Date,Preference,Job Title,Role,Job Description,skills
0,M.Tech,Douglas,Isle of Man,Intern,26801,2022-04-24,Female,Digital Marketing Specialist,Social Media Manager,Social Media Managers oversee an organizations...,"Social media platforms (e.g., Facebook, Twitte..."
1,BCA,Ashgabat,Turkmenistan,Intern,100340,2022-12-19,Female,Web Developer,Frontend Web Developer,Frontend Web Developers design and implement u...,"HTML, CSS, JavaScript Frontend frameworks (e.g..."
2,PhD,Macao,"Macao SAR, China",Temporary,84525,2022-09-14,Male,Operations Manager,Quality Control Manager,Quality Control Managers establish and enforce...,Quality control processes and methodologies St...
3,PhD,Porto-Novo,Benin,Full-Time,129896,2023-02-25,Female,Network Engineer,Wireless Network Engineer,"Wireless Network Engineers design, implement, ...",Wireless network design and architecture Wi-Fi...
4,MBA,Santiago,Chile,Intern,53944,2022-10-11,Female,Event Manager,Conference Manager,A Conference Manager coordinates and manages c...,Event planning Conference logistics Budget man...


In [13]:
# Columns to remove from the working frame (dropped in the following cell).
drop_cols = ["location", "Country", "Job Posting Date", "Role", "Job Description", "Company", "Company Profile"]

In [14]:
# Rebind instead of mutating in place: `inplace=True` brings no speed benefit
# and hides the state change from anyone reading earlier cells.
dataset = dataset.drop(columns=drop_cols)

In [15]:
# Confirm the drop: 22 columns minus the 7 in drop_cols leaves 15.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1610462 entries, 0 to 1610461
Data columns (total 15 columns):
 #   Column              Non-Null Count    Dtype   
---  ------              --------------    -----   
 0   Qualifications      1610462 non-null  object  
 1   Work Type           1610462 non-null  object  
 2   Company Size        1610462 non-null  int64   
 3   Preference          1610462 non-null  object  
 4   Job Title           1610462 non-null  object  
 5   skills              1610462 non-null  object  
 6   Responsibilities    1610462 non-null  object  
 7   min_experience      1610462 non-null  int64   
 8   max_experience      1610462 non-null  int64   
 9   average_experience  1610462 non-null  float64 
 10  min_salary          1610462 non-null  int64   
 11  max_salary          1610462 non-null  int64   
 12  average_salary      1610462 non-null  float64 
 13  Level               1610462 non-null  object  
 14  Company_size        1610462 non-null  category
dty

In [16]:
# Preview the reduced frame.
dataset.head()

Unnamed: 0,Qualifications,Work Type,Company Size,Preference,Job Title,skills,Responsibilities,min_experience,max_experience,average_experience,min_salary,max_salary,average_salary,Level,Company_size
0,M.Tech,Intern,26801,Female,Digital Marketing Specialist,"Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",5,15,10.0,59000,99000,79000.0,Master,medium
1,BCA,Intern,100340,Female,Web Developer,"HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",2,12,7.0,56000,116000,86000.0,Bachelor,enterprise
2,PhD,Temporary,84525,Male,Operations Manager,Quality control processes and methodologies St...,Establish and enforce quality control standard...,0,12,6.0,61000,104000,82500.0,PhD,large
3,PhD,Full-Time,129896,Female,Network Engineer,Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",4,11,7.5,65000,91000,78000.0,PhD,enterprise
4,MBA,Intern,53944,Female,Event Manager,Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,1,12,6.5,64000,87000,75500.0,Master,large


In [17]:
import re

In [18]:
def clean_and_format_skills(skills):
    """Normalise a raw skills string into comma-delimited text.

    Steps, in order: drop every "(...)" group (non-greedy), collapse each run
    of whitespace to a single space and trim the ends, then replace each
    standalone word "and" with a comma.

    Parameters
    ----------
    skills : str
        Raw skills text for one row.

    Returns
    -------
    str
        The cleaned text; surrounding spaces around inserted commas are kept.
    """
    # Strip parenthesised fragments first, then squeeze whitespace runs.
    squeezed = re.sub(r'\s+', ' ', re.sub(r'\(.*?\)', '', skills))
    # Trim before swapping the conjunction, matching the original step order.
    return re.sub(r'\band\b', ',', squeezed.strip())

# Apply the function to the 'skills' column
# NOTE: this overwrites the raw text in place, and a later statement in this
# cell turns the column into lists, so the cell cannot be re-run without
# reloading `dataset` (re.sub would receive a list).
dataset['skills'] = dataset['skills'].apply(clean_and_format_skills)

# Split a comma-delimited skills string into a list of individual skills
def separate_skills(skills):
    """Return the trimmed, non-empty comma-separated parts of `skills`.

    Parameters
    ----------
    skills : str
        Comma-delimited skills text.

    Returns
    -------
    list of str
        Each part with surrounding whitespace removed; blank parts are skipped.
    """
    trimmed_parts = (part.strip() for part in skills.split(','))
    return [part for part in trimmed_parts if part]

# Apply the function to separate skills
# After this line every entry of 'skills' is a Python list of strings, which is
# unhashable: hash-based operations on the column (value_counts, unique) will
# no longer work.
dataset['skills'] = dataset['skills'].apply(separate_skills)

In [22]:
# Vocabulary of every distinct skill. Sorted so the indicator columns come out
# in the same order on every run (iteration order of a set of strings is not
# stable between interpreter sessions).
skill_lists = dataset['skills'].tolist()
all_skills = sorted({skill for sublist in skill_lists for skill in sublist})
skill_position = {skill: pos for pos, skill in enumerate(all_skills)}

# Collect the (row, column) coordinate of every skill occurrence and set those
# cells in a preallocated boolean matrix. This avoids building rows x skills
# Python bool objects in nested lists before pandas sees them.
skills_per_row = np.fromiter(
    (len(sublist) for sublist in skill_lists),
    dtype=np.intp,
    count=len(skill_lists),
)
row_idx = np.repeat(np.arange(len(skill_lists)), skills_per_row)
col_idx = np.fromiter(
    (skill_position[skill] for sublist in skill_lists for skill in sublist),
    dtype=np.intp,
    count=int(skills_per_row.sum()),
)
indicator = np.zeros((len(skill_lists), len(all_skills)), dtype=bool)
indicator[row_idx, col_idx] = True

# Reuse dataset's index so the concat below aligns row-for-row even if the
# frame no longer carries a default RangeIndex.
skills_df = pd.DataFrame(indicator, columns=all_skills, index=dataset.index)

# Merge with original data
data = pd.concat([dataset, skills_df], axis=1)

In [25]:
# (rows, columns) after appending one indicator column per distinct skill.
data.shape

(1610462, 706)

In [None]:
# 'skills' holds Python lists at this point, and lists are unhashable, so
# value_counts() raises TypeError on a fresh run. Convert each list to a tuple
# before counting the distinct skill sets.
dataset["skills"].apply(tuple).nunique()

376

In [26]:
# Persist the engineered frame without the index column. The 'skills' column
# contains Python lists here, so CSV stores them as their string
# representation; they will read back as plain strings, not lists.
data.to_csv("Data/data_after_feature_eng_1.csv", index = False)