In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
dataset = pd.read_csv("Data/cleaned_df.csv")

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 696012 entries, 0 to 696011
Data columns (total 20 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Qualifications      696012 non-null  object 
 1   location            696012 non-null  object 
 2   Country             696012 non-null  object 
 3   Work Type           696012 non-null  object 
 4   Company Size        696012 non-null  int64  
 5   Job Posting Date    696012 non-null  object 
 6   Preference          696012 non-null  object 
 7   Job Title           696012 non-null  object 
 8   Role                696012 non-null  object 
 9   Job Description     696012 non-null  object 
 10  skills              696012 non-null  object 
 11  Responsibilities    696012 non-null  object 
 12  Company             696012 non-null  object 
 13  Company Profile     696012 non-null  object 
 14  min_experience      696012 non-null  int64  
 15  max_experience      696012 non-nul

In [5]:
for col in dataset.columns:
    if dataset[col].dtype == "object":
        print(f"{col}: {len(dataset[col].value_counts().keys())}")

Qualifications: 10
location: 214
Country: 216
Work Type: 5
Job Posting Date: 731
Preference: 3
Job Title: 147
Role: 376
Job Description: 376
skills: 376
Responsibilities: 375
Company: 885
Company Profile: 884


In [6]:
dataset["Qualifications"].unique()

array(['B.Tech', 'BA', 'MBA', 'MCA', 'M.Com', 'BCA', 'B.Com', 'PhD',
       'BBA', 'M.Tech'], dtype=object)

In [7]:
bachelor = ["BCA", "BBA", "B.Tech", "B.Com", "BA"]
masters = ["MBA", "MCA", "M.Com", "M.Tech"]
phd = ["PhD"]

def map_qualification_to_level(qualifications):
    if qualifications in bachelor:
        return "Bachelor"
    elif qualifications in masters:
        return "Master"
    else:
        return "PhD"

In [8]:
dataset["Level"] = dataset["Qualifications"].apply(map_qualification_to_level)

dataset[["Level", "Qualifications"]].head()

Unnamed: 0,Level,Qualifications
0,Bachelor,B.Tech
1,Bachelor,BA
2,Bachelor,B.Tech
3,Bachelor,B.Tech
4,Master,MBA


In [9]:
bins = [0, 20000, 50000, 100000]
labels = ['small', 'medium', 'large']

dataset["Company_size"] = pd.cut(dataset["Company Size"], bins = bins, labels = labels, right = True, include_lowest = True)

In [10]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 696012 entries, 0 to 696011
Data columns (total 22 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   Qualifications      696012 non-null  object  
 1   location            696012 non-null  object  
 2   Country             696012 non-null  object  
 3   Work Type           696012 non-null  object  
 4   Company Size        696012 non-null  int64   
 5   Job Posting Date    696012 non-null  object  
 6   Preference          696012 non-null  object  
 7   Job Title           696012 non-null  object  
 8   Role                696012 non-null  object  
 9   Job Description     696012 non-null  object  
 10  skills              696012 non-null  object  
 11  Responsibilities    696012 non-null  object  
 12  Company             696012 non-null  object  
 13  Company Profile     696012 non-null  object  
 14  min_experience      696012 non-null  int64   
 15  max_experience   

In [11]:
dataset.columns

Index(['Qualifications', 'location', 'Country', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Job Title', 'Role',
       'Job Description', 'skills', 'Responsibilities', 'Company',
       'Company Profile', 'min_experience', 'max_experience',
       'average_experience', 'min_salary', 'max_salary', 'average_salary',
       'Level', 'Company_size'],
      dtype='object')

In [12]:
dataset[['Qualifications', 'location', 'Country', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Job Title', 'Role',
       'Job Description', 'skills']].head()

Unnamed: 0,Qualifications,location,Country,Work Type,Company Size,Job Posting Date,Preference,Job Title,Role,Job Description,skills
0,B.Tech,Bern,Switzerland,Part-Time,85230,2023-01-11,Female,Project Coordinator,Construction Project Coordinator,Construction Project Coordinators assist in ma...,"['Construction', 'project', 'management', 'Bui..."
1,BA,Stockholm,Sweden,Contract,129985,2022-01-18,Both,Procurement Specialist,Procurement Coordinator,Procurement Coordinators assist in procurement...,"['Procurement', 'processes', 'Purchase', 'orde..."
2,B.Tech,Male,Maldives,Contract,24650,2022-04-08,Male,Software Engineer,Frontend Developer,A Frontend Developer is responsible for design...,"['Proficiency', 'HTML', 'CSS', 'JavaScript', '..."
3,B.Tech,New Delhi,India,Intern,52871,2022-02-13,Female,Marketing Specialist,Marketing Analytics Specialist,Marketing Analytics Specialists analyze market...,"['Marketing', 'analytics', 'Data', 'analysis',..."
4,MBA,Managua,Nicaragua,Full-Time,96905,2021-12-13,Female,Finance Manager,Accounting Manager,Accounting Managers supervise accounting teams...,"['Accounting', 'principles', 'Financial', 'rep..."


In [13]:
drop_cols = ["location", "Country", "Job Posting Date", "Role", "Job Description", "Company", "Company Profile"]

In [14]:
dataset.drop(drop_cols, axis = 1, inplace = True)

In [15]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 696012 entries, 0 to 696011
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   Qualifications      696012 non-null  object  
 1   Work Type           696012 non-null  object  
 2   Company Size        696012 non-null  int64   
 3   Preference          696012 non-null  object  
 4   Job Title           696012 non-null  object  
 5   skills              696012 non-null  object  
 6   Responsibilities    696012 non-null  object  
 7   min_experience      696012 non-null  int64   
 8   max_experience      696012 non-null  int64   
 9   average_experience  696012 non-null  float64 
 10  min_salary          696012 non-null  int64   
 11  max_salary          696012 non-null  int64   
 12  average_salary      696012 non-null  float64 
 13  Level               696012 non-null  object  
 14  Company_size        497769 non-null  category
dtypes: category(1), f

In [16]:
dataset.head()

Unnamed: 0,Qualifications,Work Type,Company Size,Preference,Job Title,skills,Responsibilities,min_experience,max_experience,average_experience,min_salary,max_salary,average_salary,Level,Company_size
0,B.Tech,Part-Time,85230,Female,Project Coordinator,"['Construction', 'project', 'management', 'Bui...","Coordinate construction projects, including pe...",1,9,5.0,55000,96000,75500.0,Bachelor,large
1,BA,Contract,129985,Both,Procurement Specialist,"['Procurement', 'processes', 'Purchase', 'orde...","Support procurement activities, such as purcha...",1,13,7.0,55000,92000,73500.0,Bachelor,
2,B.Tech,Contract,24650,Male,Software Engineer,"['Proficiency', 'HTML', 'CSS', 'JavaScript', '...",Design and implement user interfaces for web a...,3,11,7.0,55000,127000,91000.0,Bachelor,medium
3,B.Tech,Intern,52871,Female,Marketing Specialist,"['Marketing', 'analytics', 'Data', 'analysis',...",Analyze marketing data and generate insights t...,4,10,7.0,58000,85000,71500.0,Bachelor,large
4,MBA,Full-Time,96905,Female,Finance Manager,"['Accounting', 'principles', 'Financial', 'rep...","Manage accounting functions, including financi...",3,10,6.5,62000,93000,77500.0,Master,large


In [17]:
# data = pd.get_dummies(dataset, columns = ["skills"])

In [18]:
dataset.shape

(696012, 15)

In [19]:
dataset.shape

(696012, 15)

In [20]:
# Normalize and tokenize the skills
dataset['skills'] = dataset['skills'].apply(lambda x: [skill.lower().strip() for skill in x.split()])

In [21]:
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

In [22]:
glove_embeddings = load_glove_embeddings('Data/glove.6B.100d.txt')

In [23]:
def get_skill_embedding(skill, embeddings_index, embedding_dim=100):
    return embeddings_index.get(skill, np.zeros(embedding_dim))

# Create embeddings for each skill
dataset['skill_embeddings'] = dataset['skills'].apply(lambda skills: 
    [get_skill_embedding(skill, glove_embeddings) for skill in skills])

In [24]:
def aggregate_embeddings(skill_embeddings):
    if len(skill_embeddings) == 0:
        return np.zeros(len(skill_embeddings[0]))
    return np.mean(skill_embeddings, axis=0)

In [25]:
# Aggregate embeddings for each job posting
dataset['aggregated_skill_embeddings'] = dataset['skill_embeddings'].apply(aggregate_embeddings)

# Convert the aggregated skill embeddings into a matrix
embeddings_matrix = np.vstack(dataset['aggregated_skill_embeddings'].values)

# Display the shape of the embeddings matrix
print(embeddings_matrix.shape)

(696012, 100)


In [26]:
from sklearn.decomposition import PCA

In [27]:
pca = PCA(n_components=40)
reduced_data = pca.fit_transform(embeddings_matrix)

# Convert the reduced NumPy array to a DataFrame
reduced_df = pd.DataFrame(reduced_data, columns=[f'feature_{i+1}' for i in range(reduced_data.shape[1])])

# Concatenate the original DataFrame with the reduced DataFrame
merged_df = pd.concat([dataset, reduced_df], axis=1)

print(merged_df.shape)

  self.explained_variance_ratio_ = self.explained_variance_ / total_var


(696012, 57)


In [28]:
merged_df.columns

Index(['Qualifications', 'Work Type', 'Company Size', 'Preference',
       'Job Title', 'skills', 'Responsibilities', 'min_experience',
       'max_experience', 'average_experience', 'min_salary', 'max_salary',
       'average_salary', 'Level', 'Company_size', 'skill_embeddings',
       'aggregated_skill_embeddings', 'feature_1', 'feature_2', 'feature_3',
       'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8',
       'feature_9', 'feature_10', 'feature_11', 'feature_12', 'feature_13',
       'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18',
       'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23',
       'feature_24', 'feature_25', 'feature_26', 'feature_27', 'feature_28',
       'feature_29', 'feature_30', 'feature_31', 'feature_32', 'feature_33',
       'feature_34', 'feature_35', 'feature_36', 'feature_37', 'feature_38',
       'feature_39', 'feature_40'],
      dtype='object')

In [29]:
dataset = merged_df.copy()

In [30]:
cols = ["Qualifications", "Work Type", "Company Size", "Preference", "Job Title",
        "min_experience","max_experience", "average_experience", "min_salary", 
        "max_salary","average_salary", "Level", "Company_size",
        "feature_1", "feature_2", "feature_3",
        "feature_4", "feature_5", "feature_6", "feature_7", "feature_8",
        "feature_9", "feature_10", "feature_11", "feature_12", "feature_13",
        "feature_14", "feature_15", "feature_16", "feature_17", "feature_18",
        "feature_19", "feature_20", "feature_21", "feature_22", "feature_23",
        "feature_24", "feature_25", "feature_26", "feature_27", "feature_28",
        "feature_29", "feature_30", "feature_31", "feature_32", "feature_33",
        "feature_34", "feature_35", "feature_36", "feature_37", "feature_38",
        "feature_39", "feature_40"]

dataset = dataset[cols]

In [31]:
dataset.head()

Unnamed: 0,Qualifications,Work Type,Company Size,Preference,Job Title,min_experience,max_experience,average_experience,min_salary,max_salary,...,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40
0,B.Tech,Part-Time,85230,Female,Project Coordinator,1,9,5.0,55000,96000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,BA,Contract,129985,Both,Procurement Specialist,1,13,7.0,55000,92000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,B.Tech,Contract,24650,Male,Software Engineer,3,11,7.0,55000,127000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,B.Tech,Intern,52871,Female,Marketing Specialist,4,10,7.0,58000,85000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,MBA,Full-Time,96905,Female,Finance Manager,3,10,6.5,62000,93000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [33]:
# Apply one-hot encoding
one_hot = pd.get_dummies(dataset[["Qualifications", "Work Type", "Preference"]])

# Apply label encoding to other categorical columns
label_encoders = {}
for col in ["Company Size", "Level", "Company_size"]:
    le = LabelEncoder()
    dataset[col] = le.fit_transform(dataset[col].astype(str))
    label_encoders[col] = le

# Combine one-hot encoded columns with the label encoded columns
df_encoded = pd.concat([dataset.drop(columns=["Qualifications", "Work Type", "Preference"]), one_hot], axis=1)

print(df_encoded.shape)

(696012, 68)


In [34]:
df_encoded.to_csv("Data/data_after_feature_eng.csv", index = False)