# 🔧 Phase 2 – Feature Engineering
**Objective:**  
Transform unstructured text data into structured numerical features for employee-project matching.

---

## ✅ Steps:
1. Import Libraries & Load Cleaned Data  
2. Build Skills Vocabulary  
3. Extract Employee Skills  
4. Extract Project Skills  
5. Vectorize (Binary + TF-IDF)  
6. Export Feature Matrices


In [141]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# File paths
EMP_PATH = "../data/processed/employee_master_cleaned.csv"
EXP_PATH = "../data/processed/employee_experience_cleaned.csv"
PROJ_PATH = "../data/processed/client_projects_cleaned.csv"

# Load data
df_emp = pd.read_csv(EMP_PATH)
df_exp = pd.read_csv(EXP_PATH)
df_proj = pd.read_csv(PROJ_PATH)

print("Data Loaded:")
print(df_emp.shape, df_exp.shape, df_proj.shape)


Data Loaded:
(1000, 4) (1000, 2) (100, 6)


In [142]:
df_emp.head()

Unnamed: 0,Employee_ID,Department,Years_Experience,Location
0,E101,AI Research,14.5,Pune
1,E102,DevOps,2.2,Pune
2,E103,Full Stack Dev,3.9,Mumbai
3,E104,AI Research,8.3,Chennai
4,E105,Data Science,6.0,Mumbai


In [143]:
# Extract unique skills from project Required_Skills
project_skills_set = set()
df_proj["Required_Skills"].dropna().apply(lambda x: project_skills_set.update([s.strip().lower() for s in x.split(",")]))

# Extract potential skills from employee Experience_Text (using simple tokenization)
employee_skills_set = set()
for text in df_exp["Experience_Text"].dropna():
    tokens = re.findall(r"[a-zA-Z0-9\+\#\.\-]+", text.lower())  # keep tech terms like .net, c++, etc.
    employee_skills_set.update(tokens)

# Merge both sets as our dynamic skills vocabulary
skills_vocab = sorted(project_skills_set.union(employee_skills_set))
print(f"✅ Total unique skills extracted: {len(skills_vocab)}")


✅ Total unique skills extracted: 176


In [144]:
len(project_skills_set), len(employee_skills_set), len(skills_vocab)

(58, 140, 176)

In [145]:
## . Extract Skills per Employee & Project
def extract_skills_dynamic(text, skills_set):
    text = text.lower()
    return [skill for skill in skills_set if skill in text]

df_exp["Extracted_Skills"] = df_exp["Experience_Text"].apply(lambda x: extract_skills_dynamic(str(x), skills_vocab))
df_proj["Skill_List"] = df_proj["Required_Skills"].apply(lambda x: [s.strip().lower() for s in str(x).split(",")])


In [146]:
df_exp.head()

Unnamed: 0,Employee_ID,Experience_Text,Extracted_Skills
0,E101,"worked on projects involving azure, blockchain...","[and, azure, blockchain, delivered, domain., f..."
1,E102,"worked on projects involving rest apis, data e...","[and, api, apis, data, data engineering, deliv..."
2,E103,worked on projects involving data visualizatio...,"[and, data, data visualization, delivered, des..."
3,E104,"worked on projects involving graphql, agile, a...","[agile, and, delivered, domain., for, graphql,..."
4,E105,"worked on projects involving java, azure, and ...","[and, azure, data, data lake implementation, d..."


In [147]:
df_proj.head()

Unnamed: 0,Project_ID,Client_Name,Project_Description,Required_Skills,Location,Status,Skill_List
0,P301,Byrd Ltd,looking for expertise in sql database manageme...,"docker, nosql, sql database management",Delhi,Closed,"[docker, nosql, sql database management]"
1,P302,"Cunningham, Anderson and Fernandez","looking for expertise in cloud security, conta...","cloud security, container orchestration, python",Pune,Open,"[cloud security, container orchestration, python]"
2,P303,"Mata, Mclean and Jones","looking for expertise in react, mobile develop...","microservices architecture, mobile development...",Mumbai,Closed,"[microservices architecture, mobile developmen..."
3,P304,Rodriguez-Dominguez,"looking for expertise in edge computing, jenki...","edge computing, iot, jenkins",Chennai,Closed,"[edge computing, iot, jenkins]"
4,P305,Medina-Maldonado,"looking for expertise in terraform, sql databa...","nlp, sql database management, terraform",Mumbai,Open,"[nlp, sql database management, terraform]"


In [148]:
import nltk
from nltk.corpus import stopwords

# Download stopwords once
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

skills_vocab = [skill for skill in skills_vocab if skill not in stop_words]



[nltk_data] Downloading package stopwords to C:\Users\Suraj
[nltk_data]     Khodade\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [149]:
len(skills_vocab)

173

In [150]:
import re

def clean_skill_name(skill):
    # Remove trailing dots, commas, and extra whitespace
    cleaned = skill.rstrip('.').rstrip(',').strip()
    # Optional: remove other punctuation if needed
    cleaned = re.sub(r'[^\w\s\-+]', '', cleaned)  # keep letters, digits, hyphen, plus sign
    return cleaned.lower()

skills_vocab_cleaned = [clean_skill_name(skill) for skill in skills_vocab]

# If you want unique and sorted skills after cleaning:
skills_vocab_cleaned = sorted(set(skills_vocab_cleaned))

In [151]:
# Create binary skill matrix for employees
employee_matrix = pd.DataFrame(0, index=df_exp["Employee_ID"], columns=skills_vocab_cleaned)
for idx, row in df_exp.iterrows():
    for skill in row["Extracted_Skills"]:
        if skill in employee_matrix.columns:
            employee_matrix.loc[row["Employee_ID"], skill] = 1

# Create binary skill matrix for projects
project_matrix = pd.DataFrame(0, index=df_proj["Project_ID"], columns=skills_vocab_cleaned)
for idx, row in df_proj.iterrows():
    for skill in row["Skill_List"]:
        if skill in project_matrix.columns:
            project_matrix.loc[row["Project_ID"], skill] = 1



In [152]:
employee_matrix.head()

Unnamed: 0_level_0,agile,agile project management,analytics,api,api development,apis,architecture,automation,aws,azure,...,telecom,tensorflow,terraform,testing,ui,uiux design,ux,visualization,warehousing,worked
Employee_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E101,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
E102,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
E103,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,1,1,0,1
E104,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
E105,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [153]:
project_matrix.head()

Unnamed: 0_level_0,agile,agile project management,analytics,api,api development,apis,architecture,automation,aws,azure,...,telecom,tensorflow,terraform,testing,ui,uiux design,ux,visualization,warehousing,worked
Project_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P301,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P302,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P303,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P304,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P305,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [154]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Use the cleaned skills vocabulary (skills_vocab_cleaned) if you have it,
# otherwise use your current skills_vocab list

# Instantiate TF-IDF vectorizer with predefined vocabulary
vectorizer = TfidfVectorizer(vocabulary=skills_vocab_cleaned, lowercase=True)

# Fit-transform the Experience_Text column
tfidf_matrix = vectorizer.fit_transform(df_exp["Experience_Text"].fillna(""))

# Convert to DataFrame for easy manipulation
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=df_exp["Employee_ID"], columns=vectorizer.get_feature_names_out())

print("TF-IDF matrix shape:", tfidf_df.shape)
print(tfidf_df.head())


TF-IDF matrix shape: (1000, 124)
               agile  agile project management  analytics  api  \
Employee_ID                                                      
E101         0.00000                       0.0        0.0  0.0   
E102         0.00000                       0.0        0.0  0.0   
E103         0.00000                       0.0        0.0  0.0   
E104         0.48381                       0.0        0.0  0.0   
E105         0.00000                       0.0        0.0  0.0   

             api development      apis  architecture  automation  aws  \
Employee_ID                                                             
E101                     0.0  0.000000           0.0         0.0  0.0   
E102                     0.0  0.404194           0.0         0.0  0.0   
E103                     0.0  0.000000           0.0         0.0  0.0   
E104                     0.0  0.000000           0.0         0.0  0.0   
E105                     0.0  0.000000           0.0         0.0  

In [155]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df_emp["Years_Experience_Norm"] = scaler.fit_transform(df_emp[["Years_Experience"]])

print(df_emp[["Years_Experience", "Years_Experience_Norm"]].head())


   Years_Experience  Years_Experience_Norm
0              14.5               0.971223
1               2.2               0.086331
2               3.9               0.208633
3               8.3               0.525180
4               6.0               0.359712


In [156]:
df_emp_dept = pd.get_dummies(df_emp["Department"], prefix="Dept")
df_emp_loc = pd.get_dummies(df_emp["Location"], prefix="Loc")

# Concatenate with main employee DataFrame
df_emp_features = pd.concat([df_emp[["Employee_ID", "Years_Experience_Norm"]], df_emp_dept, df_emp_loc], axis=1)

df_emp_features.set_index("Employee_ID", inplace=True)
print(df_emp_features.head())


             Years_Experience_Norm  Dept_AI Research  Dept_Cloud Engineering  \
Employee_ID                                                                    
E101                      0.971223              True                   False   
E102                      0.086331             False                   False   
E103                      0.208633             False                   False   
E104                      0.525180              True                   False   
E105                      0.359712             False                   False   

             Dept_Cybersecurity  Dept_Data Science  Dept_DevOps  \
Employee_ID                                                       
E101                      False              False        False   
E102                      False              False         True   
E103                      False              False        False   
E104                      False              False        False   
E105                      False      

In [157]:
# Example combining with binary skill matrix
employee_features_combined = pd.concat([employee_matrix, df_emp_features], axis=1)

print(employee_features_combined.shape)
employee_features_combined.head()


(1000, 135)


Unnamed: 0_level_0,agile,agile project management,analytics,api,api development,apis,architecture,automation,aws,azure,...,Dept_AI Research,Dept_Cloud Engineering,Dept_Cybersecurity,Dept_Data Science,Dept_DevOps,Dept_Full Stack Dev,Loc_Chennai,Loc_Delhi,Loc_Mumbai,Loc_Pune
Employee_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E101,0,0,0,0,0,0,0,0,0,1,...,True,False,False,False,False,False,False,False,False,True
E102,0,0,0,1,0,1,0,0,0,0,...,False,False,False,False,True,False,False,False,False,True
E103,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,True,False,False,True,False
E104,1,0,0,0,0,0,0,0,0,0,...,True,False,False,False,False,False,True,False,False,False
E105,0,0,0,0,0,0,0,0,0,1,...,False,False,False,True,False,False,False,False,True,False


In [158]:
import pickle

# Save employee skill matrix
with open("../model/employee_skill_matrix.pkl", "wb") as f:
    pickle.dump(employee_matrix, f)

# Save project skill matrix
with open("../model/project_skill_matrix.pkl", "wb") as f:
    pickle.dump(project_matrix, f)

with open("../model/employee_features_combined.pkl", "wb") as f:
    pickle.dump(employee_features_combined, f)

print("✅ Combined employee feature matrix saved.")

print("✅ Saved employee_skill_matrix.pkl and project_skill_matrix.pkl in ../data/processed/")


✅ Combined employee feature matrix saved.
✅ Saved employee_skill_matrix.pkl and project_skill_matrix.pkl in ../data/processed/
