In [25]:
# Locate the processed-data folder relative to the parent of the working directory
import os

# Step up one level from wherever the kernel was started, then normalise the path.
cwd_parent = os.path.join(os.getcwd(), "..")
BASE_DIR = os.path.abspath(cwd_parent)

# Processed datasets live under <BASE_DIR>/Data/processed.
DATA1_DIR = os.path.join(BASE_DIR, "Data", "processed")

In [26]:
# Load the placement master table from the processed-data folder and preview it
import pandas as pd

# NOTE: despite its name, placement_df holds the CSV *path*, not a DataFrame.
placement_df = os.path.join(DATA1_DIR, "placement_master.csv")
placement_master_df = pd.read_csv(filepath_or_buffer=placement_df)

# Last expression of the cell: renders the first five rows.
placement_master_df.head(5)

Unnamed: 0,cgpa,college_tier,branch,internship_count,project_count,skills_score,communication_score,certifications,dsa_score,placed
0,6.29,2,ECE,0,3,4,6,1,4,0
1,6.05,2,ECE,1,4,6,8,2,6,0
2,7.22,2,ME,1,4,6,6,2,6,0
3,7.78,2,ME,2,4,6,6,2,6,1
4,7.63,2,ME,1,4,6,5,2,6,1


In [27]:
# Max-scaling: divide every listed feature by its own column maximum
cols_normalize = [
    "cgpa",
    "internship_count",
    "project_count",
    "skills_score",
    "communication_score",
    "certifications",
    "dsa_score",
]

# Compute all column maxima in one pass, then write one "<name>_norm" column per feature.
column_maxima = placement_master_df[cols_normalize].max()
for feature, feature_max in column_maxima.items():
    placement_master_df[f"{feature}_norm"] = placement_master_df[feature].div(feature_max)

In [28]:
# Profile strength index: weighted blend of six max-scaled features (weights sum to 1.0)
psi_weights = {
    "cgpa_norm": 0.25,
    "internship_count_norm": 0.20,
    "project_count_norm": 0.20,
    "skills_score_norm": 0.15,
    "communication_score_norm": 0.10,
    "certifications_norm": 0.10,
}

# Build each weighted term, then add them left to right in the order listed above.
psi_terms = [weight * placement_master_df[column] for column, weight in psi_weights.items()]
placement_master_df["profile_strength_index"] = sum(psi_terms[1:], psi_terms[0])

In [29]:
# Technical strength: weighted blend of DSA, skills and project features (weights sum to 1.0)
technical_weights = {
    "dsa_score_norm": 0.4,
    "skills_score_norm": 0.3,
    "project_count_norm": 0.3,
}

# Build each weighted term, then add them left to right in the order listed above.
technical_terms = [
    weight * placement_master_df[column] for column, weight in technical_weights.items()
]
placement_master_df["technical_strength"] = sum(technical_terms[1:], technical_terms[0])

In [30]:
# Interaction features: each new column is the element-wise product of two existing columns
interaction_pairs = {
    "cgpa_internship": ("cgpa_norm", "internship_count_norm"),
    "project_skill_interaction": ("project_count_norm", "skills_score_norm"),
    "internship_communication": ("internship_count_norm", "communication_score_norm"),
    # Combines the two composite scores rather than raw normalised features.
    "psi_tech_interaction": ("profile_strength_index", "technical_strength"),
}

for new_column, (left, right) in interaction_pairs.items():
    placement_master_df[new_column] = placement_master_df[left].mul(placement_master_df[right])

In [31]:
# College tier weight: the reciprocal of the tier number (1 / college_tier)
placement_master_df["college_weight"] = (
    placement_master_df["college_tier"].rdiv(1)
)

In [32]:
# Persist the engineered frame as CSV in the same folder the input was read from
engineered_file_path = os.path.join(DATA1_DIR, "placement_engineered.csv")

# Create the target folder if needed; exist_ok makes this a no-op when it is already there.
os.makedirs(DATA1_DIR, exist_ok=True)

# index=False keeps the row index out of the written file.
placement_master_df.to_csv(path_or_buf=engineered_file_path, index=False)

In [33]:
# Check class imbalance, summary statistics, and each feature's correlation with the target.
#
# A notebook cell renders only its final expression. The class counts and the
# describe() table are therefore passed to display() explicitly; as bare
# expressions they would be computed and silently discarded.
# display() is available without an import inside IPython/Jupyter kernels.
display(placement_master_df["placed"].value_counts())
display(placement_master_df.describe())

# Final expression: correlation of every numeric column with "placed", ascending.
placement_master_df.corr(numeric_only=True)["placed"].sort_values()

internship_count             0.305416
internship_count_norm        0.305416
communication_score_norm     0.326256
communication_score          0.326256
cgpa_internship              0.344529
internship_communication     0.396338
dsa_score                    0.439541
skills_score                 0.439541
skills_score_norm            0.439541
dsa_score_norm               0.439541
project_skill_interaction    0.460815
technical_strength           0.463789
certifications               0.474312
certifications_norm          0.474312
cgpa_norm                    0.490489
cgpa                         0.490489
psi_tech_interaction         0.499042
project_count_norm           0.500622
project_count                0.500622
profile_strength_index       0.541496
placed                       1.000000
college_tier                      NaN
college_weight                    NaN
Name: placed, dtype: float64