In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

np.random.seed(42)
print("‚úÖ Setup complete")


‚úÖ Setup complete


In [20]:
df = pd.read_csv('../Datasets/Placement_Dataset_EDA.csv')

In [21]:
df.columns

Index(['Email', 'Name', 'Gender', '10th board', '10th marks', '12th board',
       '12th marks', 'Stream', 'Cgpa', 'Internships(Y/N)', 'Training(Y/N)',
       'Backlog in 5th sem', 'Innovative Project(Y/N)', 'Communication level',
       'Technical Course(Y/N)', 'Placement(Y/N)?', 'Salary',
       'Package_Category', 'DSA_Score', 'Projects_Count', 'Cloud_Skills',
       'GitHub_Active', 'Competitive_Programming', 'Fullstack_Knowledge',
       'Technical_Skills_Score'],
      dtype='object')

In [22]:
df

Unnamed: 0,Email,Name,Gender,10th board,10th marks,12th board,12th marks,Stream,Cgpa,Internships(Y/N),...,Placement(Y/N)?,Salary,Package_Category,DSA_Score,Projects_Count,Cloud_Skills,GitHub_Active,Competitive_Programming,Fullstack_Knowledge,Technical_Skills_Score
0,payal_roy79@gmail.com,Payal Roy,Female,State Board,96.7,CBSE,70.2,Mechanical Engineering,7.37,No,...,Not Placed,0.0,Not Placed,2,0,No,No,0,No,4.0
1,shreyoshi_dey13@gmail.com,Shreyoshi Dey,Female,WBBSE,96.2,WBCHSE,90.6,Electronics and Communication Engineering,9.35,No,...,Not Placed,0.0,Not Placed,1,2,No,No,0,No,7.0
2,rohan_nandi12@gmail.com,Rohan Nandi,Male,State Board,97.5,CBSE,69.6,Information Technology,7.84,No,...,Placed,19.2,Standard,12,3,No,Yes,3,Yes,62.5
3,smita_agarwal90@gmail.com,Smita Agarwal,Female,CBSE,96.9,Other state Board,77.6,Computer Science in AIML,7.87,Yes,...,Not Placed,0.0,Not Placed,4,0,No,No,1,No,10.0
4,samaira_singhania95@gmail.com,Samaira Singhania,Female,ICSE,99.1,CBSE,62.8,Computer Science and Engineering,9.26,Yes,...,Not Placed,0.0,Not Placed,0,0,No,No,0,No,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,mandira_kapoor63@gmail.com,Mandira Kapoor,Female,ICSE,88.8,CBSE,89.1,Computer Science and Engineering,8.34,Yes,...,Not Placed,0.0,Not Placed,3,2,No,No,0,No,11.0
397,smita_yadav92@gmail.com,Smita Yadav,Female,State Board,86.7,CBSE,86.7,Computer Science and Engineering,7.77,Yes,...,Placed,20.2,Premium,15,7,Yes,Yes,5,Yes,97.5
398,manish_sinha90@gmail.com,Manish Sinha,Male,CBSE,83.5,CBSE,61.1,Electrical Engineering,8.85,No,...,Placed,19.9,Standard,9,4,Yes,Yes,4,Yes,76.0
399,pawan_sah12@gmail.com,Pawan Sah,Male,ICSE,99.8,ISE,75.4,Electronics and Communication Engineering,7.89,Yes,...,Not Placed,0.0,Not Placed,3,2,No,No,1,No,13.0


In [23]:
df_ml = df.copy()

# Drop unnecessary columns (including Backlog)
cols_to_drop = ['Email', 'Name', 'Gender', '10th board', '12th board', 'Stream', 
                'Salary', 'Placement(Y/N)?',
                'DSA_Score', 'Projects_Count', 'Cloud_Skills', 'GitHub_Active', 
                'Competitive_Programming', 'Fullstack_Knowledge',
                'Backlog in 5th sem']  # ‚Üê Added this!

df_ml = df_ml.drop(columns=cols_to_drop)

# Fix typos BEFORE encoding
df_ml['Innovative Project(Y/N)'] = df_ml['Innovative Project(Y/N)'].str.capitalize()
df_ml['Technical Course(Y/N)'] = df_ml['Technical Course(Y/N)'].str.capitalize()
df_ml['Technical Course(Y/N)'] = df_ml['Technical Course(Y/N)'].replace('Yess', 'Yes')


In [24]:

label_encoders = {}
categorical_cols = ['Internships(Y/N)', 'Training(Y/N)', 
                    'Innovative Project(Y/N)', 'Technical Course(Y/N)']

for col in categorical_cols:
    le = LabelEncoder()
    df_ml[col] = le.fit_transform(df_ml[col])
    label_encoders[col] = le

# Encode Package_Category (target)
le_target = LabelEncoder()
df_ml['Package_Category'] = le_target.fit_transform(df_ml['Package_Category'])
label_encoders['Package_Category'] = le_target

print("‚úÖ All encoding complete!")
print(f"\nFinal shape: {df_ml.shape}")
print(f"\nColumns: {df_ml.columns.tolist()}")

print("\nY/N Encoding:")
for col in categorical_cols:
    print(f"  {col}: {dict(zip(label_encoders[col].classes_, label_encoders[col].transform(label_encoders[col].classes_)))}")

print("\nPackage_Category Encoding:")
print(f"  {dict(zip(label_encoders['Package_Category'].classes_, label_encoders['Package_Category'].transform(label_encoders['Package_Category'].classes_)))}")


‚úÖ All encoding complete!

Final shape: (401, 10)

Columns: ['10th marks', '12th marks', 'Cgpa', 'Internships(Y/N)', 'Training(Y/N)', 'Innovative Project(Y/N)', 'Communication level', 'Technical Course(Y/N)', 'Package_Category', 'Technical_Skills_Score']

Y/N Encoding:
  Internships(Y/N): {'No': np.int64(0), 'Yes': np.int64(1)}
  Training(Y/N): {'No': np.int64(0), 'Yes': np.int64(1)}
  Innovative Project(Y/N): {'No': np.int64(0), 'Yes': np.int64(1)}
  Technical Course(Y/N): {'No': np.int64(0), 'Yes': np.int64(1)}

Package_Category Encoding:
  {'Basic': np.int64(0), 'Not Placed': np.int64(1), 'Premium': np.int64(2), 'Standard': np.int64(3)}


In [25]:
# Verify Package_Category encoding
print("Package_Category Encoding:")
print(f"  {dict(zip(label_encoders['Package_Category'].classes_, label_encoders['Package_Category'].transform(label_encoders['Package_Category'].classes_)))}")

print("\nPackage_Category value counts (encoded):")
print(df_ml['Package_Category'].value_counts().sort_index())

print("\nSample data check:")
print(df_ml.head())

print("\nData types (should all be numeric):")
print(df_ml.dtypes)


Package_Category Encoding:
  {'Basic': np.int64(0), 'Not Placed': np.int64(1), 'Premium': np.int64(2), 'Standard': np.int64(3)}

Package_Category value counts (encoded):
Package_Category
0     11
1    202
2    118
3     70
Name: count, dtype: int64

Sample data check:
   10th marks  12th marks  Cgpa  Internships(Y/N)  Training(Y/N)  \
0        96.7        70.2  7.37                 0              1   
1        96.2        90.6  9.35                 0              0   
2        97.5        69.6  7.84                 0              1   
3        96.9        77.6  7.87                 1              0   
4        99.1        62.8  9.26                 1              1   

   Innovative Project(Y/N)  Communication level  Technical Course(Y/N)  \
0                        0                    3                      1   
1                        1                    4                      0   
2                        1                    3                      1   
3                        1

In [26]:
# Save preprocessed ML-ready data
df_ml.to_csv('../Datasets/Placement_Dataset_Preprocessed.csv', index=False)


print("‚úÖ SAVED!")
print("\nüìÑ Placement_Data_Preprocessed.csv")
print(f"   Shape: {df_ml.shape}")


print("\nüìã Encoding Reference:")
print("   Package_Category:")
print("     0 = Basic")
print("     1 = Not Placed")
print("     2 = Premium")
print("     3 = Standard")


print("\nüéâ NOTEBOOK 1 COMPLETE!")


‚úÖ SAVED!

üìÑ Placement_Data_Preprocessed.csv
   Shape: (401, 10)

üìã Encoding Reference:
   Package_Category:
     0 = Basic
     1 = Not Placed
     2 = Premium
     3 = Standard

üéâ NOTEBOOK 1 COMPLETE!


In [27]:
# Check what columns you actually have
print("üìã Current columns in dataframe:")
print(df.columns.tolist())


üìã Current columns in dataframe:
['Email', 'Name', 'Gender', '10th board', '10th marks', '12th board', '12th marks', 'Stream', 'Cgpa', 'Internships(Y/N)', 'Training(Y/N)', 'Backlog in 5th sem', 'Innovative Project(Y/N)', 'Communication level', 'Technical Course(Y/N)', 'Placement(Y/N)?', 'Salary', 'Package_Category', 'DSA_Score', 'Projects_Count', 'Cloud_Skills', 'GitHub_Active', 'Competitive_Programming', 'Fullstack_Knowledge', 'Technical_Skills_Score']
