In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle
import os

# ==========================================
# 1. LOAD DATA
# ==========================================
# Absolute, machine-specific location of the training CSV.
csv_path = r"D:\infosys\job_role_preditor\dataset\education_career_success_UPDATED.csv"
print(f"üìÇ Loading Data from: {csv_path}")

# Fail fast with a readable message instead of letting read_csv raise.
# isfile() (rather than exists()) also rejects a directory at that path.
if not os.path.isfile(csv_path):
    print("‚ùå Error: File not found! Check the path.")
    # The bare `exit()` helper is injected by the `site` module for interactive
    # use and, called without an argument, terminates with status 0. Raise
    # SystemExit with a non-zero status so the failure is visible to callers.
    raise SystemExit(1)

df = pd.read_csv(csv_path)

# ==========================================
# 1.1 AUTO-FIX COLUMN NAMES
# ==========================================
# Trim stray whitespace around header names so the alias lookup below matches.
df.columns = df.columns.str.strip()

# Canonical column name -> alternative header spellings that are renamed to it.
_COLUMN_ALIASES = {
    'Internships_Completed': ('Internship', 'Internships'),
    'University_GPA': ('GPA', 'CGPA'),
    'Field_of_Study': ('Specialization',),
    'Certifications': ('Certification', 'Certifications'),
    'Job_Role': ('Job_Roles', 'Role'),
}

# Flatten to the {old_name: new_name} shape that DataFrame.rename expects.
rename_map = {
    alias: canonical
    for canonical, aliases in _COLUMN_ALIASES.items()
    for alias in aliases
}
df.rename(columns=rename_map, inplace=True)

# Basic cleanup: merge one duplicate role label and fill missing values.
df['Job_Role'] = df['Job_Role'].replace('Software Engineer', 'Software Developer')

# Numeric columns: missing values become zero.
for _numeric_col, _fill_value in (('Internships_Completed', 0), ('University_GPA', 0.0)):
    df[_numeric_col] = df[_numeric_col].fillna(_fill_value)

# Certification text is lower-cased; missing entries become the literal "none".
df['Certifications'] = df['Certifications'].fillna("None").astype(str).str.lower()

# Categorical text columns: missing values become 'Other', then trim whitespace.
for _text_col in ('Field_of_Study', 'Degree'):
    df[_text_col] = df[_text_col].fillna('Other').astype(str).str.strip()

# ==========================================
# 2. FEATURE ENGINEERING (The Upgrade)
# ==========================================

# A. KEY SKILLS (This boosts accuracy significantly)
KEY_SKILLS = ['python', 'java', 'aws', 'react', 'cpa', 'autocad', 'scrum', 'six sigma']
print("üß† Extracting Skill Features...")

# Names of the indicator columns, in the same order as KEY_SKILLS.
skill_flags = [f'Has_{skill}' for skill in KEY_SKILLS]

# One 0/1 column per skill: 1 when the skill name occurs anywhere in the
# certification text (literal substring match, not a regular expression).
for skill, col_name in zip(KEY_SKILLS, skill_flags):
    df[col_name] = df['Certifications'].str.contains(skill, regex=False).astype('int64')

# B. Cert Count
def count_certs(val):
    """Return how many certifications a comma-separated string lists.

    Entries are trimmed and lower-cased before counting. Empty entries
    (from a trailing or doubled comma) and the placeholder "none" are
    ignored, so ``"aws, python,"`` counts as 2 and ``" None "`` as 0.

    Args:
        val: Certification text, e.g. ``"aws, python"``.

    Returns:
        Number of real certification entries; 0 when there are none.
    """
    entries = (part.strip().lower() for part in val.split(','))
    return sum(1 for entry in entries if entry and entry != "none")
# Per-row number of certifications, computed element-wise by count_certs.
df['Certifications_Count'] = df['Certifications'].map(count_certs)

# ==========================================
# 3. ENCODING & SCALING
# ==========================================

# A. Degree (CRITICAL MISSING PIECE)
# Each transformer is fitted on the full column and kept under its own name
# so it can be pickled later in the script.

# A. Degree -> integer label
le_degree = LabelEncoder().fit(df['Degree'])
df['Degree_Encoded'] = le_degree.transform(df['Degree'])

# B. Field of study -> integer label
le_field = LabelEncoder().fit(df['Field_of_Study'])
df['Field_Encoded'] = le_field.transform(df['Field_of_Study'])

# C. GPA -> min-max scaled (MinMaxScaler default range)
scaler = MinMaxScaler().fit(df[['University_GPA']])
df['GPA_Scaled'] = scaler.transform(df[['University_GPA']])

# D. Target: job role -> integer class id
le_target = LabelEncoder().fit(df['Job_Role'])
df['Role_Encoded'] = le_target.transform(df['Job_Role'])

# ==========================================
# 4. PREPARE X AND Y
# ==========================================
# Feature matrix layout: degree, field, GPA, certification count and
# internships, followed by one indicator column per key skill.
_base_features = [
    'Degree_Encoded',
    'Field_Encoded',
    'GPA_Scaled',
    'Certifications_Count',
    'Internships_Completed',
]
feature_cols = [*_base_features, *skill_flags]

# Plain NumPy arrays for scikit-learn: X holds the features, y the encoded role.
X = df[feature_cols].to_numpy()
y = df['Role_Encoded'].to_numpy()

# ==========================================
# 5. TRAIN & EVALUATE
# ==========================================
print("\nüìä Calculating Accuracy...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print("="*40)
print(f"üéØ NEW ACCURACY SCORE: {acc * 100:.2f}%")
print("="*40)

# ==========================================
# 6. SAVE MODELS
# ==========================================
print("\nüíæ Saving updated models...")
save_path = '../backend/database/ml/models/'
os.makedirs(save_path, exist_ok=True)

with open(save_path + 'model.pkl', 'wb') as f: pickle.dump(model, f)
with open(save_path + 'degree_encoder.pkl', 'wb') as f: pickle.dump(le_degree, f) # Save Degree Encoder
with open(save_path + 'field_encoder.pkl', 'wb') as f: pickle.dump(le_field, f)
with open(save_path + 'scaler.pkl', 'wb') as f: pickle.dump(scaler, f)
with open(save_path + 'target_encoder.pkl', 'wb') as f: pickle.dump(le_target, f)

print(f"‚úÖ All models saved to {save_path}")

üìÇ Loading Data from: D:\infosys\job_role_preditor\dataset\education_career_success_UPDATED.csv
üß† Extracting Skill Features...

üìä Calculating Accuracy...
üéØ NEW ACCURACY SCORE: 87.10%

üíæ Saving updated models...
‚úÖ All models saved to ../backend/database/ml/models/
