In [None]:
# Updated function to generate synthetic employee data with 'Attrition' and 'EducationLevel'
def generate_synthetic_data(n=50000, seed=42):
    """Build a synthetic employee dataset whose features are tied to a risk tier.

    Every employee is first assigned a risk tier ('Low Risk', 'Medium Risk' or
    'High Risk' with probabilities 0.3 / 0.5 / 0.2). The satisfaction,
    engagement, performance, work-life-balance and overtime columns are then
    drawn from tier-specific, non-overlapping ranges, while the remaining
    columns are drawn independently of the tier.

    Parameters
    ----------
    n : int, default 50000
        Number of employee rows to generate.
    seed : int or None, default 42
        Seed for the private random generator. The default reproduces the
        dataset this function has always produced; ``None`` lets NumPy pick
        a non-deterministic seed.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with the columns EmployeeID, Age, Gender, Department,
        Tenure, JobRole, Salary, JobSatisfaction, EngagementScore,
        PerformanceRating, WorkLifeBalance, DistanceFromHome, TrainingHours,
        Overtime, EducationLevel, 'Risk Category' and Attrition.
    """
    # A private legacy RandomState yields the same stream as
    # np.random.seed(seed) followed by module-level np.random.* calls, but it
    # does not reseed NumPy's global generator for the rest of the notebook.
    rng = np.random.RandomState(seed)

    # Risk distribution (30% low, 50% medium, 20% high risk)
    risk_distribution = rng.choice(
        ['Low Risk', 'Medium Risk', 'High Risk'],
        size=n,
        p=[0.3, 0.5, 0.2]
    )
    is_low = risk_distribution == 'Low Risk'
    is_medium = risk_distribution == 'Medium Risk'
    is_high = risk_distribution == 'High Risk'

    def by_risk(low, medium, high):
        """Select, per row, the candidate value matching that row's risk tier."""
        return np.where(is_low, low, np.where(is_medium, medium, high))

    # NOTE: the order of the rng calls below (low, medium, high within each
    # feature, and feature after feature) fixes the generated values; do not
    # reorder them if the output must stay reproducible.
    # randint's upper bound is exclusive.

    # Job Satisfaction (scale of 1-10): low 8-10, medium 4-7, high 1-3
    job_satisfaction = by_risk(
        rng.randint(8, 11, size=n),
        rng.randint(4, 8, size=n),
        rng.randint(1, 4, size=n),
    )

    # Engagement Score (scale of 0-100): low 75-100, medium 40-74, high 0-39
    engagement_score = by_risk(
        rng.randint(75, 101, size=n),
        rng.randint(40, 75, size=n),
        rng.randint(0, 40, size=n),
    )

    # Performance Rating (scale of 1-5): low 4-5, medium 2-3, high always 1
    performance_rating = by_risk(
        rng.randint(4, 6, size=n),
        rng.randint(2, 4, size=n),
        rng.randint(1, 2, size=n),
    )

    # Work-Life Balance (scale of 1-5): low 4-5, medium 2-3, high always 1
    work_life_balance = by_risk(
        rng.randint(4, 6, size=n),
        rng.randint(2, 4, size=n),
        rng.randint(1, 2, size=n),
    )

    # Overtime: always 'Yes' for high risk, otherwise 'Yes' with probability 0.3
    overtime = np.where(
        is_high,
        rng.choice(['Yes'], size=n),
        rng.choice(['No', 'Yes'], size=n, p=[0.7, 0.3])
    )

    # Education Level (High School, Bachelor's, Master's)
    education_level = rng.choice(
        ['High School', "Bachelor's", "Master's"], size=n, p=[0.3, 0.5, 0.2]
    )

    # Other features that are drawn independently of the risk tier
    age = rng.randint(22, 60, size=n)
    gender = rng.choice(['Male', 'Female'], size=n, p=[0.5, 0.5])
    department = rng.choice(['R&D', 'Sales', 'HR', 'Marketing', 'Finance'], size=n)
    tenure = rng.randint(1, 21, size=n)
    job_role = rng.choice(['Engineer', 'Manager', 'Technician', 'Analyst', 'Developer'], size=n)
    salary = rng.randint(40000, 120000, size=n)
    distance_from_home = rng.randint(1, 50, size=n)
    training_hours = rng.randint(10, 100, size=n)

    # 'Attrition' is deterministic: 1 for every High Risk employee, 0 otherwise
    attrition = np.where(is_high, 1, 0)

    # Combine all features into a DataFrame
    data = pd.DataFrame({
        'EmployeeID': [f'E{str(i).zfill(5)}' for i in range(1, n + 1)],
        'Age': age,
        'Gender': gender,
        'Department': department,
        'Tenure': tenure,
        'JobRole': job_role,
        'Salary': salary,
        'JobSatisfaction': job_satisfaction,
        'EngagementScore': engagement_score,
        'PerformanceRating': performance_rating,
        'WorkLifeBalance': work_life_balance,
        'DistanceFromHome': distance_from_home,
        'TrainingHours': training_hours,
        'Overtime': overtime,
        'EducationLevel': education_level,
        'Risk Category': risk_distribution,
        'Attrition': attrition  # target variable
    })

    return data

# Build the 50,000-row synthetic dataset, then write it to CSV without the index column
synthetic_data = generate_synthetic_data(50000)
synthetic_data.to_csv(
    'synthetic_employee_data.csv',
    index=False,
)


: 

# Train and save the attrition models

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd
import pickle

# Read the synthetic employee records back from the CSV file
df = pd.read_csv("synthetic_employee_data.csv")

# Columns to one-hot encode and columns to standardise, respectively.
# NOTE(review): "Risk Category" stays in X but appears in neither list, so it
# should be discarded by ColumnTransformer's documented default
# remainder="drop" and never reach the models — confirm this is intended.
categorical_features = [
    "Gender", "Department", "JobRole", "Overtime", "EducationLevel",
]
numerical_features = [
    "Age", "Tenure", "Salary", "JobSatisfaction", "EngagementScore",
    "PerformanceRating", "DistanceFromHome", "TrainingHours", "WorkLifeBalance",
]

# Target is the Attrition flag; the EmployeeID identifier is removed from the inputs
y = df["Attrition"]
X = df.drop(columns=["Attrition", "EmployeeID"])

# Scale the numeric columns and one-hot encode the categorical ones
preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), numerical_features),
    ("cat", OneHotEncoder(), categorical_features),
])

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessor on the training rows only, then apply the same
# fitted transformation to the held-out rows
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Train a Random Forest: 200 trees, each limited to depth 8, fixed seed
rf_model = RandomForestClassifier(n_estimators=200, max_depth=8, random_state=42)
rf_model.fit(X_train, y_train)

# Train AdaBoost: 500 boosting rounds, learning rate 0.2, fixed seed
ada_model = AdaBoostClassifier(n_estimators=500, learning_rate=0.2, random_state=42)
ada_model.fit(X_train, y_train)

# Save the fitted estimators. pickle.dump must receive the model object itself:
# passing a path string would store only that string in the file. Both files
# use paths relative to the working directory, so no machine-specific absolute
# path (and no unescaped backslashes) is involved.
# NOTE(review): the fitted `preprocessor` is not persisted here; anything that
# loads these models must apply the identical transformation first — confirm
# it is saved elsewhere.
with open('rf_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)
with open('ada_model.pkl', 'wb') as f:
    pickle.dump(ada_model, f)

print("Models retrained and saved.")


In [None]:
print(df.columns)  # Show the column labels of the DataFrame loaded from the CSV
