In [2]:
import random
import pandas as pd
import numpy as np
import sklearn
import pickle
from sklearn import tree
from sklearn import linear_model
from sklearn import svm
from sklearn import ensemble
from sklearn import neighbors
from sklearn.model_selection import GridSearchCV

In [3]:
# Job titles available in each department. The key order of this dict is also
# the canonical department order used everywhere else.
job_titles = {
    "Marketing": [
        "Marketing Manager",
        "Content Creator",
        "Social Media Specialist",
    ],
    "Sales": [
        "Account Executive",
        "Business Development Rep",
        "Sales Representative",
    ],
    "Engineering": [
        "Software Engineer",
        "Data Scientist",
        "QA Engineer",
    ],
    "Human Resources": [
        "HR Specialist",
        "Recruiter",
        "Talent Manager",
    ],
    "Finance": [
        "Accountant",
        "Financial Analyst",
        "Controller",
    ],
    "Operations": [
        "Project Manager",
        "Supply Chain Manager",
        "Operations Analyst",
    ],
    "IT": [
        "Network Administrator",
        "Software Developer",
        "IT Support Specialist",
    ],
}

# Department names, in the insertion order of `job_titles`
departments = list(job_titles)

# Possible performance ratings, best first
performance_ratings = [
    "Exceeds Expectations",
    "Meets Expectations",
    "Needs Improvement",
]


def generate_data(num_employees, seed=None):
  """
  Generates a synthetic dataset for employee attrition analysis.

  Each employee gets a random department, a job title drawn from that
  department's list, uniformly drawn integer attributes, a weighted
  performance rating, and an attrition label drawn from a probability that
  rises as tenure and work-life balance fall.

  Args:
      num_employees: Number of employees to generate data for.
      seed: Optional seed. When given, a private ``random.Random(seed)`` is
          used, so the same seed always yields the same dataset and the
          global ``random`` state is left untouched. When ``None`` (the
          default) the module-level ``random`` generator is used.

  Returns:
      A list of dictionaries, where each dictionary represents an employee.
  """
  # The `random` module exposes the same choice/randint/choices/random calls
  # as a Random instance, so either can serve as the source of draws.
  rng = random if seed is None else random.Random(seed)

  data = []
  for _ in range(num_employees):
    # Select department and job title
    department = rng.choice(departments)
    job_title = rng.choice(job_titles[department])

    # Generate numerical data
    tenure = rng.randint(1, 10)
    age = rng.randint(22, 55)
    salary = rng.randint(50000, 150000)
    manager_satisfaction = rng.randint(1, 10)
    work_life_balance = rng.randint(1, 10)
    num_children = rng.randint(0, 4)  # Add number of children

    # Assign performance rating based on probability
    performance_rating = rng.choices(performance_ratings, weights=[0.2, 0.6, 0.2])[0]

    # Determine attrition based on probability (example logic, can be adjusted).
    # For the ranges drawn above the raw score spans -0.6 .. 0.75, so clamp it
    # to a valid probability. The outcome is unchanged because rng.random()
    # lies in [0.0, 1.0): a negative score could never be hit anyway.
    raw_score = 0.1 + 0.05 * (10 - tenure) + 0.1 * (3 - work_life_balance)
    attrition_chance = min(1.0, max(0.0, raw_score))
    attrition = "Yes" if rng.random() < attrition_chance else "No"

    # Create employee data dictionary
    employee_data = {
        "Department": department,
        "Job Title": job_title,
        "Tenure (Years)": tenure,
        "Age": age,
        "Performance Rating": performance_rating,
        "Salary (USD)": salary,
        "Manager Satisfaction": manager_satisfaction,
        "Work-Life Balance": work_life_balance,
        "Number of Children": num_children,
        "Attrition (Yes/No)": attrition
    }

    data.append(employee_data)

  return data

# Generate data for 4000 employees
data = generate_data(4000)

In [4]:
# Load the generated employee records (a list of dicts) into a DataFrame:
# one row per employee, one column per dictionary key.
df = pd.DataFrame(data)
# Print the frame as plain text
print(df)

           Department                 Job Title  Tenure (Years)  Age  \
0     Human Resources            Talent Manager               1   45   
1               Sales  Business Development Rep               1   32   
2           Marketing         Marketing Manager               4   24   
3     Human Resources            Talent Manager               2   41   
4         Engineering         Software Engineer               6   50   
...               ...                       ...             ...  ...   
3995            Sales      Sales Representative               9   34   
3996          Finance         Financial Analyst               4   40   
3997          Finance         Financial Analyst               6   25   
3998        Marketing           Content Creator               7   38   
3999      Engineering               QA Engineer               4   23   

      Performance Rating  Salary (USD)  Manager Satisfaction  \
0     Meets Expectations        141379                     6   
1      

In [5]:
# Map specific values to department and job title columns.
# Work on a copy so `df` keeps the original string values.
df_mapped = df.copy()

# Define mappings for departments and job titles.
# Departments are numbered by their position in `departments`; job titles are
# numbered by their position within their own department's list, so the same
# job-title code can appear under different departments.
department_mapping = {department: index for index, department in enumerate(departments)}
job_title_mapping = {
    department: {job_title: index for index, job_title in enumerate(job_titles[department])}
    for department in departments
}

# Apply mappings to department and job title columns.
# The job-title lookup reads the original string columns of `df` directly, which
# avoids a slow row-wise DataFrame.apply and does not depend on 'Department'
# having already been converted to integers.
df_mapped['Job Title'] = [
    job_title_mapping[department][job_title]
    for department, job_title in zip(df['Department'], df['Job Title'])
]
df_mapped['Department'] = df_mapped['Department'].map(department_mapping)

# Map "Yes" to 1 and "No" to 0 for the Attrition column
df_mapped['Attrition (Yes/No)'] = df_mapped['Attrition (Yes/No)'].map({'Yes': 1, 'No': 0})

# Map performance ratings to specific values
performance_mapping = {'Exceeds Expectations': 3, 'Meets Expectations': 2, 'Needs Improvement': 1}
df_mapped['Performance Rating'] = df_mapped['Performance Rating'].map(performance_mapping)

# Display the updated dataframe
print(df_mapped)

# NOTE: despite its name, `predictor` holds the name of the target column;
# every other column is used as a feature.
predictor="Attrition (Yes/No)"
x=np.array(df_mapped.drop([predictor],axis=1))
y=np.array(df_mapped[predictor])

# Hold out 20% of the rows for the final evaluation. The fixed random_state
# makes the split identical on every run for the same input data.
x_train,x_test,y_train,y_test=sklearn.model_selection.train_test_split(x,y,test_size=0.2,random_state=42)

# Search over parameter combinations.
# Candidates are compared by 5-fold cross-validated accuracy on the training
# split only, so the test split is never used to pick a model. The previously
# hard-coded configuration (max_depth=4, criterion="entropy") is one of the
# candidates.
param_grid = {
    "max_depth": list(range(1, 16)),
    "criterion": ["gini", "entropy"],
}
search = GridSearchCV(
    tree.DecisionTreeClassifier(random_state=42),
    param_grid,
    scoring="accuracy",
    cv=5,
)
search.fit(x_train, y_train)

# GridSearchCV refits the winning configuration on the whole training split
model = search.best_estimator_

# Evaluate the selected model once on the held-out test split
accuracy = model.score(x_test, y_test)
best_model = model
best_accuracy = accuracy

# # Save the best model to a pickle file
# with open('decisiontree.pkl', 'wb') as file:
#     pickle.dump(best_model, file)

print("Best Decision Tree Parameters:", search.best_params_)
print("Best Decision Tree Model Accuracy:", best_accuracy)

      Department  Job Title  Tenure (Years)  Age  Performance Rating  \
0              3          2               1   45                   2   
1              1          1               1   32                   1   
2              0          0               4   24                   1   
3              3          2               2   41                   2   
4              2          0               6   50                   2   
...          ...        ...             ...  ...                 ...   
3995           1          2               9   34                   2   
3996           4          1               4   40                   1   
3997           4          1               6   25                   2   
3998           0          1               7   38                   2   
3999           2          2               4   23                   2   

      Salary (USD)  Manager Satisfaction  Work-Life Balance  \
0           141379                     6                  6   
1        