In [13]:
import pandas as pd

# Create a small fake employee dataset.
# Column-oriented: one list per field, position i in every list is employee i.
data = {
    'EmployeeName': ['Alice', 'Bob', 'Carlos', 'Diana', 'Evelyn', 'Frank', 'Grace', 'Hector', 'Isabel', 'James'],
    'Gender': ['Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male'],
    'Department': ['Sales', 'Engineering', 'HR', 'Engineering', 'Sales', 'Finance', 'HR', 'Engineering', 'Sales', 'Finance'],
    'Age': [25, 45, 30, 40, 28, 50, 36, 33, 29, 55],
    'MonthlyIncome': [3000, 8000, 4500, 7000, 3200, 9000, 5000, 5200, 3100, 9500],
    'JobSatisfaction': [3, 2, 4, 1, 3, 2, 3, 4, 2, 1],
    'YearsAtCompany': [1, 10, 3, 8, 2, 15, 5, 4, 1, 20],
    'OverTime': ['Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No'],
    'Attrition': ['Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No']
}

# Integer codes for the two-valued text columns.
BINARY_ENCODINGS = {
    'Attrition': {'Yes': 1, 'No': 0},
    'OverTime': {'Yes': 1, 'No': 0},
    'Gender': {'Male': 1, 'Female': 0},
}

# Turn it into a DataFrame.
# The raw, human-readable frame keeps its own name so the encoding step below
# always starts from the original labels: mapping an already-encoded column a
# second time would turn every value into NaN.
employees_raw = pd.DataFrame(data)

# Save the raw (un-encoded) data to CSV
employees_raw.to_csv('employee_data.csv', index=False)

# Preview
employees_raw.head()

# Convert Yes/No and Male/Female to 1s and 0s
df = employees_raw.copy()
for binary_column, label_codes in BINARY_ENCODINGS.items():
    encoded_values = employees_raw[binary_column].map(label_codes)
    # Series.map leaves NaN for any label that is not a key of the dict.
    # Fail loudly here instead of handing NaN to the model later on.
    if encoded_values.isna().any():
        bad_labels = employees_raw.loc[encoded_values.isna(), binary_column].unique().tolist()
        raise ValueError(
            f"Unexpected value(s) in column {binary_column!r}: {bad_labels}"
        )
    df[binary_column] = encoded_values

# Turn department into dummy variables (one-hot encoding).
# drop_first=True omits the first category so the remaining indicator columns
# are not redundant; dtype=int keeps the indicators as 0/1 integers on every
# pandas version (the default is bool on pandas >= 2.0).
df = pd.get_dummies(df, columns=['Department'], drop_first=True, dtype=int)

# Preview cleaned data
df.head()

# Step 1: Define the target (what we want to predict)
target_column = 'Attrition'
y = df[target_column]

# Step 2: Define the features (what we use to predict).
# The employee name is an identifier rather than a predictor, and the target
# must not be among the inputs, so both columns are dropped.
X = df.drop(columns=['EmployeeName', target_column])

# Check what our final features look like
X.head()

# Step 3: Split the data, train a logistic regression, and evaluate it.
# All scikit-learn imports for this step are grouped here.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing; the fixed random_state makes the split
# repeatable from run to run.
# NOTE(review): the split is not stratified and the dataset has only 10 rows,
# so the metrics printed below are illustrative rather than reliable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Create the model and train it on the training rows
# (fit returns the fitted estimator, so both steps can be chained).
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict the label of every held-out row
y_pred = model.predict(X_test)

# Overall share of correct predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of the model:", accuracy)

# Per-class breakdown of the predictions
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

import pandas as pd

# Pair each feature name with the coefficient the fitted model learned for it,
# then order the rows from the most positive coefficient to the most negative.
# The sign gives the direction of the association with the target.
# NOTE(review): the features are not standardized, so coefficient magnitudes
# are not directly comparable between columns measured on different scales.
coefficients = pd.DataFrame(
    {'Feature': X.columns, 'Coefficient': model.coef_[0]}
).sort_values(by='Coefficient', ascending=False)

coefficients


Accuracy of the model: 1.0
Confusion Matrix:
[[2 0]
 [0 2]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         2

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4



Unnamed: 0,Feature,Coefficient
5,OverTime,0.211668
8,Department_Sales,0.153421
3,JobSatisfaction,0.137616
0,Gender,0.057834
6,Department_Finance,-0.000413
2,MonthlyIncome,-0.000605
1,Age,-0.088861
4,YearsAtCompany,-0.149793
7,Department_HR,-0.664714
