In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

# Load the HR employee-attrition dataset into a DataFrame.
# The path uses a forward slash as the separator: it is accepted on Windows
# and POSIX alike, whereas a backslash is a path separator only on Windows
# (elsewhere it would be read as part of the file name).
data = pd.read_csv("data/WA_Fn-UseC_-HR-Employee-Attrition.csv")

# Preview the first rows as a quick sanity check on the columns that were read.
print(data.head())

# A single encoder instance is refit for every column in turn, so once this
# step finishes it only remembers the classes of the last column it saw
# ("Attrition"); the mappings of the earlier columns are not kept.
le = LabelEncoder()

# Overwrite each categorical column with its integer codes, in place.
# The target column ("Attrition") is deliberately encoded last.
for column in (
    "Gender",
    "OverTime",
    "Department",
    "JobRole",
    "MaritalStatus",
    "Attrition",
):
    data[column] = le.fit_transform(data[column])

# Columns used as model inputs: the numeric columns first, followed by the
# label-encoded categorical ones. The order here is the column order of X.
feature_columns = [
    "Age",
    "DistanceFromHome",
    "Education",
    "EnvironmentSatisfaction",
    "HourlyRate",
    "JobInvolvement",
    "JobLevel",
    "JobSatisfaction",
    "MonthlyIncome",
    "NumCompaniesWorked",
    "PercentSalaryHike",
    "PerformanceRating",
    "RelationshipSatisfaction",
    "StockOptionLevel",
    "TotalWorkingYears",
    "TrainingTimesLastYear",
    "WorkLifeBalance",
    "YearsAtCompany",
    "YearsInCurrentRole",
    "YearsSinceLastPromotion",
    "YearsWithCurrManager",
    "Gender",
    "OverTime",
    "Department",
    "JobRole",
    "MaritalStatus",
]

# Feature matrix and target vector (the label-encoded "Attrition" column).
X = data[feature_columns]
y = data["Attrition"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Decision tree classifier -------------------------------------------
# Entropy is the split criterion; the fixed seed makes the fitted tree
# reproducible. fit() returns the estimator itself, so `dt` is the fitted model.
dt = DecisionTreeClassifier(criterion="entropy", random_state=42).fit(X_train, y_train)

# Score the tree on the held-out rows.
y_dt_pred = dt.predict(X_test)
accuracy_dt = accuracy_score(y_true=y_test, y_pred=y_dt_pred)
print("Decision Tree Classifier Accuracy:", accuracy_dt)

# --- Linear regression ----------------------------------------------------
# NOTE(review): the target is the label-encoded "Attrition" column, so this
# regressor is fitted on integer class codes and its predictions are
# continuous values. That is why it is scored with MSE / R-squared below
# instead of accuracy; confirm a regressor is really what is wanted here.
lrr = LinearRegression().fit(X_train, y_train)
y_lrr_pred = lrr.predict(X_test)

# Regression metrics on the held-out rows.
mse = mean_squared_error(y_true=y_test, y_pred=y_lrr_pred)
r2 = r2_score(y_true=y_test, y_pred=y_lrr_pred)
print("Linear Regression Mean Squared Error:", mse)
print("Linear Regression R-squared:", r2)


   Age Attrition     BusinessTravel  DailyRate              Department  \
0   41       Yes      Travel_Rarely       1102                   Sales   
1   49        No  Travel_Frequently        279  Research & Development   
2   37       Yes      Travel_Rarely       1373  Research & Development   
3   33        No  Travel_Frequently       1392  Research & Development   
4   27        No      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0                 1          2  Life Sciences              1               1   
1                 8          1  Life Sciences              1               2   
2                 2          2          Other              1               4   
3                 3          4  Life Sciences              1               5   
4                 2          1        Medical              1               7   

   ...  RelationshipSatisfaction StandardHours  StockOptionLevel  \
0  ...