In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.feature_selection import RFE
import math

file = pd.read_csv('Absenteeism_data.csv')

# Handling missing data
numeric_data = file.select_dtypes(include=['float64', 'int64']).columns
file[numeric_data].fillna(file[numeric_data].mean(), inplace=True)

# Features and target variable
X = file.drop(columns=['Date', 'ID', 'Absenteeism Time in Hours'])
Y = file['Absenteeism Time in Hours']

# Standardize features
scaler = StandardScaler()
X_scale = scaler.fit_transform(X)

# Recursive Feature Elimination
def perform_rfe(model, X_scale, Y, n_features):
    rfe = RFE(estimator=model, n_features_to_select=n_features)
    X_rfe = rfe.fit_transform(X_scale, Y)
    selected_features = X.columns[rfe.support_]
    return X_rfe, selected_features

# Evaluation function
def evaluate_model(y_test, y_pred, model_name, num_features):
    mse = mean_squared_error(y_test, y_pred)
    rmse = math.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Adjusted R² calculation
    adjusted_r2 = 1 - (1 - r2) * (len(y_test) - 1) / (len(y_test) - num_features - 1)
    
    print(f"{model_name} Mean Squared Error (MSE): {mse:.4f}")
    print(f"{model_name} Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"{model_name} Mean Absolute Error (MAE): {mae:.4f}")
    print(f"{model_name} R² Score: {r2:.4f}")
    print(f"{model_name} Adjusted R² Score: {adjusted_r2:.4f}")
    print()

# Different numbers of features to select
n = [3, 5, 7, 9]
for n_features_to_select in n:
    print("*" * 120)
    print(f"{n_features_to_select} Components")
    
    # Perform RFE for different models
    X_rfe_linear, selected_features_linear = perform_rfe(LinearRegression(), X_scale, Y, n_features_to_select)
    X_rfe_ridge, selected_features_ridge = perform_rfe(Ridge(alpha=1), X_scale, Y, n_features_to_select)
    X_rfe_lasso, selected_features_lasso = perform_rfe(Lasso(alpha=1), X_scale, Y, n_features_to_select)

    print("Selected Features for Linear Regression:", selected_features_linear)
    print("Selected Features for Ridge Regression:", selected_features_ridge)
    print("Selected Features for Lasso Regression:", selected_features_lasso)

    # Train-test split
    X_train_linear, X_test_linear, Y_train, Y_test = train_test_split(X_rfe_linear, Y, test_size=0.2, random_state=42)
    X_train_ridge, X_test_ridge, _, _ = train_test_split(X_rfe_ridge, Y, test_size=0.2, random_state=42)
    X_train_lasso, X_test_lasso, _, _ = train_test_split(X_rfe_lasso, Y, test_size=0.2, random_state=42)

    # Initialize models
    linear_regression_model = LinearRegression()
    ridge_regression_model = Ridge(alpha=1)
    lasso_regression_model = Lasso(alpha=1)

    # Train models
    linear_regression_model.fit(X_train_linear, Y_train)
    ridge_regression_model.fit(X_train_ridge, Y_train)
    lasso_regression_model.fit(X_train_lasso, Y_train)

    # Make predictions
    y_pred_linear = linear_regression_model.predict(X_test_linear)
    y_pred_ridge = ridge_regression_model.predict(X_test_ridge)
    y_pred_lasso = lasso_regression_model.predict(X_test_lasso)

    # Evaluate models with number of features for adjusted R² calculation
    print("Linear Regression")
    evaluate_model(Y_test, y_pred_linear, "Linear Regression", n_features_to_select)
    
    print("Ridge Regression")
    evaluate_model(Y_test, y_pred_ridge, "Ridge Regression", n_features_to_select)
    
    print("Lasso Regression")
    evaluate_model(Y_test, y_pred_lasso, "Lasso Regression", n_features_to_select)


************************************************************************************************************************
3 Components
Selected Features for Linear Regression: Index(['Reason for Absence', 'Distance to Work', 'Children'], dtype='object')
Selected Features for Ridge Regression: Index(['Reason for Absence', 'Distance to Work', 'Children'], dtype='object')
Selected Features for Lasso Regression: Index(['Reason for Absence', 'Children', 'Pets'], dtype='object')
Linear Regression
Linear Regression Mean Squared Error (MSE): 145.7017
Linear Regression Root Mean Squared Error (RMSE): 12.0707
Linear Regression Mean Absolute Error (MAE): 5.4260
Linear Regression R² Score: 0.0700
Linear Regression Adjusted R² Score: 0.0495

Ridge Regression
Ridge Regression Mean Squared Error (MSE): 145.7112
Ridge Regression Root Mean Squared Error (RMSE): 12.0711
Ridge Regression Mean Absolute Error (MAE): 5.4264
Ridge Regression R² Score: 0.0699
Ridge Regression Adjusted R² Score: 0.0494

Lasso R

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file[numeric_data].fillna(file[numeric_data].mean(), inplace=True)
