In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Location of the training CSV, kept in one named constant so it is easy to spot.
# NOTE(review): this is an absolute Windows path under a specific user's Desktop;
# it will only resolve on a machine with that exact layout.
TRAIN_CSV_PATH = "C:\\Users\\Administrator\\Desktop\\iBit_Soft_Project_-1\\iBit_Soft_Project\\TemaTimeSync\\train.csv"
employee_data = pd.read_csv(TRAIN_CSV_PATH)


In [None]:
# Label-encode every object-dtype column of employee_data in place, and keep the
# fitted encoder for each column (keyed by column name) so the same mapping can
# be reapplied later.
label_encoders = {}
categorical_columns = employee_data.select_dtypes(include=['object']).columns
for col_name in categorical_columns:
    encoder = LabelEncoder()
    employee_data[col_name] = encoder.fit_transform(employee_data[col_name])
    label_encoders[col_name] = encoder


In [None]:
# Remove the 'LAST LOGOUT' column so it is not part of the feature matrix.
# errors='ignore' makes this cell idempotent: without it, running the cell a
# second time in the same kernel raises KeyError because the column is already gone.
employee_data = employee_data.drop(columns=['LAST LOGOUT'], errors='ignore')

In [None]:
# Preview only the first rows; displaying the bare frame would embed a much
# larger slice of the (possibly personal) data set in the saved notebook output.
employee_data.head()

In [None]:
# Split the frame into the label (the 'IS LATE' column) and the features
# (every remaining column).
y = employee_data['IS LATE']
feature_frame = employee_data.drop(columns=['IS LATE'])
X = feature_frame


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Baseline 1: AdaBoost with 100 boosting rounds and a learning rate of 0.1.
# random_state is pinned so a fresh "Restart & Run All" reproduces the same
# fitted model and score; 42 matches the seed used for the train/test split.
model = AdaBoostClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
# Model training
model.fit(X_train, y_train)
# Model testing on the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

In [None]:
# Per-class metrics for whatever predictions y_pred currently holds; when the
# notebook is run top to bottom these are the AdaBoost predictions from the
# cell directly above.
report = classification_report(y_test, y_pred)
print('The Classification Report:', report, sep='\n')

In [None]:
# Baseline 2: a single decision tree.
# NOTE(review): max_depth=2000 is presumably meant as "no practical depth limit" — confirm.
# random_state is pinned because scikit-learn permutes the candidate features at
# every split, so an unseeded tree can differ from run to run.
model_two = DecisionTreeClassifier(max_depth=2000, random_state=42)
# Model training
model_two.fit(X_train, y_train)
# Model testing on the held-out split
y_pred = model_two.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')


In [None]:
# Per-class metrics for the current y_pred; in top-to-bottom execution these are
# the decision-tree predictions produced by the preceding cell.
report = classification_report(y_test, y_pred)
print('The Classification Report:', report, sep='\n')

In [None]:
# Baseline 3: a seeded random forest of 100 trees, built and trained in one
# expression (fit() returns the estimator itself).
model_three = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the forest on the held-out split.
y_pred = model_three.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')


In [None]:
# Per-class metrics for the current y_pred; in top-to-bottom execution these are
# the random-forest predictions produced by the preceding cell.
report = classification_report(y_test, y_pred)
print('The Classification Report:', report, sep='\n')

In [None]:
# Base learners for the stacking model as (label, estimator) pairs.
# NOTE(review): the labels are arbitrary placeholders; they are kept unchanged
# because they become part of the stacked model's parameter names
# (e.g. 'Banana__n_estimators').
estimators = [
    ("Banana", model),        # AdaBoost
    ("Apple", model_two),     # decision tree
    ("Shirt", model_three),   # random forest
]

# The meta-learner is a decision tree trained on the base learners' outputs.
# Its random_state is pinned so the final estimator does not add run-to-run
# variation of its own.
stack_model = StackingClassifier(
    estimators=estimators,
    final_estimator=DecisionTreeClassifier(max_depth=1000, random_state=42),
)
stack_model.fit(X_train, y_train)
# Model testing on the held-out split
y_pred = stack_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

In [None]:
# Per-class metrics for the current y_pred; in top-to-bottom execution these are
# the stacking-model predictions produced by the preceding cell.
report = classification_report(y_test, y_pred)
print('The Classification Report:', report, sep='\n')

In [None]:
# Refined grid search over the hyperparameters of the stacking model's final
# (meta) estimator: 3 depths x 3 split thresholds, scored by accuracy.
refined_param_grid = {
    'final_estimator__max_depth': [12, 15, 18],
    'final_estimator__min_samples_split': [5, 7, 10]
}

# refit=True (the default, spelled out here) retrains the best configuration on
# all of X_train once the 5-fold search has finished.
refined_grid_search = GridSearchCV(
    stack_model,
    refined_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    refit=True,
)

# Run the search on the training data only
refined_grid_search.fit(X_train, y_train)

# best_estimator_ is already fitted on the full training set thanks to refit=True,
# so the extra stack_model2.fit(...) call that used to follow was a redundant
# (and expensive) second training pass and has been removed.
stack_model2 = refined_grid_search.best_estimator_

# Predict the target values on the test data using the tuned model
y_pred_final = stack_model2.predict(X_test)

# Evaluate the tuned model on the held-out split
print(f"Stack Model 2 Accuracy Score: {accuracy_score(y_test, y_pred_final)}")


In [None]:
# Classification report for the tuned stacking model.
# This must use y_pred_final (stack_model2's predictions); y_pred still holds the
# predictions of the untuned stack_model, so using it would repeat the earlier report.
report = classification_report(y_test, y_pred_final)
print('The Classification Report:')
print(report)

In [None]:
# Members of the voting ensemble as (label, estimator) pairs: the three single
# models plus both stacking models.
models = [
    ('dt', model_two),
    ('tmodel1', stack_model),
    ('tmodel2', stack_model2),
    ('adaboost', model),
    ('rforest', model_three)
]

# Build the voting classifier with its default settings and train it in one
# step (fit() returns the estimator itself).
ensemble_model = VotingClassifier(estimators=models).fit(X_train, y_train)

# Predictions of the ensemble on the held-out split
y_pred_ensemble = ensemble_model.predict(X_test)

# Report the ensemble's accuracy
print(f"Ensemble Model Accuracy Score: {accuracy_score(y_test, y_pred_ensemble)}")

In [None]:
# Classification report for the voting ensemble.
# This must use y_pred_ensemble; y_pred holds predictions from an earlier model,
# so using it would not describe the ensemble at all.
report = classification_report(y_test, y_pred_ensemble)
print('The Classification Report:')
print(report)

In [None]:

# Persist the fitted voting ensemble to disk. Only the model is written here;
# the label encoders are saved by a separate cell.
joblib.dump(value=ensemble_model, filename='lateness_prediction_model.pkl')


In [None]:
# Persist the dict of fitted LabelEncoders (keyed by column name) so the same
# category-to-integer mapping can be reused when loading the model later.
joblib.dump(value=label_encoders, filename='label_encoders.pkl')