In [73]:
# Uncomment the next line to install CatBoost if it is missing from the environment.
# ! pip install catboost

In [74]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report




In [75]:
# Load the employee dataset (path is relative to the notebook's working directory).
data = pd.read_csv('../Dataset/employees.csv')

# Drop identifier and date columns that are not used as features.
# Reassigning (instead of inplace=True) keeps the step free of in-place mutation.
data = data.drop(
    columns=[
        'Employee_No',
        'Employee_Code',
        'Name',
        'Date_Resigned',
        'Date_Joined',
        'Inactive_Date',
    ]
)

# Training subset: only rows whose Marital_Status is known. `data` itself keeps the
# rows with a missing Marital_Status so they can be imputed later.
# NOTE(review): the previewed rows show the literal string '\N' in Reporting_emp_1/2 --
# presumably a NULL export marker that pandas reads as text, not NaN; confirm intended.
data_train = data.dropna(subset=['Marital_Status'])

In [76]:
# Snapshot the frame as it is at this point (columns already dropped, Marital_Status
# not yet imputed) so later in-place changes to `data` do not affect it.
data_copy = data.copy()

In [77]:
# (rows, columns) of the frame after the column drop
data.shape

(997, 13)

In [78]:
# Preview the first rows of the training subset (rows with a known Marital_Status)
data_train.head()

Unnamed: 0,Title,Gender,Religion_ID,Marital_Status,Designation_ID,Status,Reporting_emp_1,Reporting_emp_2,Employment_Category,Employment_Type,Religion,Designation,Year_of_Birth
0,Mr,Male,1,Married,24,Active,\N,\N,Staff,Permanant,Buddhist,Driver,1965
1,Mr,Male,1,Married,24,Active,\N,\N,Staff,Permanant,Buddhist,Driver,1973
2,Mr,Male,1,Married,23,Inactive,\N,\N,Staff,Permanant,Buddhist,Account Clerk,1974
3,Ms,Female,4,Married,40,Inactive,\N,\N,Staff,Permanant,Catholic,Purchasing Officer,1974
4,Mr,Male,1,Married,7,Active,\N,\N,Staff,Permanant,Buddhist,Store Keeper,1980


In [79]:
# Count missing values per column in the training subset
data_train.isnull().sum()

Title                  0
Gender                 0
Religion_ID            0
Marital_Status         0
Designation_ID         0
Status                 0
Reporting_emp_1        0
Reporting_emp_2        0
Employment_Category    0
Employment_Type        0
Religion               0
Designation            0
Year_of_Birth          0
dtype: int64

In [80]:
# Target (y) is the column to be imputed; features (X) are all remaining columns
y = data_train['Marital_Status']
X = data_train.drop(columns=['Marital_Status'])


In [81]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified -- consider stratify=y if class balance matters.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [82]:
# Sanity-check the sizes of the train/test partitions
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((744, 12), (186, 12), (744,), (186,))

In [83]:
# Configure the CatBoost classifier used to predict Marital_Status
model = CatBoostClassifier(
    loss_function='Logloss',
    iterations=100,
    learning_rate=0.1,
    depth=6,
)

In [84]:
# Train the classifier, declaring every object-dtype column as a categorical feature.
# NOTE(review): ID-like columns that are not object dtype (e.g. Religion_ID, Designation_ID
# appear numeric in the preview) are passed as numeric features -- confirm intended.
model.fit(
    X_train,
    y_train,
    cat_features=X_train.select_dtypes(include='object').columns.tolist(),
)

0:	learn: 0.6596339	total: 10.4ms	remaining: 1.02s
1:	learn: 0.6333796	total: 35.1ms	remaining: 1.72s
2:	learn: 0.6097135	total: 66ms	remaining: 2.13s
3:	learn: 0.5897033	total: 93.9ms	remaining: 2.25s
4:	learn: 0.5728385	total: 104ms	remaining: 1.98s
5:	learn: 0.5570585	total: 130ms	remaining: 2.03s
6:	learn: 0.5431224	total: 155ms	remaining: 2.06s
7:	learn: 0.5298011	total: 185ms	remaining: 2.13s
8:	learn: 0.5206983	total: 212ms	remaining: 2.14s
9:	learn: 0.5123542	total: 220ms	remaining: 1.98s
10:	learn: 0.5052597	total: 249ms	remaining: 2.01s
11:	learn: 0.4969067	total: 274ms	remaining: 2.01s
12:	learn: 0.4908292	total: 300ms	remaining: 2s
13:	learn: 0.4860390	total: 326ms	remaining: 2s
14:	learn: 0.4793273	total: 348ms	remaining: 1.97s
15:	learn: 0.4756839	total: 360ms	remaining: 1.89s
16:	learn: 0.4715258	total: 381ms	remaining: 1.86s
17:	learn: 0.4680979	total: 417ms	remaining: 1.9s
18:	learn: 0.4663680	total: 427ms	remaining: 1.82s
19:	learn: 0.4617820	total: 449ms	remaining: 1

<catboost.core.CatBoostClassifier at 0x150a1c8a510>

In [85]:
# Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Overall accuracy on the test split
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision / recall / F1 for the fitted model
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.85
              precision    recall  f1-score   support

     Married       0.86      0.92      0.89       124
      Single       0.81      0.71      0.76        62

    accuracy                           0.85       186
   macro avg       0.84      0.81      0.82       186
weighted avg       0.85      0.85      0.85       186



In [86]:
# NOTE: Disabled hyperparameter search, kept for reference only. Nothing in this cell
# executes, so the `model` fitted above is the one used for the imputation below.

# import pandas as pd
# from catboost import CatBoostClassifier
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.metrics import accuracy_score, classification_report

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Initialize CatBoost Classifier
# model = CatBoostClassifier()

# # Define the hyperparameter grid for tuning
# param_grid = {
#     'iterations': [100, 200, 300],
#     'depth': [4, 6, 8],
#     'learning_rate': [0.05, 0.1, 0.2]
# }

# # Initialize GridSearchCV with CatBoost and the hyperparameter grid
# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy', verbose=2)

# # Fit the GridSearchCV to find the best hyperparameters
# grid_search.fit(X_train, y_train, cat_features=list(X_train.select_dtypes(include=['object']).columns))

# # Get the best hyperparameters
# best_params = grid_search.best_params_
# print("Best Hyperparameters:", best_params)

# # Use the best model from GridSearchCV
# best_model = grid_search.best_estimator_

# # Make predictions
# y_pred = best_model.predict(X_test)

# # Evaluate the best model
# accuracy = accuracy_score(y_test, y_pred)
# print(f'Accuracy: {accuracy:.2f}')

# # Generate classification report for the best model
# print(classification_report(y_test, y_pred))

In [87]:
# Rows whose Marital_Status is still missing are the ones to impute
missing_marital_status = data.loc[data['Marital_Status'].isna()]

# Same feature layout as the training data: everything except the target column
X_missing = missing_marital_status.drop(columns=['Marital_Status'])

# Predicted labels, one per missing row
imputed_marital_status = model.predict(X_missing)

In [88]:
# Write the predicted labels into the rows where Marital_Status is missing.
# This mutates `data` in place: once it has run, the mask selects no rows on a re-run.
data.loc[data['Marital_Status'].isna(), 'Marital_Status'] = imputed_marital_status


In [89]:
# Preview the frame after the missing Marital_Status values have been filled in
data.head()

Unnamed: 0,Title,Gender,Religion_ID,Marital_Status,Designation_ID,Status,Reporting_emp_1,Reporting_emp_2,Employment_Category,Employment_Type,Religion,Designation,Year_of_Birth
0,Mr,Male,1,Married,24,Active,\N,\N,Staff,Permanant,Buddhist,Driver,1965
1,Mr,Male,1,Married,24,Active,\N,\N,Staff,Permanant,Buddhist,Driver,1973
2,Mr,Male,1,Married,23,Inactive,\N,\N,Staff,Permanant,Buddhist,Account Clerk,1974
3,Ms,Female,4,Married,40,Inactive,\N,\N,Staff,Permanant,Catholic,Purchasing Officer,1974
4,Mr,Male,1,Married,7,Active,\N,\N,Staff,Permanant,Buddhist,Store Keeper,1980


In [90]:
# Print the Marital_Status column after imputation
print(data['Marital_Status'])

0      Married
1      Married
2      Married
3      Married
4      Married
        ...   
992     Single
993     Single
994     Single
995     Single
996    Married
Name: Marital_Status, Length: 997, dtype: object


In [91]:
# Print the distinct Marital_Status values; a remaining NaN would show up here
print(data['Marital_Status'].unique())

['Married' 'Single']


In [92]:
# (rows, columns) of the frame after imputation
data.shape

(997, 13)

In [93]:
# (rows, columns) of the earlier snapshot, for comparison with `data`
data_copy.shape

(997, 13)

In [96]:
# Persist the imputed frame as CSV; index=False leaves the row index out of the file.
# The target directory is not created here, so it must already exist.
data.to_csv('../Dataset/pre_processed/employees_Marital_Status_imputed.csv', index=False)
