Import libraries

In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, precision_score, f1_score
from sklearn.ensemble import RandomForestClassifier 

Read the CSV file

In [23]:
# Remote location of the dataset (raw CSV hosted on GitHub).
url = (
    "https://raw.githubusercontent.com/Veicap/Assignment2-ML/"
    "refs/heads/main/cleaned_data_file.csv"
)
# Download and parse the CSV into the working DataFrame used by the cells below.
df = pd.read_csv(url)

In [25]:
# Replace the string-valued categorical columns with integer codes so the
# estimator used later can consume them.
categorical_columns = ['department', 'salary']

# Keep one fitted encoder per column. Re-fitting a single shared encoder would
# discard the 'department' mapping as soon as 'salary' is encoded, making it
# impossible to inverse_transform the department codes afterwards.
label_encoders = {}
for column in categorical_columns:
    column_encoder = LabelEncoder()
    df[column] = column_encoder.fit_transform(df[column])
    label_encoders[column] = column_encoder

# Backward compatibility: this name previously ended up holding the encoder
# fitted on 'salary' (the last column encoded), so keep it pointing there.
label_encoder = label_encoders['salary']

print(df)

       satisfaction_level  last_evaluation  number_project  \
0                    0.38             0.53               2   
1                    0.80             0.86               5   
2                    0.11             0.88               7   
3                    0.72             0.87               5   
4                    0.37             0.52               2   
...                   ...              ...             ...   
12254                0.40             0.47               2   
12255                0.43             0.46               2   
12256                0.89             0.88               5   
12257                0.76             0.83               6   
12258                0.37             0.48               2   

       average_montly_hours  time_spend_company  work_accident  left  \
0                157.000000            3.000000              0     1   
1                262.000000            6.000000              0     1   
2                272.000000            

In [26]:
# List the column labels of the encoded frame before choosing features and target.
print(df.columns)

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'work_accident', 'left',
       'promotion_last_5years', 'department', 'salary'],
      dtype='object')


In [27]:
# Target vector: the 'left' column.
y = df['left']
# Feature matrix: every remaining column.
X = df.drop(columns=['left'])

a. Split the data (85% training, 15% testing)

In [28]:
# (a) Hold out 15% of the rows as the test set; the fixed random_state keeps
# the split itself reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Permute the training rows (as plain NumPy arrays) before cross-validation.
# The permutation is drawn from a locally seeded generator: an unseeded
# np.random.permutation changes the row order on every run, which changes the
# contents of the KFold folds and therefore the reported CV scores.
X_train_shuffled, y_train_shuffled = X_train.values, y_train.values
indices = np.random.default_rng(42).permutation(len(X_train))
X_train_shuffled = X_train_shuffled[indices]
y_train_shuffled = y_train_shuffled[indices]

# 5-fold cross-validation on the training portion only.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

classifier = RandomForestClassifier(random_state=42)

# With no `scoring` argument, cross_val_score uses the estimator's default
# scorer.
cross_val_scores = cross_val_score(classifier, X_train_shuffled, y_train_shuffled, cv=kf)
print(f"Cross-validation scores: {cross_val_scores}")

# Fit on the full training split and evaluate once on the held-out test set.
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Precision: {precision}")
print(f"F1 Score: {f1}")

Cross-validation scores: [0.97792706 0.98464491 0.98080614 0.98704415 0.98512476]
Confusion Matrix:
[[1527    2]
 [  32  278]]
Precision: 0.9928571428571429
F1 Score: 0.9423728813559322


b. Split the data (75% training, 25% testing)

In [29]:
# (b) Hold out 25% of the rows as the test set; the fixed random_state keeps
# the split itself reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Permute the training rows (as plain NumPy arrays) before cross-validation.
# The permutation is drawn from a locally seeded generator: an unseeded
# np.random.permutation changes the row order on every run, which changes the
# contents of the KFold folds and therefore the reported CV scores.
X_train_shuffled, y_train_shuffled = X_train.values, y_train.values
indices = np.random.default_rng(42).permutation(len(X_train))
X_train_shuffled = X_train_shuffled[indices]
y_train_shuffled = y_train_shuffled[indices]

# 5-fold cross-validation on the training portion only.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

classifier = RandomForestClassifier(random_state=42)

# With no `scoring` argument, cross_val_score uses the estimator's default
# scorer.
cross_val_scores = cross_val_score(classifier, X_train_shuffled, y_train_shuffled, cv=kf)
print(f"Cross-validation scores: {cross_val_scores}")

# Fit on the full training split and evaluate once on the held-out test set.
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Precision: {precision}")
print(f"F1 Score: {f1}")

Cross-validation scores: [0.98477433 0.98477433 0.98205546 0.98368679 0.98258977]
Confusion Matrix:
[[2532    2]
 [  52  479]]
Precision: 0.9958419958419958
F1 Score: 0.9466403162055336


c. Split the data (65% training, 35% testing)

In [30]:
# (c) Hold out 35% of the rows as the test set; the fixed random_state keeps
# the split itself reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

# Permute the training rows (as plain NumPy arrays) before cross-validation.
# The permutation is drawn from a locally seeded generator: an unseeded
# np.random.permutation changes the row order on every run, which changes the
# contents of the KFold folds and therefore the reported CV scores.
X_train_shuffled, y_train_shuffled = X_train.values, y_train.values
indices = np.random.default_rng(42).permutation(len(X_train))
X_train_shuffled = X_train_shuffled[indices]
y_train_shuffled = y_train_shuffled[indices]

# 5-fold cross-validation on the training portion only.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

classifier = RandomForestClassifier(random_state=42)

# With no `scoring` argument, cross_val_score uses the estimator's default
# scorer.
cross_val_scores = cross_val_score(classifier, X_train_shuffled, y_train_shuffled, cv=kf)
print(f"Cross-validation scores: {cross_val_scores}")

# Fit on the full training split and evaluate once on the held-out test set.
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Precision: {precision}")
print(f"F1 Score: {f1}")

Cross-validation scores: [0.98117942 0.98306148 0.97992472 0.98305085 0.98179535]
Confusion Matrix:
[[3537    9]
 [  64  681]]
Precision: 0.9869565217391304
F1 Score: 0.9491289198606272
