In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [8]:
# Load the raw attrition dataset. `pd` comes from the notebook's import cell,
# so it is not re-imported here. The path is relative: the CSV is expected to
# sit in the kernel's working directory.
DATA_PATH = "IBM_HR_Attrition.csv"

data = pd.read_csv(DATA_PATH)



In [9]:
# Fill missing values in every column with that column's most frequent value.
# NOTE(review): the imputer is fitted on the whole frame -- target column
# included -- before the train/test split further down, so statistics from the
# future test rows leak into the imputed values. Confirm this is acceptable.
imputer = SimpleImputer(strategy='most_frequent')

# fit_transform() returns a bare array (typically object dtype for a frame that
# mixes strings and numbers), so column labels, the index and the per-column
# dtypes all have to be put back explicitly.
data_imputed = pd.DataFrame(
    imputer.fit_transform(data),
    columns=data.columns,
    index=data.index,
).astype(data.dtypes.to_dict())


In [10]:
# String-valued columns are detected on the raw frame and one-hot encoded from
# the imputed frame. drop='first' omits one indicator per feature, which is why
# the binary target ends up as the single column 'Attrition_Yes'.
cat_columns = data.select_dtypes(include=['object']).columns

# The `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and
# later removed. Keeping the default (sparse) output and densifying it here
# works on every version.
one_hot_encoder = OneHotEncoder(drop='first')
encoded_matrix = one_hot_encoder.fit_transform(data_imputed[cat_columns]).toarray()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name only on releases that
# predate the new one.
if hasattr(one_hot_encoder, 'get_feature_names_out'):
    encoded_names = one_hot_encoder.get_feature_names_out(cat_columns)
else:
    encoded_names = one_hot_encoder.get_feature_names(cat_columns)

# Reuse the imputed frame's index so the column-wise concat aligns row by row.
encoded_data = pd.DataFrame(
    encoded_matrix,
    columns=encoded_names,
    index=data_imputed.index,
)

# Swap the raw categorical columns for their encoded indicators without
# mutating the frame in place.
# NOTE: `data_imputed` is rebound here, so re-running this cell requires
# re-running the imputation cell first (the categorical columns are gone).
data_imputed = pd.concat(
    [data_imputed.drop(columns=cat_columns), encoded_data],
    axis=1,
)



In [11]:
# Target: the one-hot indicator produced for the attrition column.
# Features: every remaining column of the encoded frame.
y = data_imputed['Attrition_Yes']
X = data_imputed.drop(columns=['Attrition_Yes'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Random forest with library-default hyperparameters; only the seed is pinned
# so repeated runs build the same trees.
rf_classifier = RandomForestClassifier(random_state=42)

# Train on the training partition only.
rf_classifier.fit(X=X_train, y=y_train)

In [13]:
# Predicted class labels for the held-out rows.
y_pred = rf_classifier.predict(X=X_test)

In [14]:
# Score the held-out predictions with each metric in turn; the unpacking order
# matches the order of the scorer tuple.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

In [15]:
# Report the held-out scores, one metric per line under a header.
print("Random Forest Classifier Performance:")
for label, score in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, score)

Random Forest Classifier Performance:
Accuracy: 0.8775510204081632
Precision: 0.8
Recall: 0.10256410256410256
F1 Score: 0.18181818181818182
