In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error, r2_score, roc_auc_score

# Load the train and test splits. Both paths are relative to the notebook's
# working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Preview the first rows of each split as a quick sanity check on the load.
print(train_df.head())
print(test_df.head())

# `df` is a DataFrame built from the test split.
# NOTE(review): no visible cell reads `df` -- confirm whether it is still
# needed, and whether it was meant to wrap the train split instead.
df = pd.DataFrame(test_df)



In [None]:
# Print the per-column count of missing values, first for the train split
# and then for the test split.
for split_df in (train_df, test_df):
    print(split_df.isna().sum())


In [None]:
# Record where the train rows end *before* `train_df` is rebound at the
# bottom of this cell, so the split back into train/test does not depend on
# the order of the two final assignments.
n_train_rows = len(train_df)

# Stack train on top of test so each categorical column is encoded with one
# mapping shared by both splits.
combined_df = pd.concat([train_df, test_df], ignore_index=True)

# Object-dtype columns are the ones treated as categorical here.
categorical_columns = combined_df.select_dtypes(include=['object']).columns

# Fit one LabelEncoder per categorical column and keep it, keyed by column
# name, so the integer codes can be mapped back to the original labels.
# NOTE: re-running this cell after it has already executed finds no
# object-dtype columns left and resets `label_encoders` to an empty dict.
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    combined_df[column] = label_encoders[column].fit_transform(combined_df[column])

# Split back by row position. `.copy()` gives each split its own data so
# later assignments into `train_df` / `test_df` neither warn about writing to
# a slice nor touch `combined_df`. Index labels are left as in `combined_df`
# (the test split's labels start at `n_train_rows`).
train_df = combined_df.iloc[:n_train_rows].copy()
test_df = combined_df.iloc[n_train_rows:].copy()


In [None]:
# Standardise the predictors: the scaler's statistics come from the train
# split only and are then applied unchanged to the test split.
scaler = StandardScaler()

train_features = train_df.drop(columns=['Attrition'])
X_train_scaled = scaler.fit_transform(train_features)

test_features = test_df.drop(columns=['Attrition'])
X_test_scaled = scaler.transform(test_features)

# The target for both splits is the `Attrition` column.
y_train, y_test = train_df['Attrition'], test_df['Attrition']


In [None]:
# Fit a random forest on the scaled predictors; this model is used here only
# to rank the features by importance.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)

# Label each importance score with its column name (same column order as the
# frame the scaler was fitted on), then rank from most to least important.
predictor_names = train_df.drop(columns=['Attrition']).columns
feature_importances = pd.Series(rf.feature_importances_, index=predictor_names)
feature_importances = feature_importances.sort_values(ascending=False)

# Horizontal bar chart of the 20 highest-ranked features.
fig, ax = plt.subplots(figsize=(10, 8))
feature_importances.head(20).plot(kind='barh', ax=ax)
ax.set_xlabel('Feature Importance Score')
ax.set_ylabel('Features')
ax.set_title('Top 20 Important Features')
plt.show()

# Keep the names of the 10 highest-ranked features.
top_features = feature_importances.index[:10].tolist()

# Restrict both splits to those columns. Note these are taken from the
# unscaled frames, not from the scaled arrays.
X_train_selected = train_df[top_features]
X_test_selected = test_df[top_features]


In [None]:
# Refit a random forest classifier using only the selected features.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_selected, y_train)

# Hard class predictions for the test split.
y_pred_class = rf_classifier.predict(X_test_selected)

# Report the classification metrics on the test split.
test_confusion = confusion_matrix(y_test, y_pred_class)
print("Confusion Matrix:")
print(test_confusion)

test_report = classification_report(y_test, y_pred_class)
print("\nClassification Report:")
print(test_report)

test_accuracy = accuracy_score(y_test, y_pred_class)
print("\nAccuracy Score:", test_accuracy)

# ROC AUC is scored on the predicted probability in column 1 of
# `predict_proba`, i.e. the second entry of `rf_classifier.classes_`.
second_class_proba = rf_classifier.predict_proba(X_test_selected)[:, 1]
test_roc_auc = roc_auc_score(y_test, second_class_proba)
print("\nROC AUC Score:", test_roc_auc)


In [None]:
# Fit a random forest regressor on the selected features and predict the
# test split.
# NOTE(review): the target is the same `Attrition` column the classifier is
# trained on; if that column holds class labels, MSE and R-squared are being
# computed on class codes -- confirm this is intended.
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train_selected, y_train)
y_pred_reg = rf_regressor.predict(X_test_selected)

# Score the regression predictions against the test target.
mse, r2 = mean_squared_error(y_test, y_pred_reg), r2_score(y_test, y_pred_reg)

print("\nMean Squared Error:", mse)
print("R-squared Score:", r2)
