In [None]:
# Import necessary libraries 
from google.colab import files
import io
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# ---------------------------------------------
# Step 1: Upload Dataset
# ---------------------------------------------
# Open the upload prompt. `uploaded` is indexed by file name below and each
# value is wrapped in BytesIO, i.e. it is treated as the file's raw bytes.
uploaded = files.upload()

# Fail early with a clear message: without this guard an empty/cancelled upload
# leaves `df` undefined and the script dies later with an unrelated NameError.
if not uploaded:
    raise ValueError("No file was uploaded; please upload the dataset CSV.")

# Load the dataset into a DataFrame.
# Only a single DataFrame is used downstream, so parse just the file that ends
# up in `df` (the last uploaded one) instead of parsing every upload and
# silently discarding all but the last result.
fn = list(uploaded)[-1]
if len(uploaded) > 1:
    print(f"Multiple files uploaded; using the last one: {fn}")
df = pd.read_csv(io.BytesIO(uploaded[fn]))
print(f"Loaded file: {fn}")

# ---------------------------------------------
# Step 2: Data Overview
# ---------------------------------------------
# Structural summary of the loaded DataFrame. Each entry pairs the heading
# text with the value printed right after it (print() joins them with a space).
overview_items = [
    ("Dataset Shape:", df.shape),
    ("\nColumn Names:", df.columns.tolist()),
    ("\nData Types:\n", df.dtypes),
    ("\nFirst 5 Rows:\n", df.head(5)),
    ("\nMissing Values:\n", df.isnull().sum()),
]
for heading, value in overview_items:
    print(heading, value)
print()  # blank line after the missing-value counts

# ---------------------------------------------
# Step 3: Exploratory Data Analysis
# ---------------------------------------------

# 3.1 Attrition Count
# Bar chart of how many rows fall into each 'Attrition' value, drawn on an
# explicitly created figure/axes pair.
attrition_fig, attrition_ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Attrition', ax=attrition_ax)
attrition_ax.set_title('Attrition Count')
attrition_fig.tight_layout()
plt.show()
print()  # blank line after the Attrition Count plot

# 3.2 Correlation Heatmap for Numeric Features
# Pairwise correlations are only computed for the numeric columns.
numeric_df = df.select_dtypes(include='number')
corr_matrix = numeric_df.corr()

# Annotated heatmap of the correlation matrix (values shown with 2 decimals).
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=heatmap_ax)
heatmap_ax.set_title("Correlation Heatmap")
heatmap_fig.tight_layout()
plt.show()

# 3.3 Monthly Income by Attrition
# Mean of 'MonthlyIncome' within each 'Attrition' group.
mean_income_by_attrition = df.groupby('Attrition')['MonthlyIncome'].mean()
print("\nAverage Monthly Income by Attrition:\n", mean_income_by_attrition)
print()  # blank line after the Monthly Income output

# 3.4 Distributions of Key Features
# One figure with three side-by-side panels: Age, Gender, Department.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Age Distribution (histogram with 30 bins, no KDE overlay)
sns.histplot(df['Age'], bins=30, kde=False, color='skyblue', ax=axes[0])
axes[0].set_title('Distribution of Employees by Age')
axes[0].set_xlabel('Age')
axes[0].set_ylabel('Count')

# Gender and Department Distributions: both are plain count plots that differ
# only in column, palette and target panel, so they share one loop.
# NOTE(review): newer seaborn releases may warn when `palette` is passed
# without `hue` -- confirm against the installed seaborn version.
for panel, column, palette_name in (
    (axes[1], 'Gender', 'Set2'),
    (axes[2], 'Department', 'Set3'),
):
    sns.countplot(x=column, data=df, palette=palette_name, ax=panel)
    panel.set_title(f'Distribution of Employees by {column}')
    panel.set_xlabel(column)
    panel.set_ylabel('Count')

# Department tick labels are rotated so they do not overlap.
axes[2].tick_params(axis='x', rotation=45)

# Spacing between the panels comes from tight_layout(). The former print("\n")
# calls between the plotting statements only wrote blank lines to stdout before
# the figure was rendered and did not affect the subplot layout, so they are gone.
plt.tight_layout()
plt.show()
print()  # blank line after all distributions

# ---------------------------------------------
# Step 4: Data Preprocessing
# ---------------------------------------------
# Work on a copy so the original `df` stays available for inspection.
df_clean = df.copy()

# Drop irrelevant columns first, so no effort is spent encoding columns that
# are discarded anyway. (`columns=` already implies the axis; a missing column
# still raises KeyError so a schema mismatch is noticed.)
columns_to_drop = ['EmployeeCount', 'Over18', 'StandardHours', 'EmployeeNumber']
df_clean.drop(columns=columns_to_drop, inplace=True)

# Encode categorical variables.
# Each object-dtype column gets its own fitted LabelEncoder, stored by column
# name. Refitting one shared encoder would keep only the last column's mapping,
# making it impossible to decode the integer codes of any other column
# (e.g. via label_encoders[col].inverse_transform or .classes_).
label_encoders = {}
for col in df_clean.select_dtypes(include='object').columns:
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col])
    label_encoders[col] = le

# ---------------------------------------------
# Step 5: Train-Test Split
# ---------------------------------------------
# Features are every column except the target; the target is 'Attrition'.
X = df_clean.drop(columns='Attrition')
y = df_clean['Attrition']

# Hold out 20% of the rows for testing with a fixed seed for reproducibility.
# stratify=y keeps the class proportions of the target the same in both
# partitions, so the evaluation is not skewed by an unlucky split of a
# minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ---------------------------------------------
# Step 6: Model Training with Random Forest
# ---------------------------------------------
# Build the random forest with a fixed seed and fit it on the training
# partition in one expression (fit() returns the fitted estimator itself).
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# ---------------------------------------------
# Step 7: Model Evaluation
# ---------------------------------------------
# Predict on the held-out rows, then derive both metrics before reporting them.
y_pred = model.predict(X_test)
report_text = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("\nClassification Report:\n", report_text)
print()  # blank line after the classification report
print("Model Accuracy:", accuracy)

# ---------------------------------------------
# Step 8: Feature Importance Visualization
# ---------------------------------------------
# Pair each fitted importance score with its feature (column) name.
importances = pd.Series(model.feature_importances_, index=X.columns)

# nlargest() returns the scores in descending order, but a horizontal bar chart
# draws the first row at the bottom. Reverse the selection so the most
# important feature is the top bar and the chart reads as a ranking.
top_features = importances.nlargest(10).iloc[::-1]
importance_ax = top_features.plot(kind='barh', color='teal')
importance_ax.set_xlabel("Importance")
plt.title("Top 10 Important Features")
plt.tight_layout()
plt.show()
