# Feature Engineering Mini Project — IBM HR Dataset

This notebook is designed to run on Google Colab or in a local Jupyter installation. Run the cells in order, from top to bottom.

In [None]:
# If running on Google Colab, uncomment and run one of the following blocks.
# Option A: Upload files directly to the session
# from google.colab import files
# uploaded = files.upload()  # then use the uploaded filename in the next cell

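# Option B: Mount Google Drive and read from Drive
# (pandas is only imported in the next cell, so import it here if you use this option)
# import pandas as pd
# from google.colab import drive
# drive.mount('/content/drive')
# df = pd.read_csv('/content/drive/MyDrive/path/to/WA_Fn-UseC_-HR-Employee-Attrition.csv')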


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Default path included in this project archive
csv_path = 'data/WA_Fn-UseC_-HR-Employee-Attrition.csv'
print('Attempting to load:', csv_path)
try:
    df = pd.read_csv(csv_path)
    print('Loaded from data folder, shape:', df.shape)
except Exception as e:
    # Best-effort fallback: any failure on the default path (missing file,
    # unreadable file, parse error) sends us looking for another CSV.
    print('Failed to load from data folder:', e)
    import os
    # os.listdir() order is arbitrary; sort so the same file is picked on every run.
    session_csvs = sorted(f for f in os.listdir('.') if f.lower().endswith('.csv'))
    if session_csvs:
        fallback_csv = session_csvs[0]
        print('Found CSV in session:', fallback_csv)
        df = pd.read_csv(fallback_csv)
        print('Loaded from session, shape:', df.shape)
    elif 'df' in globals():
        # A dataframe may already exist, e.g. one read from Google Drive in the
        # optional setup cell; keep it rather than failing.
        print('No CSV found in session; keeping the dataframe already stored in df.')
    else:
        # Without this, the cell would die below with an unrelated NameError on df.
        raise FileNotFoundError(
            "Could not load '%s' and found no .csv file in the working directory. "
            "Upload the dataset or fix csv_path, then re-run this cell." % csv_path
        ) from e

# Quick peek
print(df.columns.tolist())
df.head()

In [None]:
# Example cleaning steps (run after loading df)
def clean_hr_frame(frame):
    """Return a cleaned copy of ``frame``; the input frame is not modified.

    Steps, in order:
      1. Standardize column names: strip, lowercase, spaces -> underscores.
      2. Drop fully duplicated rows.
      3. Treat ``age`` values above 100 as missing.
      4. Impute numeric columns with their median and text columns with their mode.

    Parameters
    ----------
    frame : pandas.DataFrame
        Raw dataframe as loaded from the CSV.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned dataframe.
    """
    cleaned = frame.copy()
    # str(c) keeps this working for non-string column labels.
    cleaned.columns = [str(c).strip().lower().replace(' ', '_') for c in cleaned.columns]
    cleaned = cleaned.drop_duplicates()

    if 'age' in cleaned.columns:
        # mask() upcasts an integer column to float when it has to insert NaN,
        # unlike .loc assignment, which is a deprecated dtype-changing setitem.
        cleaned['age'] = cleaned['age'].mask(cleaned['age'] > 100)

    # Numeric columns: fill gaps with the median. Assign the result back instead
    # of calling fillna(inplace=True) on the selected column, which updates a
    # temporary object and is a no-op under pandas Copy-on-Write.
    for col in cleaned.select_dtypes(include=[np.number]).columns:
        cleaned[col] = cleaned[col].fillna(cleaned[col].median())

    # Text columns (object dtype or pandas' dedicated string dtype): fill gaps
    # with the most frequent value.
    # NOTE(review): this also imputes the 'attrition' target if it has gaps;
    # dropping such rows may be preferable - confirm the intent.
    text_cols = [
        col for col, dtype in cleaned.dtypes.items()
        if pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)
    ]
    for col in text_cols:
        modes = cleaned[col].mode()
        # mode() is empty for an all-missing column; leave that column untouched.
        if not modes.empty:
            cleaned[col] = cleaned[col].fillna(modes.iloc[0])

    return cleaned


if 'df' in globals():
    df = clean_hr_frame(df)
    print('After cleaning shape:', df.shape)
    preview = df.head()
else:
    print('Dataframe df not found. Load dataset first.')
    preview = None

# Shown as the cell output when a dataframe was cleaned; nothing is shown for None.
preview

In [None]:
# Feature engineering examples
def engineer_features(frame, ohe_candidates=('jobrole', 'department', 'maritalstatus',
                                             'overtime', 'educationfield', 'gender')):
    """Return a copy of ``frame`` with engineered features added.

    Parameters
    ----------
    frame : pandas.DataFrame
        Dataframe whose column names are already lowercased.
    ohe_candidates : iterable of str, optional
        Columns to one-hot encode; names not present in ``frame`` are skipped.

    Returns
    -------
    pandas.DataFrame
        New dataframe with:
          * ``attrition_flag``: 1 for "yes", 0 for "no" (only if ``attrition`` exists);
          * the selected categorical columns replaced by dummy columns
            (first level of each dropped);
          * ``monthlyincome_log``: log1p of ``monthlyincome`` (only if it exists).
    """
    engineered = frame.copy()

    # Create attrition flag. Normalize whitespace and case first so labels such
    # as ' yes ' are not silently mapped to NaN; anything else still becomes NaN.
    if 'attrition' in engineered.columns:
        labels = engineered['attrition'].astype(str).str.strip().str.lower()
        engineered['attrition_flag'] = labels.map({'yes': 1, 'no': 0})

    # One-hot encode a few categorical columns. The source columns are removed
    # by get_dummies, so running this on an already-encoded frame skips the step.
    present = [c for c in ohe_candidates if c in engineered.columns]
    if present:
        engineered = pd.get_dummies(engineered, columns=present, drop_first=True)

    # Log transform monthlyincome if present
    if 'monthlyincome' in engineered.columns:
        engineered['monthlyincome_log'] = np.log1p(engineered['monthlyincome'])

    return engineered


if 'df' in globals():
    df = engineer_features(df)
    print('After feature engineering, shape:', df.shape)
    preview = df.head()
else:
    print('Dataframe df not found. Load dataset first.')
    preview = None

# Shown as the cell output when features were built; nothing is shown for None.
preview

In [None]:
# Visualizations examples
import matplotlib.pyplot as plt
def department_labels(frame, column='department'):
    """Return one category label per row for ``column``, or None if unavailable.

    If ``column`` exists it is returned as is. Otherwise the labels are rebuilt
    from one-hot columns named ``<column>_<level>``. Rows where no such column
    is set get the placeholder '(baseline category)', since a level dropped by
    ``drop_first=True`` leaves no column carrying its name.

    NOTE(review): assumes every column starting with ``<column>_`` is a 0/1
    dummy of ``column`` - confirm if other columns share that prefix.
    """
    if column in frame.columns:
        return frame[column]
    prefix = column + '_'
    dummy_cols = [c for c in frame.columns if str(c).startswith(prefix)]
    if not dummy_cols:
        return None
    labels = pd.Series('(baseline category)', index=frame.index, dtype=object)
    for c in dummy_cols:
        # Positional boolean mask: avoids index alignment on non-contiguous indexes.
        labels.loc[frame[c].astype(bool).to_numpy()] = str(c)[len(prefix):]
    return labels


if 'df' in globals():
    if 'attrition' in df.columns:
        counts = df['attrition'].value_counts()
        fig, ax = plt.subplots(figsize=(6, 4))
        ax.bar(counts.index.astype(str), counts.values)
        ax.set_title('Attrition Count')
        ax.set_xlabel('Attrition')
        ax.set_ylabel('Employees')
        plt.show()

        # 'department' is one-hot encoded by the feature engineering cell, so
        # when cells run in order the raw column no longer exists here.
        dept_series = department_labels(df)
        if dept_series is None:
            print("Skipping 'Attrition by Department': no department column found.")
        else:
            dept = (
                pd.DataFrame({'department': dept_series, 'attrition': df['attrition']})
                .groupby(['department', 'attrition'])
                .size()
                .unstack(fill_value=0)
            )
            if dept.empty:
                print("Skipping 'Attrition by Department': no rows to plot.")
            else:
                positions = list(range(len(dept.index)))
                # Split a 0.8-wide slot per department evenly between the
                # attrition levels (two levels -> bars at -0.2 / +0.2, width 0.4).
                bar_width = 0.8 / dept.shape[1]
                fig, ax = plt.subplots(figsize=(8, 5))
                for k, level in enumerate(dept.columns):
                    centers = [p - 0.4 + bar_width * (k + 0.5) for p in positions]
                    ax.bar(centers, dept[level], width=bar_width, label=str(level))
                ax.set_xticks(positions)
                ax.set_xticklabels(dept.index, rotation=30)
                ax.set_title('Attrition by Department')
                ax.set_ylabel('Employees')
                ax.legend(title='Attrition')
                plt.show()
    if 'age' in df.columns:
        fig, ax = plt.subplots(figsize=(6, 4))
        ax.hist(df['age'].dropna(), bins=20)
        ax.set_title('Age Distribution')
        ax.set_xlabel('Age')
        ax.set_ylabel('Employees')
        plt.show()
else:
    print('Dataframe df not found. Load dataset first.')


In [None]:
# Optionally persist the processed dataframe as a CSV file in the session directory.
if 'df' not in globals():
    # Nothing to write: no dataframe has been loaded yet.
    print('df not found')
else:
    out_path = 'feature_engineered_ibm_hr_colab.csv'
    # index=False keeps the row index out of the file.
    df.to_csv(out_path, index=False)
    print('Saved processed CSV to', out_path)
