In [None]:
# File: data_prep_and_modeling
# -----------------------------
# Objective:
# - Prepare the dataset for machine learning by encoding, scaling, and handling class imbalance.
# - Split the data into training and testing sets.
# - Lay the groundwork for predictive modeling.

In [2]:
# Import libraries
import pandas as pd

In [3]:
# Verify data

# Read the engineered dataset from the sibling data/ directory and report its shape
data = pd.read_csv(filepath_or_buffer='../data/engineered_dataset.csv')
print("Dataset Loaded. Shape:", data.shape, sep=" ")



Dataset Loaded. Shape: (1470, 36)


In [7]:
# Check every column for missing values
print("Missing Values:")
print(data.isnull().sum())

# Review the data structure.
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None" line.
print("\nDataset Information:")
data.info()

# Preview the first five rows
print("\nFirst five rows of the dataset: ")
print(data.head())

Missing Values:
Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
RoleStability               0
OverTime_Binary         

In [None]:
# Dataset Verification: Summary and Findings
# -------------------------------------------
# Overview:
# - The dataset contains 1470 rows and 36 columns.
# - No missing values were found, indicating a clean dataset.
# - Feature engineering has been successfully applied, and new features 
#   like RoleStability, OT_WorkLifeImpact, SeniorityImpact, and SatisfactionBalance are present.

# Data Types:
# - Numerical Features: 26 columns.
# - Categorical Features: 7 columns.

# Observations:
# - The dataset is ready for preprocessing steps such as encoding, scaling, 
#   and addressing class imbalance.
# - Categorical variables will require one-hot encoding for compatibility with machine learning models.

# Next Steps:
# 1. Encode Categorical Variables:
#    - Convert categorical columns (e.g., BusinessTravel, Department) into numerical 
#      representations using one-hot encoding.
#    - Ensure the dataset structure is preserved, and new columns are created as expected.
# 2. Scale Numerical Features:
#    - Normalize numerical columns to bring them into a consistent range.
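# 3. Address Class Imbalance:
#    - Handle the imbalance in Attrition using oversampling techniques like SMOTE.
#    - Apply oversampling to the training split only (i.e., after the split in step 4),
#      so that synthetic samples never leak into the test set.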
# 4. Split Dataset:
#    - Divide the dataset into training and testing sets to prepare for modeling.
