In [31]:
# %% [markdown]
# ## 1. Import Libraries and Set Up MySQL Connection
#
# Import the required libraries and set up the MySQL connection using SQLAlchemy.

import pandas as pd
import numpy as np
from sqlalchemy import create_engine

# MySQL connection details
username = 'root'
password = '12345'
host = 'localhost'
port = '3306'
database = 'case3'
engine = create_engine(f'mysql+pymysql://{username}:{password}@{host}:{port}/{database}')

print("MySQL engine created successfully!")


MySQL engine created successfully!


In [32]:
# %% [markdown]
# ## 2. Read CSV Files
#
# Load the three CSV files: Employee Data, Attrition Data, and Employee Performance Data.
# Note: In the employee data, we ignore the lower-case 'gender' column and keep the 'Gender' column.

# Read CSV files
employee_df = pd.read_csv('employee_data 1.csv')
attrition_df = pd.read_csv('Attrition 1.csv')
performance_df = pd.read_csv('employee_performance_data 1.csv')

# Display initial shapes
print("Employee Data Shape:", employee_df.shape)
print("Attrition Data Shape:", attrition_df.shape)
print("Performance Data Shape:", performance_df.shape)

# Drop lower-case 'gender' column if it exists; keep only the 'Gender' column.
if 'gender' in employee_df.columns:
    employee_df = employee_df.drop(columns=['gender'])
print("Columns in employee_df after dropping 'gender':", employee_df.columns.tolist())


Employee Data Shape: (1000, 12)
Attrition Data Shape: (144, 3)
Performance Data Shape: (1000, 6)
Columns in employee_df after dropping 'gender': ['Employee_ID', 'Age', 'first_name', 'last_name', 'Gender', 'Department', 'Job_Role', 'Education_Level', 'Marital_Status', 'Job_Tenure', 'Distance_From_Home']


In [33]:
# %% [markdown]
# ## 3. Validate Primary Key Uniqueness
#
# Ensure that Employee_ID is unique in each dataset for data integrity.

# Check uniqueness in employee data
if employee_df['Employee_ID'].is_unique:
    print("Employee_ID is unique in employee_data.")
else:
    print("Employee_ID has duplicates in employee_data.")

# Standardize column name in attrition data and check uniqueness
attrition_df.rename(columns={'employee_ID': 'Employee_ID'}, inplace=True)
if attrition_df['Employee_ID'].is_unique:
    print("Employee_ID is unique in attrition_data.")
else:
    print("Employee_ID has duplicates in attrition_data.")

# Check uniqueness in performance data
if performance_df['Employee_ID'].is_unique:
    print("Employee_ID is unique in performance_data.")
else:
    print("Employee_ID has duplicates in performance_data.")


Employee_ID is unique in employee_data.
Employee_ID has duplicates in attrition_data.
Employee_ID is unique in performance_data.


In [34]:
# %% [markdown]
# ## 4. Merge Datasets and Remove Incomplete Records
#
# Merge the employee and performance data first, then join with attrition data.
# Finally, drop rows missing either `attrition` or `Exit_Interview_Score` since these records aren’t useful for analysis.

# Merge employee and performance data on Employee_ID (inner join)
emp_perf_df = pd.merge(employee_df, performance_df, on='Employee_ID', how='inner')
print("Shape after merging employee and performance data:", emp_perf_df.shape)

# Merge with attrition data (left join)
full_df = pd.merge(emp_perf_df, attrition_df, on='Employee_ID', how='left')
print("Shape before dropping incomplete records:", full_df.shape)

# Drop rows with missing values for 'attrition' or 'Exit_Interview_Score'
full_df = full_df.dropna(subset=['attrition', 'Exit_Interview_Score'])
print("Shape after dropping rows with missing attrition or exit interview score:", full_df.shape)


Shape after merging employee and performance data: (1000, 16)
Shape before dropping incomplete records: (1017, 18)
Shape after dropping rows with missing attrition or exit interview score: (144, 18)


In [35]:
# %% [markdown]
# ## 5. Build Star Schema without dim_time and with Separate Department/Role Dimensions
#
# Create the fact table and dimension tables. The changes include:
#
# - **Fact Table (`fact_employee_performance`):** Contains performance metrics, attrition, exit interview score, and references to department and role via their surrogate keys.
# - **Employee Dimension (`dim_employee`):** Contains personal attributes without department or job role.
# - **Department Dimension (`dim_department`):** Contains unique departments with a surrogate key.
# - **Role Dimension (`dim_role`):** Contains unique job roles with a surrogate key.
#
# The fact table is updated to merge department and role IDs from their respective dimension tables.

# Create Fact Table with performance metrics and attrition data
fact_table = full_df[['Employee_ID', 'Performance_Rating', 'Last_Promotion_Year', 
                        'Training_Hours', 'Work_Life_Balance', 'Job_Satisfaction', 
                        'attrition', 'Exit_Interview_Score']].copy()

# Create Employee Dimension (exclude department and job role)
dim_employee = full_df[['Employee_ID', 'Age', 'first_name', 'last_name', 'Gender', 
                          'Education_Level', 'Marital_Status', 'Job_Tenure', 'Distance_From_Home']].drop_duplicates()

# Merge first and last names into a single column 'name'
dim_employee['Name'] = dim_employee['first_name'] + ' ' + dim_employee['last_name']
dim_employee = dim_employee.drop(columns=['first_name', 'last_name'])

print("Employee Dimension Shape:", dim_employee.shape)

# Create Department Dimension: Unique departments with surrogate key
dim_department = full_df[['Department']].drop_duplicates().reset_index(drop=True)
dim_department['Department_ID'] = dim_department.index + 1
print("Department Dimension Shape:", dim_department.shape)

# Create Role Dimension: Unique job roles with surrogate key
dim_role = full_df[['Job_Role']].drop_duplicates().reset_index(drop=True)
dim_role['Role_ID'] = dim_role.index + 1
print("Role Dimension Shape:", dim_role.shape)

# Merge Department and Role info into fact table:
# First, add the original department and job role columns to fact table for the lookup.
fact_table = pd.merge(fact_table, full_df[['Employee_ID', 'Department', 'Job_Role']], on='Employee_ID', how='left')

# Merge department ID from dim_department
fact_table = pd.merge(fact_table, dim_department, on='Department', how='left')

# Merge role ID from dim_role
fact_table = pd.merge(fact_table, dim_role, on='Job_Role', how='left')

# Remove redundant text columns (Department and Job_Role) after merging IDs
fact_table.drop(columns=['Department', 'Job_Role'], inplace=True)

print("Fact Table Shape:", fact_table.shape)


Employee Dimension Shape: (127, 8)
Department Dimension Shape: (6, 2)
Role Dimension Shape: (6, 2)
Fact Table Shape: (178, 10)


In [36]:
# %% [markdown]
# ## 6. Load Tables to MySQL
#
# Finally, load the fact and dimension tables into MySQL.
# Table names are converted to lower-case during the load process.

# Load tables into MySQL
fact_table.to_sql('fact_employee_performance', con=engine, if_exists='replace', index=False)
dim_employee.to_sql('dim_employee', con=engine, if_exists='replace', index=False)
dim_department.to_sql('dim_department', con=engine, if_exists='replace', index=False)
dim_role.to_sql('dim_role', con=engine, if_exists='replace', index=False)

print("Data loaded to MySQL successfully.")


Data loaded to MySQL successfully.
