### Importing Packaging Liberaries

In [1]:
import numpy as np
import pandas as pd

### Importing Data

In [11]:
df = pd.read_csv("HR_Analytics.csv")
df.head(10)

Unnamed: 0,EmpID,Age,AgeGroup,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,RM297,18,18-25,Yes,Travel_Rarely,230,Research & Development,3,3,Life Sciences,...,3,80,0,0,2,3,0,0,0,0.0
1,RM302,18,18-25,No,Travel_Rarely,812,Sales,10,3,Medical,...,1,80,0,0,2,3,0,0,0,0.0
2,RM458,18,18-25,Yes,Travel_Frequently,1306,Sales,5,3,Marketing,...,4,80,0,0,3,3,0,0,0,0.0
3,RM728,18,18-25,No,Non-Travel,287,Research & Development,5,2,Life Sciences,...,4,80,0,0,2,3,0,0,0,0.0
4,RM829,18,18-25,Yes,Non-Travel,247,Research & Development,8,1,Medical,...,4,80,0,0,0,3,0,0,0,0.0
5,RM973,18,18-25,No,Non-Travel,1124,Research & Development,1,3,Life Sciences,...,3,80,0,0,5,4,0,0,0,0.0
6,RM1154,18,18-25,Yes,Travel_Frequently,544,Sales,3,2,Medical,...,3,80,0,0,2,4,0,0,0,0.0
7,RM1312,18,18-25,No,Non-Travel,1431,Research & Development,14,3,Medical,...,3,80,0,0,4,1,0,0,0,0.0
8,RM128,19,18-25,Yes,Travel_Rarely,528,Sales,22,1,Marketing,...,4,80,0,0,2,2,0,0,0,0.0
9,RM150,19,18-25,No,Travel_Rarely,1181,Research & Development,3,1,Medical,...,4,80,0,1,3,3,1,0,0,0.0


### Data Cleaning 

In [None]:
# Count missing values in each column
missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)

# Rename Columns
df = df.rename(columns={'MonthlyIncome': 'Salary'})
df = df.rename(columns={'PerformanceRating': 'Performance_Rating'})

# Save Clean Data
df.to_csv("Cleaned_HR_Analytics_Data.csv",index=False)

### Sampling From Dataset

In [10]:
df = pd.read_csv("Cleaned_HR_Analytics_Data.csv")
# Simple Random Sampling: select 15% of all employees
sample_random = df.sample(frac=0.15, random_state=42)

print("Random sample shape:", sample_random.shape)
sample_random.head()

Random sample shape: (222, 38)


Unnamed: 0,EmpID,Age,AgeGroup,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
661,RM057,35,26-35,No,Travel_Frequently,853,Sales,18,5,Life Sciences,...,4,80,1,9,3,2,9,8,1,8.0
274,RM344,29,26-35,No,Travel_Rarely,144,Sales,10,1,Marketing,...,1,80,2,7,2,3,7,7,1,7.0
394,RM133,31,26-35,Yes,Travel_Rarely,542,Sales,20,3,Life Sciences,...,3,80,1,4,2,3,2,2,2,2.0
218,RM290,28,26-35,No,Travel_Rarely,1117,Research & Development,8,2,Life Sciences,...,4,80,0,5,3,3,5,3,0,2.0
922,RM527,39,36-45,No,Travel_Rarely,408,Research & Development,2,4,Technical Degree,...,1,80,0,20,4,3,20,7,11,10.0


In [11]:
# Stratified Sampling: keep department proportions
sample_stratified = (
    df.groupby("Department", group_keys=False)
      .apply(lambda x: x.sample(frac=0.15, random_state=42))
)

print("Stratified sample shape:", sample_stratified.shape)
sample_stratified['Department'].value_counts(normalize=True)   # check proportions


Stratified sample shape: (222, 38)


  .apply(lambda x: x.sample(frac=0.15, random_state=42))


Department
Research & Development    0.653153
Sales                     0.306306
Human Resources           0.040541
Name: proportion, dtype: float64

In [12]:
# Sort by EmployeeNumber (or by index) to keep a consistent order
df_sorted = df.sort_values(by="EmployeeNumber")   # use correct ID column name
sample_systematic = df_sorted.iloc[::7, :]

print("Systematic sample shape:", sample_systematic.shape)
sample_systematic.head()


Systematic sample shape: (212, 38)


Unnamed: 0,EmpID,Age,AgeGroup,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1013,RM001,41,36-45,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,...,1,80,0,8,0,1,6,4,0,5.0
329,RM008,30,26-35,No,Travel_Rarely,1358,Research & Development,24,1,Life Sciences,...,2,80,1,1,2,3,1,0,0,
212,RM015,28,26-35,Yes,Travel_Rarely,103,Research & Development,24,3,Life Sciences,...,2,80,0,6,4,3,4,2,0,3.0
735,RM022,36,36-45,Yes,Travel_Rarely,1218,Sales,9,4,Life Sciences,...,2,80,0,10,4,3,5,3,0,3.0
1131,RM029,44,36-45,No,Travel_Rarely,477,Research & Development,7,4,Medical,...,4,80,1,24,4,3,22,6,5,


In [13]:
# Full-population mean
mean_population = df['Salary'].mean()

# Means of each sample
mean_random      = sample_random['Salary'].mean()
mean_stratified  = sample_stratified['Salary'].mean()
mean_systematic  = sample_systematic['Salary'].mean()

print("Mean salary – full population :", round(mean_population,2))
print("Mean salary – random sample    :", round(mean_random,2))
print("Mean salary – stratified sample:", round(mean_stratified,2))
print("Mean salary – systematic sample:", round(mean_systematic,2))


Mean salary – full population : 6504.99
Mean salary – random sample    : 6603.0
Mean salary – stratified sample: 6671.13
Mean salary – systematic sample: 6692.85


### Calculate mean, median, and mode for Monthly_Income


Find the department with the highest median MonthlyIncome.

In [63]:
# Mean of Salary
Mean_Salary = np.mean(df['Salary'])
print("Mean of the Salary ",round(Mean_Salary,2))

# Median of Salary
Median_Salary = np.median(df['Salary'])
print("Median of the Salary ",round(Median_Salary,2))


#Mode of Salary
from scipy import stats

mode_Salary = stats.mode(df['Salary'],keepdims=True)
print("Mode:", mode_Salary.mode[:])
print("Frequency:", mode_Salary.count[0])



Mean of the Salary  6504.99
Median of the Salary  4933.0
Mode: [2342]
Frequency: 4


#### Calculate mean, median, and mode for Age.


In [62]:
# Calculation of Mean
Mean_Age = np.mean(df['Age'])
print("Mean Of the Age",round(Mean_Age,2))


# Calculation of Median
Median_Age = np.median(df['Age'])
print("Median Of the Age",(Median_Age))


# Calculation of Mood 
Mode_Age = stats.mode(df['Age'],keepdims=True)
print("Mode Of the Age",Mode_Age.mode[:])

Mean Of the Age 36.92
Median Of the Age 36.0
Mode Of the Age [34]


#### Calculate mean, median, and mode for PerformanceRating.

In [61]:
# Calculation of Mean
Mean_PerformanceRating = np.mean(df['Performance_Rating'])
print("Mean Of the ",round(Mean_PerformanceRating,2))


# Calculation of Median
Median_Performance = np.median(df['Performance_Rating'])
print(round(Median_Performance,2))


# Calculation of Mood 
Mode_PerformanceRating = stats.mode(df['Performance_Rating'],keepdims=True)
print(Mode_Age.mode[:])

Mean Of the  3.15
3.0
[3]


### Compare mean vs. median salaries – interpret if the distribution is skewed.


In [66]:
# Calculate mean and median
mean_salary = df['Salary'].mean()
median_salary = df['Salary'].median()

print("Mean salary:", round(mean_salary, 2))
print("Median salary:", round(median_salary, 2))

# Interpret skewness
if mean_salary > median_salary:
    print("The distribution is RIGHT-SKEWED (positively skewed) – a few high salaries raise the mean.")
elif mean_salary < median_salary:
    print("The distribution is LEFT-SKEWED (negatively skewed) – a few low salaries lower the mean.")
else:
    print("The distribution is approximately SYMMETRIC.")


Mean salary: 6504.99
Median salary: 4933.0
The distribution is RIGHT-SKEWED (positively skewed) – a few high salaries raise the mean.


### Find the department with the highest median MonthlyIncome.

In [71]:
# Median MonthlyIncome by department
dept_median_income = df.groupby('Department')['Salary'].median()

print("\nMedian MonthlyIncome by Department:")
print(dept_median_income)

# Department with the highest median MonthlyIncome
highest_median_dept = dept_median_income.idxmax()
highest_median_value = dept_median_income.max()

print(f"\nDepartment with highest median MonthlyIncome: {highest_median_dept} "
    f"with a median income of {highest_median_value}")



Median MonthlyIncome by Department:
Department
Human Resources           3886.0
Research & Development    4377.0
Sales                     5754.5
Name: Salary, dtype: float64

Department with highest median MonthlyIncome: Sales with a median income of 5754.5


### Calculate the range, variance, standard deviation, and IQR of MonthlyIncome.