In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [7]:
hrdata=pd.read_csv('https://raw.githubusercontent.com/tkseneee/Dataset/dd7313e0c6487acd9ed7cd32b786782c7a1d3885/HR_data.csv')

In [8]:
hrdata.head(2)

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education_Num,Martial_Status,Occupation,Relationship,Race,Gender,Capital_Gain,Capital_Loss,Hours_per_week,Country,Target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [9]:
# Check for missing values
missing_values = hrdata.isnull().sum()

print("Missing values in each column:")
print(missing_values)

# Check if there are any missing values in the dataset
if missing_values.any():
    print("\nThere are missing values in the dataset.")
else:
    print("\nThere are no missing values in the dataset.")

Missing values in each column:
Age                  0
Workclass         2079
fnlwgt               0
Education            0
Education_Num        0
Martial_Status       0
Occupation        2087
Relationship         0
Race                 0
Gender               0
Capital_Gain         0
Capital_Loss         0
Hours_per_week       0
Country            656
Target               0
dtype: int64

There are missing values in the dataset.


In [10]:
# Keep only numeric columns
numeric_data = hrdata.select_dtypes(include=[np.number])

print("Numeric columns in the dataset:")
print(numeric_data.head(2))

Numeric columns in the dataset:
   Age  fnlwgt  Education_Num  Capital_Gain  Capital_Loss  Hours_per_week
0   39   77516             13          2174             0              40
1   50   83311             13             0             0              13


In [11]:
# Compute the range for each numeric column
range_values = numeric_data.max() - numeric_data.min()

print("Range of each numeric column:")
print(range_values)

# Comment on the need for scaling
print("\nComment:")
if (range_values.max() - range_values.min()) > 10:  # Arbitrary threshold for significant difference
    print("The ranges of the numeric columns vary significantly. Scaling is necessary to ensure all features contribute equally to the model.")
else:
    print("The ranges of the numeric columns do not vary significantly. Scaling may not be strictly necessary.")

Range of each numeric column:
Age                    73
fnlwgt            1472420
Education_Num          15
Capital_Gain        99999
Capital_Loss         4356
Hours_per_week         98
dtype: int64

Comment:
The ranges of the numeric columns vary significantly. Scaling is necessary to ensure all features contribute equally to the model.


In [14]:
# Calculate mean and standard deviation of the 'Area' column
mean_area = hrdata['Age'].mean()
std_area = hrdata['Age'].std()

# Perform standard scaling
hrdata['Age'] = (hrdata['Age'] - mean_area) / std_area

# Display the first few rows of the dataset with the scaled 'Area' column
print("First few rows of the dataset with scaled 'Area' column:")
print(hrdata[['Age', 'Age']].head())

First few rows of the dataset with scaled 'Area' column:
        Age       Age
0  0.029050  0.029050
1  0.834352  0.834352
2 -0.044160 -0.044160
3  1.053980  1.053980
4 -0.776253 -0.776253


In [15]:
# Calculate mean and standard deviation of the 'Age' column
mean_age = hrdata['Age'].mean()
std_age = hrdata['Age'].std()

# Perform standard scaling
hrdata['Age_scaled'] = (hrdata['Age'] - mean_age) / std_age

# Display the first few rows of the dataset with the scaled 'Age' column
print("First few rows of the dataset with scaled 'Age' column:")
print(hrdata[['Age', 'Age_scaled']].head())

# Calculate min, max, mean, and standard deviation of the 'Age' column
min_age = hrdata['Age'].min()
max_age = hrdata['Age'].max()

print("\nStatistics of the 'Age' column:")
print("Minimum age:", min_age)
print("Maximum age:", max_age)
print("Mean age:", mean_age)
print("Standard deviation of age:", std_age)

First few rows of the dataset with scaled 'Age' column:
        Age  Age_scaled
0  0.029050    0.029050
1  0.834352    0.834352
2 -0.044160   -0.044160
3  1.053980    1.053980
4 -0.776253   -0.776253

Statistics of the 'Age' column:
Minimum age: -1.5815552128571506
Maximum age: 3.762724485511142
Mean age: 4.843681145570809e-17
Standard deviation of age: 1.0


In [16]:
# Check mean and standard deviation of the scaled 'Age' column
mean_scaled_age = hrdata['Age_scaled'].mean()
std_scaled_age = hrdata['Age_scaled'].std()

# Check if mean is approximately 0 and standard deviation is approximately 1
if np.isclose(mean_scaled_age, 0) and np.isclose(std_scaled_age, 1):
    print("Mean of scaled 'Age' column is approximately 0 and standard deviation is approximately 1.")
else:
    print("Mean and/or standard deviation of scaled 'Age' column is not approximately 0 and/or 1.")

# You can repeat the above steps for other scaled columns if necessary

Mean of scaled 'Age' column is approximately 0 and standard deviation is approximately 1.


In [18]:
# Invert the scaled data back to the original form and print in the specified format
for column in hrdata.columns:
    if column.endswith('_scaled'):
        original_column = column[:-7]  # Remove '_scaled' suffix
        std = hrdata[original_column].std()
        mean = hrdata[original_column].mean()
        hrdata[original_column + '_original'] = (hrdata[column] * std) + mean
        print(f"{original_column:<30} {hrdata[original_column + '_original'].mean():<5} {hrdata[original_column + '_original'].std():<5}")

Age                            4.843681145570809e-17 1.0  


In [19]:
# Invert the scaled 'Age' column back to the original form
hrdata['Age_original'] = (hrdata['Age_scaled'] * std_age) + mean_age

# Display the first few rows of the dataset with the inverted 'Age' column
print("\nFirst few rows of the dataset with inverted 'Age' column:")
print(hrdata[['Age_scaled', 'Age_original']].head())


First few rows of the dataset with inverted 'Age' column:
   Age_scaled  Age_original
0    0.029050      0.029050
1    0.834352      0.834352
2   -0.044160     -0.044160
3    1.053980      1.053980
4   -0.776253     -0.776253


In [20]:
# Copy the original numeric data to a new DataFrame
numeric_data_minmax = hrdata.select_dtypes(include=[np.number]).copy()

# Calculate the minimum and maximum values for each column
min_values = numeric_data_minmax.min()
max_values = numeric_data_minmax.max()

# Apply Min-max scaling to each column
numeric_data_minmax = (numeric_data_minmax - min_values) / (max_values - min_values)

# Print the minimum and maximum values for all the columns after Min-max scaling
print("Min values after Min-max scaling:")
print(min_values)
print("\nMax values after Min-max scaling:")
print(max_values)

Min values after Min-max scaling:
Age                  -1.581555
fnlwgt            12285.000000
Education_Num         1.000000
Capital_Gain          0.000000
Capital_Loss          0.000000
Hours_per_week        1.000000
Age_scaled           -1.581555
Age_original         -1.581555
dtype: float64

Max values after Min-max scaling:
Age               3.762724e+00
fnlwgt            1.484705e+06
Education_Num     1.600000e+01
Capital_Gain      9.999900e+04
Capital_Loss      4.356000e+03
Hours_per_week    9.900000e+01
Age_scaled        3.762724e+00
Age_original      3.762724e+00
dtype: float64


In [21]:
from sklearn.preprocessing import RobustScaler

# Initialize the RobustScaler
scaler = RobustScaler()

# Fit and transform the data
scaled_data = scaler.fit_transform(hrdata.select_dtypes(include=[np.number]))

# Convert the scaled data back to a DataFrame
scaled_df = pd.DataFrame(scaled_data, columns=hrdata.select_dtypes(include=[np.number]).columns)

# Print the first few rows of the scaled data
print("First few rows of the scaled data:")
print(scaled_df.head())

# Print the maximum and minimum values for all the columns after scaling
print("\nMin values after Robust Scaler:")
print(scaled_df.min())
print("\nMax values after Robust Scaler:")
print(scaled_df.max())

First few rows of the scaled data:
    Age    fnlwgt  Education_Num  Capital_Gain  Capital_Loss  Hours_per_week  \
0  0.10 -0.841482       1.000000        2174.0           0.0             0.0   
1  0.65 -0.793073       1.000000           0.0           0.0            -5.4   
2  0.05  0.312399      -0.333333           0.0           0.0             0.0   
3  0.80  0.471744      -1.000000           0.0           0.0             0.0   
4 -0.45  1.337911       1.000000           0.0           0.0             0.0   

   Age_scaled  Age_original  
0        0.10          0.10  
1        0.65          0.65  
2        0.05          0.05  
3        0.80          0.80  
4       -0.45         -0.45  

Min values after Robust Scaler:
Age              -1.000000
fnlwgt           -1.386395
Education_Num    -3.000000
Capital_Gain      0.000000
Capital_Loss      0.000000
Hours_per_week   -7.800000
Age_scaled       -1.000000
Age_original     -1.000000
dtype: float64

Max values after Robust Scaler:
Age    