<a href="https://colab.research.google.com/github/SuvarshaChennareddy/EDA-Theory-Assignment/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exploratory Data Analysis (EDA) on employee_promotion.csv

Data Loading and Initial Exploration \
Data Cleaning and Handling \
Univariate, Bivariate, and Multivariate Analysis \
Dimensionality Reduction \
Model Development and Evaluation

## Module 2: Data Transformation

Load and Basic Exploration of Dataset

In [45]:
import pandas as pd

# Load the dataset
data = pd.read_csv('employee_promotion.csv')

# Basic info and statistics
print(data.info())
print(data.describe())
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   employee_id           54808 non-null  int64  
 1   department            54808 non-null  object 
 2   region                54808 non-null  object 
 3   education             52399 non-null  object 
 4   gender                54808 non-null  object 
 5   recruitment_channel   54808 non-null  object 
 6   no_of_trainings       54808 non-null  int64  
 7   age                   54808 non-null  object 
 8   previous_year_rating  50684 non-null  float64
 9   length_of_service     54808 non-null  int64  
 10  awards_won            54807 non-null  float64
 11  avg_training_score    52247 non-null  float64
 12  is_promoted           54808 non-null  object 
dtypes: float64(3), int64(3), object(7)
memory usage: 5.4+ MB
None
        employee_id  no_of_trainings  previous_year_rating 

Identify Missing Values

In [46]:
# Check for missing values
print(data.isnull().sum())

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
awards_won                 1
avg_training_score      2561
is_promoted                0
dtype: int64


Fill Missing Values

In [47]:
columns_with_missing_values = data.columns[data.isnull().any()]
for column in columns_with_missing_values:
    print(f"Unique values in '{column}': {data[column].unique()}")

Unique values in 'education': ["Master's & above" "Bachelor's" nan '?' 'Below Secondary']
Unique values in 'previous_year_rating': [ 5.  3.  1.  4. nan  2.]
Unique values in 'awards_won': [ 0.  1. nan]
Unique values in 'avg_training_score': [49. 60. 50. 73. 85. 59. 63. 83. 54. 77. 80. 84. 51. 46. 75. 57. 70. 68.
 79. 44. 72. nan 48. 58. 87. 47. 52. 88. 71. 65. 62. 53. 78. 91. 82. 69.
 55. 74. 86. 90. 92. 67. 89. 56. 76. 81. 64. 39. 94. 93. 66. 95. 42. 96.
 40. 99. 43. 97. 41. 98.]


In [48]:
# Fill numeric columns with a KNN imputer
from sklearn.impute import KNNImputer

# Select only numeric columns
numeric_cols = data.select_dtypes(include=['number']).columns
numeric_data = data[numeric_cols]

# Initialize the KNN Imputer with a specified number of neighbors
imputer = KNNImputer(n_neighbors=5)  # You can adjust the number of neighbors

# Fit and transform the data
data[numeric_cols] = imputer.fit_transform(numeric_data)

# Check the result
print(data[numeric_cols].head())

   employee_id  no_of_trainings  previous_year_rating  length_of_service  \
0      65438.0              1.0                   5.0                8.0   
1      65141.0              1.0                   5.0                4.0   
2       7513.0              1.0                   3.0                7.0   
3       2542.0              2.0                   1.0               10.0   
4      48945.0              1.0                   3.0                2.0   

   awards_won  avg_training_score  
0         0.0                49.0  
1         0.0                60.0  
2         0.0                50.0  
3         0.0                50.0  
4         0.0                73.0  


In [49]:
# Fill categorical columns with '?'

data[['education']] = data[['education']].fillna('?')

print(data.head())

   employee_id         department     region         education gender  \
0      65438.0  Sales & Marketing   region_7  Master's & above      f   
1      65141.0         Operations  region_22        Bachelor's      m   
2       7513.0  Sales & Marketing  region_19        Bachelor's      m   
3       2542.0  Sales & Marketing  region_23        Bachelor's      m   
4      48945.0         Technology  region_26        Bachelor's      m   

  recruitment_channel  no_of_trainings age  previous_year_rating  \
0            sourcing              1.0  35                   5.0   
1               other              1.0  30                   5.0   
2            sourcing              1.0  34                   3.0   
3               other              2.0  39                   1.0   
4               other              1.0  45                   3.0   

   length_of_service  awards_won  avg_training_score is_promoted  
0                8.0         0.0                49.0           0  
1                4

In [50]:
print(data.isnull().sum())

employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
awards_won              0
avg_training_score      0
is_promoted             0
dtype: int64


Replace and Transform Values

In [51]:
# Define the ordinal scale for the education column
education_order = {
    "?": 0,
    "Below Secondary": 1,
    "Bachelor's": 2,
    "Master's & above": 3,
}

# Apply the transformation
data['education'] = data['education'].map(education_order)
print("Ordinal values for 'education' column: ", data['education'].unique())

Ordinal values for 'education' column:  [3 2 0 1]


Standardize Numeric Data

In [52]:
from sklearn.preprocessing import StandardScaler

# Create a copy of the original dataframe
data_scaled = data.copy()

# Select numeric columns
numeric_cols = data_scaled.select_dtypes(include=['float64', 'int64']).columns

# Initialize the scaler
scaler = StandardScaler()

# Scale the numeric columns in the copied dataframe
data_scaled[numeric_cols] = scaler.fit_transform(data_scaled[numeric_cols])

# Verify the changes
print(data_scaled[numeric_cols].head())

   employee_id  education  no_of_trainings  previous_year_rating  \
0     1.161858   1.260233        -0.415276              1.367780   
1     1.148709  -0.257541        -0.415276              1.367780   
2    -1.402741  -0.257541        -0.415276             -0.268956   
3    -1.622829  -0.257541         1.226063             -1.905692   
4     0.431639  -0.257541        -0.415276             -0.268956   

   length_of_service  awards_won  avg_training_score  
0           0.500460   -0.154018           -1.109970  
1          -0.437395   -0.154018           -0.280989  
2           0.265996   -0.154018           -1.034608  
3           0.969387   -0.154018           -1.034608  
4          -0.906322   -0.154018            0.698715  
