## Employee's Performance -EDA

### Importing necessary libraries

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### --- 1. Data Loading

In [7]:
print("--- 1. Data Loading ---")
try:
    df = pd.read_csv(r"C:\Users\bagsu\Downloads\Test_data.csv")
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: 'Test_data.csv' not found. Please make sure the file is in the same directory as the script.")
    exit() # Exit if the file is not found


--- 1. Data Loading ---
Dataset loaded successfully.


### --- 2. Initial Data Inspection ---

In [11]:
print("\n--- 2. Initial Data Inspection ---")
print("\nFirst 5 rows of the dataset:")
print(df.head())

print("\nDataset Info:")
df.info()

print("\nSummary Statistics for Numerical Columns:")
print(df.describe())

# Identify numerical and categorical columns
numerical_cols = df.select_dtypes(include=np.number).columns
categorical_cols = df.select_dtypes(include='object').columns

print("\nUnique Values and Counts for Categorical Columns:")
for column in categorical_cols:
    print(f"\nColumn: {column}")
    print(df[column].value_counts())


--- 2. Initial Data Inspection ---

First 5 rows of the dataset:
   employee_id         department     region   education gender  \
0         8724         Technology  region_26  Bachelor's      m   
1        74430                 HR   region_4  Bachelor's      f   
2        72255  Sales & Marketing  region_13  Bachelor's      m   
3        38562        Procurement   region_2  Bachelor's      f   
4        64486            Finance  region_29  Bachelor's      m   

  recruitment_channel  no_of_trainings  age  previous_year_rating  \
0            sourcing                1   24                   NaN   
1               other                1   31                   3.0   
2               other                1   31                   1.0   
3               other                3   31                   2.0   
4            sourcing                1   30                   4.0   

   length_of_service  KPIs_met >80%  awards_won?  avg_training_score  
0                  1              1          

### --- 3. Missing Value Analysis and Handling ---

In [21]:
print("\n--- 3. Missing Value Analysis and Handling ---")
print("\nMissing Values before imputation:")
print(df.isnull().sum())

# Visualize missing values (if any)
plt.figure(figsize=(10, 6))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
plt.title('Missing Values Heatmap')
plt.savefig('missing_values_heatmap.png') # Save the plot
plt.close() # Close the plot to free memory
print("Missing Values Heatmap saved as 'missing_values_heatmap.png'")

# Impute missing values in 'previous_year_rating' with the mode
if 'previous_year_rating' in df.columns and df['previous_year_rating'].isnull().any():
    mode_rating = df['previous_year_rating'].mode()[0]
    # UPDATED LINE: Assign the result back to the column
    df['previous_year_rating'] = df['previous_year_rating'].fillna(mode_rating)
    print(f"\nMissing values in 'previous_year_rating' imputed with mode: {mode_rating}")

print("\nMissing Values after imputation:")
print(df.isnull().sum())


--- 3. Missing Value Analysis and Handling ---

Missing Values before imputation:
employee_id                0
department                 0
region                     0
education               1034
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating       0
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
dtype: int64
Missing Values Heatmap saved as 'missing_values_heatmap.png'

Missing Values after imputation:
employee_id                0
department                 0
region                     0
education               1034
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating       0
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
dtype: int64


### --- 4. Univariate Visualizations (Distributions) ---

In [25]:
print("\n--- 4. Univariate Visualizations (Distributions) ---")

# Histograms for Numerical Features
print("Generating Histograms for Numerical Features...")
df[numerical_cols].hist(bins=30, figsize=(15, 10))
plt.suptitle('Histograms of Numerical Features', y=1.02)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig('numerical_histograms.png')
plt.close()
print("Histograms saved as 'numerical_histograms.png'")


# Count Plots for Categorical Features
print("Generating Count Plots for Categorical Features...")
for i, col in enumerate(categorical_cols):
    plt.figure(figsize=(10, 6))
    # UPDATED LINE: Assign y to hue and set legend=False
    sns.countplot(y=col, data=df, order=df[col].value_counts().index, palette='viridis', hue=col, legend=False)
    plt.title(f'Count Plot for {col}')
    plt.xlabel('Count')
    plt.ylabel(col)
    plt.tight_layout()
    plt.savefig(f'count_plot_{col}.png')
    plt.close()
print(f"Count plots for all categorical features saved (e.g., 'count_plot_department.png').")



--- 4. Univariate Visualizations (Distributions) ---
Generating Histograms for Numerical Features...
Histograms saved as 'numerical_histograms.png'
Generating Count Plots for Categorical Features...
Count plots for all categorical features saved (e.g., 'count_plot_department.png').


### --- 5. Bivariate and Multivariate Analysis (Relationships) ---

In [28]:
print("\n--- 5. Bivariate and Multivariate Analysis (Relationships) ---")

# Correlation Matrix for Numerical Features
print("Generating Correlation Matrix...")
plt.figure(figsize=(12, 10))
sns.heatmap(df[numerical_cols].corr(), annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Correlation Matrix of Numerical Features')
plt.savefig('correlation_matrix.png')
plt.close()
print("Correlation Matrix saved as 'correlation_matrix.png'")

# Example: Relationship between a categorical and numerical feature (e.g., department and avg_training_score)
print("Generating Box Plot: Average Training Score by Department...")
plt.figure(figsize=(12, 7))
sns.boxplot(x='avg_training_score', y='department', data=df, palette='pastel')
plt.title('Average Training Score by Department')
plt.xlabel('Average Training Score')
plt.ylabel('Department')
plt.tight_layout()
plt.savefig('avg_training_score_by_department_boxplot.png')
plt.close()
print("Box Plot 'avg_training_score_by_department_boxplot.png' saved.")

# Example: Relationship between two categorical features (e.g., department and education)
print("Generating Count Plot: Department Distribution by Education Level...")
plt.figure(figsize=(15, 8))
sns.countplot(x='department', hue='education', data=df, palette='tab10')
plt.title('Department Distribution by Education Level')
plt.xlabel('Department')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Education')
plt.tight_layout()
plt.savefig('department_education_countplot.png')
plt.close()
print("Count Plot 'department_education_countplot.png' saved.")

# Example: Relationship with KPI_met >80% (assuming it's a key outcome)
if 'KPIs_met >80%' in df.columns:
    print("Generating Box Plot: Average Training Score by KPIs Met...")
    plt.figure(figsize=(8, 6))
    sns.boxplot(x='KPIs_met >80%', y='avg_training_score', data=df, palette='viridis')
    plt.title('Average Training Score by KPIs Met (>80%)')
    plt.xlabel('KPIs Met (>80%)')
    plt.ylabel('Average Training Score')
    plt.xticks(ticks=[0, 1], labels=['No', 'Yes'])
    plt.tight_layout()
    plt.savefig('avg_training_score_by_kpis_met_boxplot.png')
    plt.close()
    print("Box Plot 'avg_training_score_by_kpis_met_boxplot.png' saved.")

    print("Generating Count Plot: Department Distribution by KPIs Met...")
    plt.figure(figsize=(15, 8))
    sns.countplot(x='department', hue='KPIs_met >80%', data=df, palette='magma')
    plt.title('Department Distribution by KPIs Met (>80%)')
    plt.xlabel('Department')
    plt.ylabel('Count')
    plt.xticks(rotation=45, ha='right')
    plt.legend(title='KPIs Met (>80%)', labels=['No', 'Yes'])
    plt.tight_layout()
    plt.savefig('department_kpis_met_countplot.png')
    plt.close()
    print("Count Plot 'department_kpis_met_countplot.png' saved.")

print("\nEDA process completed. All plots are saved in the current directory.")


--- 5. Bivariate and Multivariate Analysis (Relationships) ---
Generating Correlation Matrix...
Correlation Matrix saved as 'correlation_matrix.png'
Generating Box Plot: Average Training Score by Department...



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='avg_training_score', y='department', data=df, palette='pastel')


Box Plot 'avg_training_score_by_department_boxplot.png' saved.
Generating Count Plot: Department Distribution by Education Level...
Count Plot 'department_education_countplot.png' saved.
Generating Box Plot: Average Training Score by KPIs Met...



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='KPIs_met >80%', y='avg_training_score', data=df, palette='viridis')


Box Plot 'avg_training_score_by_kpis_met_boxplot.png' saved.
Generating Count Plot: Department Distribution by KPIs Met...
Count Plot 'department_kpis_met_countplot.png' saved.

EDA process completed. All plots are saved in the current directory.
