<a href="https://colab.research.google.com/github/ShaliniSettipalli/21BDS0160_EDA_SHALINI_SETTIPALLI/blob/main/Untitled54.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Identification banner shown at the top of the output.
print("SHALINI SETTIPALLI_21BDS0160")

# Read the CSV that every later step of the analysis works on.
file_path = 'data.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# 1. Dataset dimensions, reported as the (rows, columns) tuple.
dataset_shape = data.shape
print(f"Dimensions of the dataset: {dataset_shape}")

# 2. Descriptive statistics for every column, regardless of dtype.
overall_summary = data.describe(include='all')
print("Summary of the dataset:")
print(overall_summary)

# 3. Data Handling: Identifying missing values
# Placeholders are normalised BEFORE the report is printed; otherwise '?'
# entries and unparsable numbers would not be counted as missing.

# Replace non-numeric values like '?' with NaN
data.replace('?', np.nan, inplace=True)

# Convert columns that should be numeric to numeric types
numeric_cols = ['previous_year_rating', 'no_of_trainings', 'length_of_service', 'age', 'avg_training_score']
for col in numeric_cols:
    # errors='coerce' turns any value that cannot be parsed as a number into NaN
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Per-column count of NaN cells after the normalisation above.
print("Missing values in each column:")
print(data.isnull().sum())

# 4. Handling Missing Data:
# Fill missing numerical values with the column mean.
# The result is assigned back rather than using fillna(..., inplace=True) on
# data[col]: that selection may be a temporary object, so the in-place form
# emits a FutureWarning on recent pandas and leaves `data` untouched when
# Copy-on-Write is active.
for col in numeric_cols:
    data[col] = data[col].fillna(data[col].mean())

# Fill missing categorical values with the most frequent value (mode)
cat_cols = ['department', 'region', 'education', 'gender', 'recruitment_channel', 'is_promoted']
for col in cat_cols:
    col_modes = data[col].mode()
    # mode() is empty when every value in the column is missing; there is
    # nothing to impute with in that case, so the column is left as-is.
    if not col_modes.empty:
        data[col] = data[col].fillna(col_modes.iloc[0])

# 5. Data Cleaning: Converting categorical columns to 'category' datatype
for col in cat_cols:
    data[col] = data[col].astype('category')

# 6. Univariate Analysis
# Numerical Features
print("Statistical summary of numerical features:")
print(data.describe())

# Plot distributions of numerical columns.
# include='number' selects every numeric dtype instead of a hard-coded
# ['float64', 'int64'] list.
data.select_dtypes(include='number').hist(bins=30, figsize=(12, 10))
plt.tight_layout()
plt.show()

# Categorical Features
print("Count of categorical features:")
for col in cat_cols:
    print(f"{col}:")
    print(data[col].value_counts())

# Plot countplots for categorical columns
for col in cat_cols:
    plt.figure(figsize=(8, 4))
    # Name the column via x= and pass the frame via data=. Current seaborn
    # takes `data` as its first positional parameter and `x` as keyword-only,
    # so a bare positional Series is not interpreted as the x variable.
    sns.countplot(x=col, data=data)
    plt.title(f"Distribution of {col}")
    plt.show()

# 7. Bivariate Analysis
# Annotated heatmap of the pairwise correlations between the numeric columns.
numeric_corr = data[numeric_cols].corr()
corr_fig, corr_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', linewidths=0.5, ax=corr_ax)
corr_ax.set_title('Correlation Matrix')
plt.show()

# Stacked bars: number of rows per department, split by is_promoted value.
dept_vs_promotion = pd.crosstab(data['department'], data['is_promoted'])
dept_ax = dept_vs_promotion.plot.bar(stacked=True, figsize=(8, 6))
dept_ax.set_title('Promotions by Department')
plt.show()

# Age distribution (box plot) for each is_promoted value.
age_fig, age_ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=data, x='is_promoted', y='age', ax=age_ax)
age_ax.set_title('Age vs Promotion Status')
plt.show()

# 8. Multivariate Analysis
# The same five numeric columns feed both the pair plot and the heatmap below.
selected_features = ['age', 'no_of_trainings', 'length_of_service', 'previous_year_rating', 'avg_training_score']
feature_frame = data[selected_features]

# Grid of pairwise plots across the selected columns.
sns.pairplot(feature_frame)
plt.show()

# Annotated correlation heatmap restricted to the selected columns.
corr_matrix = feature_frame.corr()
heat_fig, heat_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=heat_ax)
heat_ax.set_title('Heatmap of Selected Numerical Features')
plt.show()

# Stacked bars for every (department, gender) pair, split by is_promoted value.
dept_gender_vs_promotion = pd.crosstab([data['department'], data['gender']], data['is_promoted'])
combo_ax = dept_gender_vs_promotion.plot.bar(stacked=True, figsize=(10, 6))
combo_ax.set_title('Promotions by Department and Gender')
plt.show()
