<a href="https://colab.research.google.com/github/Sedhupxthi/EDA-21BDS0072/blob/main/EDA_21BDS0072.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis

# Configure plotting styles: apply seaborn's "whitegrid" style and make
# (10, 6) the default matplotlib figure size for every plot that follows.
# NOTE(review): the figsize assignment comes after sns.set(), presumably so
# the seaborn theme cannot overwrite it -- keep this order unless verified.
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)


In [None]:
# Load the dataset from a CSV file in the working directory.
file_path = 'BEPS.csv'
try:
    data = pd.read_csv(file_path)
except FileNotFoundError:
    print("Error: File not found. Please check the file path.")
    # Stop here: nothing below can run without `data`. SystemExit is raised
    # directly instead of calling exit(), which is a helper the `site` module
    # installs for interactive shells and which, called without arguments,
    # reports a successful (0) exit status even though loading failed.
    raise SystemExit(1)
else:
    # Only reached when read_csv succeeded.
    print("Dataset loaded successfully.")

# Display the first few rows
print("\nFirst 5 rows of the dataset:")
print(data.head())


In [None]:
# Dimensions of the dataset
print("\nDataset Dimensions:")
n_rows, n_cols = data.shape
print(f"Rows: {n_rows}, Columns: {n_cols}")

# Column data types and basic info.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("\nDataset Information:")
data.info()

# Summary statistics for every column (numeric and non-numeric alike)
print("\nSummary Statistics:")
print(data.describe(include='all'))


In [None]:
# Build the boolean null mask once, then report per-column missing counts
null_mask = data.isnull()
missing_data = null_mask.sum()
print("\nMissing Values:")
print(missing_data)

# Draw the same mask as a heatmap (rows = records, columns = features)
sns.heatmap(null_mask, cbar=False, cmap='viridis')
plt.title("Missing Values Heatmap")
plt.show()


In [None]:
# Count rows that are exact repeats of an earlier row
duplicate_flags = data.duplicated()
duplicates = duplicate_flags.sum()
print(f"\nDuplicate Rows: {duplicates}")

# Drop the repeats only when at least one was found
has_duplicates = duplicates > 0
if has_duplicates:
    data = data.drop_duplicates()
    print(f"Duplicates removed. New dimensions: {data.shape}")


In [None]:
# Fill missing values: the column mean for numerical columns and the most
# frequent value (mode) for text columns.
#
# The replacement values are collected first and applied with one
# DataFrame.fillna call whose result is assigned back to `data`. The previous
# form, data[col].fillna(..., inplace=True), modifies a temporary column
# object; pandas 2.x warns about that chained in-place pattern and under
# Copy-on-Write it no longer updates the frame at all.
fill_values = {}

for col in data.select_dtypes(include=[np.number]).columns:
    if data[col].isnull().any():
        fill_values[col] = data[col].mean()

for col in data.select_dtypes(include=[object]).columns:
    if data[col].isnull().any():
        modes = data[col].mode()
        # mode() is empty when every value in the column is missing; such a
        # column is left untouched instead of failing on the first-mode lookup.
        if not modes.empty:
            fill_values[col] = modes.iloc[0]

if fill_values:
    # A dict passed to fillna maps each column name to its fill value.
    data = data.fillna(value=fill_values)

# Verify missing values after handling
print("\nMissing Values After Handling:")
print(data.isnull().sum())


In [None]:
# Standardize column names: lowercase, with spaces and dots replaced by
# underscores.
# regex=False makes both replacements literal on every pandas version. The
# default of `regex` differs between pandas 1.x and 2.x, and '.' is a regex
# wildcard, so the literal intent is stated explicitly rather than inherited
# from whichever default is installed.
data.columns = (
    data.columns.str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('.', '_', regex=False)
)

print("\nRenamed Columns:")
print(data.columns)


In [None]:
# Count, for every numeric column, the values flagged by the 1.5 * IQR rule
numeric_part = data.select_dtypes(include=[np.number])
for col in numeric_part:
    values = data[col]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    # Allowed distance beyond either quartile before a value is an outlier
    whisker = 1.5 * (third_quartile - first_quartile)

    too_low = values < first_quartile - whisker
    too_high = values > third_quartile + whisker
    outliers = (too_low | too_high).sum()
    print(f"Outliers in {col}: {outliers}")


In [None]:
# Numerical features: print skewness/kurtosis and draw a histogram with a
# KDE overlay for each numeric column
print("\nUnivariate Analysis - Numerical Features")
numeric_names = data.select_dtypes(include=[np.number]).columns
for col in numeric_names:
    series = data[col]
    print(f"\nStatistics for {col}:")
    print(f"Skewness: {skew(series)}, Kurtosis: {kurtosis(series)}")

    plt.figure()
    sns.histplot(series, kde=True, bins=30, color='blue')
    plt.title(f"Distribution of {col}")
    plt.show()

# Categorical features: print value counts and draw a horizontal count plot
# with the bars ordered by frequency
print("\nUnivariate Analysis - Categorical Features")
categorical_names = data.select_dtypes(include=[object]).columns
for col in categorical_names:
    counts = data[col].value_counts()
    print(f"\nValue Counts for {col}:\n{counts}")

    plt.figure()
    sns.countplot(y=data[col], order=counts.index, palette='viridis')
    plt.title(f"Frequency of {col}")
    plt.show()


In [None]:
# Numerical vs Numerical: correlation heatmap.
# The correlation is computed on the numeric columns only. Since pandas 2.0
# DataFrame.corr() no longer drops non-numeric columns by default, so calling
# it on a frame that still holds text columns raises instead of skipping them.
print("\nBivariate Analysis - Correlation Matrix")
numeric_data = data.select_dtypes(include=[np.number])
corr_matrix = numeric_data.corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()

# Numerical vs Categorical: one box plot per (text column, numeric column)
# pair. The column lists are computed once, outside the nested loops.
categorical_cols = data.select_dtypes(include=[object]).columns
for col in categorical_cols:
    for num_col in numeric_data.columns:
        plt.figure()
        sns.boxplot(x=data[col], y=data[num_col], palette='Set2')
        plt.title(f"{num_col} by {col}")
        plt.xticks(rotation=45)
        plt.show()


In [None]:
# Pairwise plot grid over the numeric columns, with KDE curves on the diagonal
numeric_frame = data.select_dtypes(include=[np.number])
sns.pairplot(numeric_frame, diag_kind='kde', corner=True)
plt.suptitle("Pairplot of Numerical Features", size=16)
plt.show()

# Same grid on the full frame, colored by a categorical column when it exists
hue_column = 'gender'  # Replace with any categorical column of interest
if hue_column in data.columns:
    sns.pairplot(data, hue=hue_column, diag_kind='kde', corner=True)
    plt.suptitle("Pairplot with Hue: Gender", size=16)
    plt.show()


In [None]:
# Write the cleaned frame to CSV, leaving the index out of the file
cleaned_file_path = "BEPS_cleaned.csv"
data.to_csv(path_or_buf=cleaned_file_path, index=False)
print(f"\nCleaned dataset saved to {cleaned_file_path}.")
