In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Target 1: Convert categorical data to numeric
# (pandas is already imported as `pd` in the notebook's import cell.)

# Step 1: Load the dataset using the raw URL
url = "https://raw.githubusercontent.com/RohitSingh218/MudraLoanDataset/main/mudraloandataset.csv"
df = pd.read_csv(url)

# Step 2: Identify the categorical (object-dtype) columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Step 3: Fill missing categorical values with the column mode BEFORE encoding.
# By default get_dummies() encodes NaN as all-zero indicators, and with
# drop_first=True that is exactly the encoding of the dropped first category,
# so untreated nulls would silently be counted as that category.
for column in categorical_columns:
    modes = df[column].mode()
    if not modes.empty:  # an entirely-null column has no mode to fill from
        df[column] = df[column].fillna(modes.iloc[0])

# Step 4: Convert categorical columns to numeric using one-hot encoding
# (drop_first=True drops one indicator per column to avoid redundant columns)
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Display the first rows of the updated DataFrame to confirm changes
df.head()



In [None]:
# Target 2: Find the nulls in the data and treat them
# Hint: Fill the null values with Mean, Median, or Mode

# Step 1: Check for null values in the dataset
print("Null values before treatment:")
print(df.isnull().sum())

# Step 2: Treat null values
# The filled column is assigned back to df. Calling
# df[column].fillna(..., inplace=True) operates on a temporary selection:
# it raises a chained-assignment warning on recent pandas and leaves df
# unchanged when Copy-on-Write is enabled.
for column in df.columns:
    if not df[column].isnull().any():
        continue  # nothing to treat in this column

    # Any numeric dtype (not only float64/int64) is filled with its mean;
    # booleans are excluded because a mean is not a valid boolean value.
    is_numeric = (
        pd.api.types.is_numeric_dtype(df[column])
        and not pd.api.types.is_bool_dtype(df[column])
    )
    if is_numeric:
        df[column] = df[column].fillna(df[column].mean())
    else:
        # Categorical / other columns are filled with the most frequent value.
        modes = df[column].mode()
        if not modes.empty:  # an entirely-null column has no mode
            df[column] = df[column].fillna(modes.iloc[0])


# Step 3: Check for null values again to confirm treatment
print("Null values after treatment:")
print(df.isnull().sum())


In [None]:
# Target 3: Find duplicate rows and remove them from the dataset

# Flag every row that repeats an earlier row (first occurrences are not flagged)
duplicates = df.duplicated()
print(f"Number of duplicate rows: {duplicates.sum()}")

# Keep only the unflagged rows and renumber the index from 0
df = df.loc[~duplicates].reset_index(drop=True)
print(f"Number of rows after removing duplicates: {len(df)}")


In [None]:
# Target 4: Plot all the numerical data using Box Plot

# ----------------------------- VERSION 1 (Subplots in a Single Figure) -----------------------------
# Draws every box plot on one figure, one subplot per numerical column.
# Best suited to a manageable number of numerical columns (e.g., fewer than 10).

# Select the float64 / int64 columns
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns

# Lay the plots out on a two-column grid with just enough rows for all columns
n_rows = (len(numerical_columns) + 1) // 2
fig = plt.figure(figsize=(15, 10))
for position, col in enumerate(numerical_columns, start=1):
    ax = fig.add_subplot(n_rows, 2, position)
    sns.boxplot(y=df[col], ax=ax)
    ax.set_title(f"Box Plot of {col}")
fig.tight_layout()
plt.show()



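# ----------------------------- VERSION 2 (Individual Box Plots) -----------------------------
# Alternative to Version 1, left commented out; uncomment it to use it instead.
# This version creates a separate figure for each numerical column's box plot,
# which keeps every plot clearly visible when there are many numerical columns.
# Note: it selects columns with include=['number'], a broader filter than the
# ['float64', 'int64'] list used in Version 1.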

# numerical_columns = df.select_dtypes(include=['number']).columns

# # Plot Box Plot for each numerical column
# for col in numerical_columns:
#     plt.figure(figsize=(8, 5))  # Create a new figure for each box plot
#     sns.boxplot(y=df[col])
#     plt.title(f"Box Plot of {col}")
#     plt.show()

