In [2]:
# Import Required Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

import argparse
import os
import math
from typing import Tuple, List, Optional, Dict


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score
from sklearn.base import TransformerMixin, BaseEstimator
import joblib

In [8]:
# # loading of dataset
# source="https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/EasyVisa%20(1).csv"
# data_path = pd.read_csv(source)
# data_path.head()
# print(f"Data loaded successfully! Shape: {data_path.shape}")
# # # -------------------------------
# # def load_dataset(path_or_url):
# #     """Load dataset from a local path or URL."""
# #     df = pd.read_csv(path_or_url)
# #     print(f"Data loaded successfully! Shape: {df.shape}")
# #     return df

In [10]:
# Location of the EasyVisa CSV (raw GitHub URL); passed to perform_eda() below.
data_path = "https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/EasyVisa%20(1).csv"

In [None]:
#!/usr/bin/env python3
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings("ignore")


def _safe_name(col):
    """Return ``col`` as text with path separators replaced by ``_``.

    Column names are embedded in plot file names; a separator inside a name
    would otherwise point the save path at a non-existent sub-directory.
    """
    name = str(col)
    for sep in (os.sep, os.altsep):
        if sep:  # os.altsep is None on platforms with a single separator
            name = name.replace(sep, "_")
    return name


def _save_current_figure(output_dir, filename):
    """Apply tight layout to the current figure, save it, then close it."""
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, filename))
    # Close explicitly so figures do not accumulate while looping over columns.
    plt.close()


def perform_eda(data_path, output_dir="eda_results"):
    """
    Performs full EDA on dataset and saves all outputs.
    -------------------------------------------------------
    Parameters:
        data_path (str): Path or URL to the dataset (anything
            ``pd.read_csv`` accepts).
        output_dir (str): Folder to save results. Created if missing.
    -------------------------------------------------------
    Returns:
        df (pd.DataFrame): Dataset with duplicate rows removed and
            missing values filled (median for numeric columns, mode
            for all other columns).
    -------------------------------------------------------
    Side effects:
        Writes into ``output_dir``: ``data_summary.txt``,
        ``missing_values_heatmap.png``, one ``dist_<col>.png`` and
        ``boxplot_<col>.png`` per numeric column, one ``cat_<col>.png``
        per non-numeric column, ``correlation_heatmap.png`` (only when
        there is more than one numeric column) and
        ``cleaned_easyvisa.csv``. Progress messages are printed.
    -------------------------------------------------------
    """

    #================================
    # 1. Create Output Folder
    #================================
    os.makedirs(output_dir, exist_ok=True)
    print(f"Results will be saved to: {output_dir}")

    #================================
    # 2. Load Dataset
    #================================
    df = pd.read_csv(data_path)
    print(f" Data Loaded — Shape: {df.shape}\n")

    # Counted once: the frame is not modified between the summary report
    # and the duplicate-removal step, so both can share this value.
    dup_count = df.duplicated().sum()

    #================================
    # 3. Save Basic Info
    #================================
    with open(os.path.join(output_dir, "data_summary.txt"), "w", encoding="utf-8") as f:
        f.write("=== DATA SUMMARY REPORT ===\n\n")
        f.write(f"Dataset Shape: {df.shape}\n\n")
        f.write("DATA TYPES:\n")
        f.write(str(df.dtypes))
        f.write("\n\nMISSING VALUES:\n")
        f.write(str(df.isnull().sum()))
        f.write("\n\nDESCRIPTIVE STATS:\n")
        f.write(str(df.describe(include='all')))
        f.write("\n\nDUPLICATES:\n")
        f.write(str(dup_count))
    print("Data summary saved as 'data_summary.txt'")

    # ==================================
    # 4. Missing Values Heatmap
    # ==================================
    plt.figure(figsize=(10, 5))
    sns.heatmap(df.isnull(), cbar=False, cmap="coolwarm")
    plt.title("Missing Values Heatmap")
    _save_current_figure(output_dir, "missing_values_heatmap.png")

    # ==================================
    # 5. Remove Duplicates
    # ==================================
    if dup_count > 0:
        # Reassign instead of mutating in place; `df` is local to this function.
        df = df.drop_duplicates()
        print(f"Removed {dup_count} duplicate rows.")

    # ==================================
    # 6. Identify Numeric & Categorical Columns
    # ==================================
    num_cols = df.select_dtypes(include=np.number).columns.tolist()
    cat_cols = df.select_dtypes(exclude=np.number).columns.tolist()

    # ==================================
    # 7. Univariate Analysis — Save Plots
    # ==================================
    for col in num_cols:
        plt.figure(figsize=(6, 4))
        sns.histplot(df[col], kde=True, color="royalblue")
        plt.title(f"Distribution of {col}")
        _save_current_figure(output_dir, f"dist_{_safe_name(col)}.png")

    for col in cat_cols:
        plt.figure(figsize=(6, 4))
        # Only the ten most frequent values are drawn.
        df[col].value_counts().head(10).plot(kind="bar", color="teal")
        plt.title(f"Top Categories for {col}")
        _save_current_figure(output_dir, f"cat_{_safe_name(col)}.png")

    print(f"Distribution plots saved for {len(num_cols) + len(cat_cols)} columns.")

    # ==================================
    # 8. Correlation Heatmap
    # ==================================
    if len(num_cols) > 1:
        plt.figure(figsize=(10, 6))
        sns.heatmap(df[num_cols].corr(), annot=True, cmap="viridis", fmt=".2f")
        plt.title("Correlation Heatmap")
        _save_current_figure(output_dir, "correlation_heatmap.png")
        print("Correlation heatmap saved.")

    # ==================================
    # 9. Outlier Detection Plots
    # ==================================
    for col in num_cols:
        plt.figure(figsize=(6, 3))
        sns.boxplot(x=df[col], color="coral")
        plt.title(f"Boxplot for {col}")
        _save_current_figure(output_dir, f"boxplot_{_safe_name(col)}.png")
    print("Outlier boxplots saved.")

    # ==================================
    # 10. Handle Missing Values
    # ==================================
    # Assign the filled column back to the frame. The chained form
    # `df[col].fillna(..., inplace=True)` acts on a temporary object under
    # pandas Copy-on-Write and would leave `df` unchanged.
    for col in num_cols:
        df[col] = df[col].fillna(df[col].median())

    for col in cat_cols:
        modes = df[col].mode()
        # mode() is empty when the column holds no non-null value; in that
        # case there is nothing to fill with, so the column is left as-is.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

    print("Missing values filled: Median (numeric), Mode (categorical).")

    # ==================================
    # 11. Save Cleaned Dataset
    # ==================================
    cleaned_path = os.path.join(output_dir, "cleaned_easyvisa.csv")
    df.to_csv(cleaned_path, index=False)
    print(f"Cleaned dataset saved to '{cleaned_path}'")

    print("\nEDA + Cleaning Completed. All summary outputs of EDA saved successfully.")
    return df




In [19]:
# Run the full EDA + cleaning pass; artefacts are written to ./eda_summary.
df = perform_eda(data_path, output_dir="eda_summary")


Results will be saved to: eda_summary
 Data Loaded — Shape: (25480, 12)

Data summary saved as 'data_summary.txt'
Distribution plots saved for 12 columns.
Correlation heatmap saved.
Outlier boxplots saved.
Missing values filled: Median (numeric), Mode (categorical).
Cleaned dataset saved to 'eda_summary\cleaned_easyvisa.csv'

EDA + Cleaning Completed. All summary outputs of EDA saved successfully.
