#### Data Cleaning and EDA

This code defines a `DataProcessor` class designed to load, clean, analyze, and visualize CSV data. The class accepts directory paths for the CSV files, the visualizations, and the cleaned data. It creates the visualization and cleaned-data directories if they do not exist and configures seaborn with the `whitegrid` style and the `pastel` palette.

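The `load_data()` method loads every CSV file in the specified directory into a dictionary keyed by file name (without the extension). The `_wrangle()` method processes each file: it reads the file with the header at row index 20 (`header=20`), prints a warning if the first 30 rows contain missing values (a sign that the header may be misplaced), and requires the critical "YEAR" and "DOY" columns to be present and free of missing values. It then combines them into a new "date" column for time-based analysis, sets "date" as the index, drops "YEAR" and "DOY", and renames "PRECTOTCORR" to "PRECIPITATION" when that column exists.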

The `perform_eda()` method generates and saves various plots, including histograms, time series, correlation heatmaps (skipped for datasets with insufficient numeric columns), and boxplots. It prints summary statistics, missing values, and duplicates information.

The `remove_multicollinear_columns()` method removes highly correlated columns based on a defined threshold and saves cleaned data.

Finally, the `view_visuals()` method lists all generated visualizations. This structure provides an efficient way to preprocess, analyze, and visualize time series data.


In [None]:
import pandas as pd
import glob
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


class DataProcessor:
    """Load, clean, explore and de-correlate a directory of CSV files.

    Each CSV is expected to carry its column header on row index 20 and to
    contain ``YEAR`` and ``DOY`` (day-of-year) columns, which are combined
    into a ``date`` index. Successfully processed frames are kept in
    ``self.dfs``, keyed by file name without the extension.

    Args:
        csv_dir: Directory that is searched (non-recursively) for ``*.csv``.
        visuals_dir: Directory where EDA plots are written as PNG files.
        cleaned_dir: Directory where cleaned CSV files are written.
    """

    def __init__(self, csv_dir, visuals_dir="visuals", cleaned_dir="cleaned_data"):
        self.csv_dir = csv_dir
        self.visuals_dir = visuals_dir
        self.cleaned_dir = cleaned_dir
        self.dfs = {}

        # Ensure output directories exist before anything tries to write to them.
        os.makedirs(self.visuals_dir, exist_ok=True)
        os.makedirs(self.cleaned_dir, exist_ok=True)

        # Set seaborn style
        sns.set_theme(style="whitegrid", palette="pastel")

    def load_data(self):
        """Load every CSV file in ``csv_dir`` into ``self.dfs``.

        Files that ``_wrangle`` could not process are skipped instead of being
        stored as ``None``, so later steps only ever see real DataFrames.
        """
        # Sort so the processing order does not depend on the file system.
        csv_files = sorted(glob.glob(os.path.join(self.csv_dir, "*.csv")))
        if not csv_files:
            print(f"⚠ No CSV files found in {self.csv_dir}.")
            return

        for file in csv_files:
            filename = os.path.splitext(os.path.basename(file))[0]
            try:
                df = self._wrangle(file)
            except Exception as e:
                print(f"❌ Error loading {file}: {e}")
                continue

            if df is None:
                # _wrangle has already reported why this file failed.
                continue

            self.dfs[filename] = df
            print(f"✅ Successfully loaded: {filename}")

    def _wrangle(self, path):
        """Read and clean a single CSV file.

        Args:
            path: Path of the CSV file to read.

        Returns:
            The cleaned DataFrame indexed by ``date``, or ``None`` when the
            file could not be processed (the error is printed).
        """
        try:
            # The column header is expected on row index 20 of the file.
            df = pd.read_csv(path, header=20)
            # Missing values near the top suggest the header row was misplaced.
            if df.head(30).isnull().any().any():
                print(f"⚠ Potential issue with header in {path}, check data manually.")

            # Ensure essential columns exist
            required_columns = {"YEAR", "DOY"}
            if not required_columns.issubset(df.columns):
                raise ValueError(f"Missing required columns in {path}")

            # Check for missing values in YEAR or DOY
            if df["YEAR"].isnull().any() or df["DOY"].isnull().any():
                raise ValueError(f"Missing YEAR or DOY values in {path}")

            # Cast to int first so float-typed columns ("2020.0", "1.0") still
            # parse, and zero-pad the day of year to the three digits of %j.
            year = df["YEAR"].astype(int).astype(str)
            day_of_year = df["DOY"].astype(int).astype(str).str.zfill(3)
            df["date"] = pd.to_datetime(year + "-" + day_of_year, format="%Y-%j")

            # Index by date; YEAR and DOY are now redundant.
            df = df.set_index("date").drop(columns=["YEAR", "DOY"])

            # rename() ignores missing keys, so no existence check is needed.
            df = df.rename(columns={"PRECTOTCORR": "PRECIPITATION"})

            print(f"📌 Successfully processed {path}: {df.shape}")
            return df
        except Exception as e:
            print(f"❌ Error processing {path}: {e}")
            return None

    def perform_eda(self):
        """Print summary information and save EDA plots for every loaded frame.

        For each DataFrame this prints summary statistics, missing-value
        counts and the number of duplicate rows, then writes a histogram,
        a time-series plot, a correlation heatmap and a boxplot to
        ``visuals_dir``. A failure in one file is reported and does not stop
        the remaining files.
        """
        for filename, df in self.dfs.items():
            try:
                print(f"\n🔍 Running EDA for {filename}...\n")
                print(f"📊 Summary Statistics for {filename}:\n", df.describe(), "\n")
                print(f"🔹 Missing Values in {filename}:\n", df.isnull().sum(), "\n")
                print(f"🔹 Number of duplicate rows in {filename}: {df.duplicated().sum()}\n")

                # All plots below are only meaningful for numeric columns.
                numeric = df.select_dtypes(include="number")
                key_features = list(numeric.columns[:3])  # Limit to 3 plots

                self._plot_histograms(filename, numeric, key_features)
                self._plot_timeseries(filename, numeric, key_features)
                self._plot_heatmap(filename, numeric)
                self._plot_boxplots(filename, numeric)

                print(f"✅ EDA completed successfully for {filename}")
            except Exception as e:
                print(f"❌ Error during EDA for {filename}: {e}")

    def _plot_histograms(self, filename, numeric, features):
        """Save side-by-side histograms (with KDE) of the given features."""
        if not features:
            return
        # squeeze=False always yields a 2-D array of axes, so a single
        # feature is handled exactly like several.
        fig, axes = plt.subplots(1, len(features), figsize=(14, 6), squeeze=False)
        try:
            fig.suptitle(f"Feature Distributions - {filename}", fontsize=16, fontweight="bold")
            for ax, feature in zip(axes[0], features):
                sns.histplot(numeric[feature], bins=30, kde=True, ax=ax)
                ax.set_title(f"Distribution of {feature}")
            # Leave room at the top for the figure-level title.
            fig.tight_layout(rect=[0, 0, 1, 0.95])
            fig.savefig(os.path.join(self.visuals_dir, f"{filename}_histogram.png"))
        finally:
            # Close even on failure so figures do not accumulate in memory.
            plt.close(fig)

    def _plot_timeseries(self, filename, numeric, features):
        """Save a line plot of the given features against the frame's index."""
        if not features:
            return
        fig, ax = plt.subplots(figsize=(14, 6))
        try:
            for feature in features:
                sns.lineplot(
                    data=numeric, x=numeric.index, y=feature,
                    label=feature, linewidth=2, ax=ax,
                )
            ax.set_title(f"Time Series Trends - {filename}", fontsize=14, fontweight="bold")
            ax.set_xlabel("Date")
            ax.set_ylabel("Values")
            ax.legend()
            fig.savefig(os.path.join(self.visuals_dir, f"{filename}_timeseries.png"))
        finally:
            plt.close(fig)

    def _plot_heatmap(self, filename, numeric):
        """Save a correlation heatmap, skipping frames with < 2 numeric columns."""
        if numeric.shape[1] < 2:
            print(f"⚠ Skipping correlation heatmap for {filename} (not enough numerical columns).")
            return
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            sns.heatmap(
                numeric.corr(), annot=True, cmap="coolwarm",
                linewidths=0.5, fmt=".2f", ax=ax,
            )
            ax.set_title(f"Correlation Heatmap - {filename}", fontsize=14, fontweight="bold")
            fig.savefig(os.path.join(self.visuals_dir, f"{filename}_heatmap.png"))
        finally:
            plt.close(fig)

    def _plot_boxplots(self, filename, numeric):
        """Save one boxplot per numeric column in a single figure."""
        if numeric.shape[1] == 0:
            return
        fig, ax = plt.subplots(figsize=(14, 6))
        try:
            numeric.boxplot(rot=45, ax=ax)
            ax.set_title(f"Boxplots of Features - {filename}", fontsize=14, fontweight="bold")
            fig.savefig(os.path.join(self.visuals_dir, f"{filename}_boxplot.png"))
        finally:
            plt.close(fig)

    def remove_multicollinear_columns(self, threshold=1):
        """Drop highly correlated columns and save each cleaned frame as CSV.

        For every pair of numeric columns whose absolute correlation is
        strictly greater than ``threshold``, the later column of the pair is
        dropped. Non-numeric columns are never dropped. The result is written
        to ``<cleaned_dir>/<name>_cleaned.csv``; ``self.dfs`` is not modified.

        Args:
            threshold: Absolute-correlation cut-off. An absolute correlation
                cannot exceed 1 in exact arithmetic, so the default of 1
                effectively drops nothing; pass a lower value (for example
                0.9) to actually prune columns.
        """
        for df_name, df in self.dfs.items():
            try:
                # numeric_only avoids errors on non-numeric columns.
                corr_matrix = df.corr(numeric_only=True).abs()
                # Keep only the strict upper triangle so each pair is checked once.
                upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
                upper_triangle = corr_matrix.where(upper_mask)
                to_drop = [
                    column
                    for column in upper_triangle.columns
                    if (upper_triangle[column] > threshold).any()
                ]

                df_cleaned = df.drop(columns=to_drop)
                df_cleaned.to_csv(os.path.join(self.cleaned_dir, f"{df_name}_cleaned.csv"), index=True)
                print(f"✔ Successfully saved cleaned DataFrame at: {self.cleaned_dir}/{df_name}_cleaned.csv")
            except Exception as e:
                print(f"❌ Error cleaning {df_name}: {e}")

    def view_visuals(self):
        """Print the names of all PNG files found in ``visuals_dir``."""
        try:
            visuals = sorted(
                file for file in os.listdir(self.visuals_dir) if file.endswith(".png")
            )
        except OSError as e:
            print(f"❌ Error listing visuals: {e}")
            return

        if not visuals:
            print(f"⚠ No visualizations found in {self.visuals_dir}.")
            return

        print("\n📸 Available Visualizations:")
        for visual in visuals:
            print(f"- {visual}")


if __name__ == "__main__":
    # Add the path to the directory containing the CSV files
    csv_dir = ...
    if csv_dir is ...:
        # Fail early with a clear message instead of an obscure TypeError
        # from os.path.join once the pipeline is already under way.
        raise ValueError("Set csv_dir to the directory containing the CSV files.")

    # Run the full pipeline: load -> explore -> de-correlate -> list plots.
    processor = DataProcessor(csv_dir)
    processor.load_data()
    processor.perform_eda()
    processor.remove_multicollinear_columns()
    processor.view_visuals()


📌 Successfully processed C:\Users\Adetona Precious I\Documents\smartfarm\EDA\20. Rano, S Kano.csv: (4018, 12)
✅ Successfully loaded: 20. Rano, S Kano
📌 Successfully processed C:\Users\Adetona Precious I\Documents\smartfarm\EDA\21. Tudun-Wada, S Kano.csv: (4018, 12)
✅ Successfully loaded: 21. Tudun-Wada, S Kano
📌 Successfully processed C:\Users\Adetona Precious I\Documents\smartfarm\EDA\22. Doguwa, S Kano.csv: (4018, 12)
✅ Successfully loaded: 22. Doguwa, S Kano
📌 Successfully processed C:\Users\Adetona Precious I\Documents\smartfarm\EDA\23. Madobi, S Kano.csv: (4018, 12)
✅ Successfully loaded: 23. Madobi, S Kano
📌 Successfully processed C:\Users\Adetona Precious I\Documents\smartfarm\EDA\24. Kura, S Kano.csv: (4018, 12)
✅ Successfully loaded: 24. Kura, S Kano

🔍 Running EDA for 20. Rano, S Kano...

📊 Summary Statistics for 20. Rano, S Kano:
                T2M      T2M_MAX      T2M_MIN         RH2M  PRECIPITATION  \
count  4018.000000  4018.000000  4018.000000  4018.000000    4018.0000