<a href="https://colab.research.google.com/github/Nithu-Arjunan/Python_projects/blob/master/Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import logging

# Configure the root logger once for the whole notebook: records at INFO
# level and above are written to a file, each one prefixed with its
# timestamp and severity.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='data_cleaning.log',
)



In [None]:
def data_cleaning(raw_data):
    """Drop incomplete rows and coerce every column of *raw_data* to numeric.

    Findings are reported through the root ``logging`` logger: the frame's
    shape and column names, the per-column and total missing-value counts,
    every column that still holds unparseable entries after coercion, and a
    final success/failure line.

    Args:
        raw_data: The ``pandas.DataFrame`` to clean. It is not modified.

    Returns:
        A new ``DataFrame`` containing only the rows of ``raw_data`` that had
        no missing values, with each non-numeric column passed through
        ``pd.to_numeric(..., errors='coerce')``. Entries that cannot be
        parsed as numbers come back as ``NaN``, so the result may still
        contain missing values; that case is logged as a warning.
    """
    # Describe the incoming data before touching it.
    logging.info(f"Data shape: {raw_data.shape}")
    logging.info(f"Data columns: {raw_data.columns.tolist()}")

    logging.info("Started data validation and cleaning")

    # Report missing values, per column and in total.
    missing_values = raw_data.isnull().sum()
    logging.info(f"Missing values per column : {missing_values}")
    total_missing = missing_values.sum()

    if total_missing > 0:
        logging.warning(f"Total missing values found: {total_missing}")
    else:
        logging.info("No missing values found.")

    # Rows with any missing value are always removed. pandas may track the
    # result of dropna() as a slice of raw_data, in which case the column
    # assignments below raise SettingWithCopyWarning; an explicit copy makes
    # the working frame independent of the caller's data.
    cleaned_data = raw_data.dropna().copy()

    # Coerce every column that is not already numeric.
    for col in cleaned_data.columns:
        if pd.api.types.is_numeric_dtype(cleaned_data[col]):
            continue
        cleaned_data[col] = pd.to_numeric(cleaned_data[col], errors='coerce')
        # Incomplete rows are already gone, so any NaN found here was
        # introduced by the coercion, i.e. the value was not a number.
        if cleaned_data[col].isnull().any():
            logging.warning(f"Column '{col}' contains non-numeric values.")

    # Final check: the run counts as successful only if no NaN remains.
    cleaned_missing = cleaned_data.isnull().sum().sum()
    if cleaned_missing == 0:
        logging.info("Data cleaning completed successfully.")
    else:
        logging.warning("Data cleaning did not complete successfully.")

    return cleaned_data

In [None]:
# Read the input CSV into a DataFrame.
# NOTE(review): "filename" looks like a placeholder path — confirm the real
# file location before running this cell.
raw_data = pd.read_csv(filepath_or_buffer="filename")

# Run the cleaning routine on the loaded frame and keep its result.
cleaned_data = data_cleaning(raw_data)
