In [21]:
import pandas as pd
import os

def preprocess_csv(file_path, output_path):
    # Load the data
    df = pd.read_csv(file_path)
    
    # Remove unnecessary columns (adjust based on your data)
    unwanted_columns = ['id', 'state_id', 'census_state_id', 'census_state_name', 
                        'census_district_id', 'census_district_name', 'district_id', 'grade']
    df = df.drop(columns=[col for col in unwanted_columns if col in df.columns], errors='ignore')
    
    # Drop rows with missing essential values (e.g., `grade`, `min_price`, etc.)
    df = df.dropna(subset=['min_price', 'max_price', 'modal_price', 'date'])
    
    # Convert `date` to a datetime object
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df = df.dropna(subset=['date'])  # Drop rows with invalid `date` values
    
    # Feature engineering (optional):
    # Add `price_range` column if both `min_price` and `max_price` are available
    if 'min_price' in df.columns and 'max_price' in df.columns:
        df['price_range'] = df['max_price'] - df['min_price']
    
    # Save the cleaned data to a new CSV
    os.makedirs(output_path, exist_ok=True)  # Create output directory if it doesn't exist
    cleaned_file_path = os.path.join(output_path, os.path.basename(file_path).replace(".csv", "_cleaned.csv"))
    df.to_csv(cleaned_file_path, index=False)
    print(f"Cleaned file saved at: {cleaned_file_path}")

# List of input CSV file paths (replace with actual file paths)
input_files = [
    "data (1).csv",  # Replace with the path to your 2020 data file
    "data (2).csv",  # Replace with the path to your 2021 data file
    "data (3).csv",  # Replace with the path to your 2022 data file
    "data (4).csv",  # Replace with the path to your 2023 data file
]

# Output directory for cleaned files
output_directory = "cleaned_data"

# Preprocess each file
for file_path in input_files:
    preprocess_csv(file_path, output_directory)


  df = pd.read_csv(file_path)


Cleaned file saved at: cleaned_data\data (1)_cleaned.csv


  df = pd.read_csv(file_path)


Cleaned file saved at: cleaned_data\data (2)_cleaned.csv


  df = pd.read_csv(file_path)


Cleaned file saved at: cleaned_data\data (3)_cleaned.csv


  df = pd.read_csv(file_path)


Cleaned file saved at: cleaned_data\data (4)_cleaned.csv
