<a href="https://colab.research.google.com/github/ShivaniSarkate/Sales-Insights-Demand-Forecasting-for-a-Retail-Chain/blob/main/Cleaned_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Cleaning rules for the store master data.
REQUIRED_COLUMNS = ("store", "type", "size")
VALID_STORE_TYPES = ("A", "B", "C")
MIN_STORE_SIZE = 30_000   # lower bound for a plausible store size (sq ft)
MAX_STORE_SIZE = 250_000  # upper bound for a plausible store size (sq ft)


def clean_stores(
    stores,
    valid_types=VALID_STORE_TYPES,
    min_size=MIN_STORE_SIZE,
    max_size=MAX_STORE_SIZE,
):
    """Return a cleaned copy of the raw stores table.

    The input frame is not modified. Steps, in order:

    * column names are stripped and lower-cased;
    * exact duplicate rows are removed, then duplicate store IDs
      (the first occurrence wins);
    * store types are stripped, upper-cased, and rows whose type is not
      in ``valid_types`` are dropped;
    * ``size`` is coerced to a number (unparseable entries become
      missing) and missing sizes are filled with the median size of the
      same store type;
    * rows whose size lies outside ``[min_size, max_size]`` (inclusive)
      are dropped. Sizes that are still missing (a type with no known
      size at all) are dropped here as well.

    Args:
        stores: DataFrame with ``store``, ``type`` and ``size`` columns
            (any letter case, surrounding whitespace allowed).
        valid_types: Store type codes to keep.
        min_size: Smallest accepted store size.
        max_size: Largest accepted store size.

    Returns:
        A new DataFrame containing only the cleaned rows.

    Raises:
        KeyError: If a required column is missing after the column
            names have been standardized.
    """
    # 2. Standardize column names (work on a copy so the caller's frame
    #    is left untouched).
    cleaned = stores.copy()
    cleaned.columns = cleaned.columns.str.strip().str.lower()

    absent = [col for col in REQUIRED_COLUMNS if col not in cleaned.columns]
    if absent:
        raise KeyError(f"stores data is missing required column(s): {absent}")

    # 3. Remove duplicate rows
    cleaned = cleaned.drop_duplicates()

    # 4. Ensure Store IDs are unique; keep the first occurrence
    cleaned = cleaned.drop_duplicates(subset="store", keep="first")

    # 5. Standardize store type values. Strip as well as upper-case so
    #    that values such as " a" are recognised instead of discarded.
    cleaned = cleaned.assign(type=cleaned["type"].str.strip().str.upper())
    # Explicit copy: the assignments below then never write into a
    # filtered view (avoids SettingWithCopyWarning).
    cleaned = cleaned[cleaned["type"].isin(valid_types)].copy()

    # 6. Handle missing values in 'size'. Non-numeric entries are turned
    #    into NaN first so they are imputed instead of raising later.
    cleaned["size"] = pd.to_numeric(cleaned["size"], errors="coerce")
    if cleaned["size"].isna().any():
        median_by_type = cleaned.groupby("type")["size"].transform("median")
        cleaned["size"] = cleaned["size"].fillna(median_by_type)

    # 7. Handle outliers in 'size' (bounds are inclusive; NaN never
    #    satisfies the check, so unfillable sizes are dropped too).
    return cleaned[cleaned["size"].between(min_size, max_size)]


# Runs when executed as a script or notebook cell (where __name__ is
# "__main__"); importing this file only defines clean_stores().
if __name__ == "__main__":
    # 1. Load the dataset and clean it
    df = clean_stores(pd.read_csv("stores.csv"))

    # 8. Save cleaned dataset
    df.to_csv("Cleaned_Stores.csv", index=False)

    print("Cleaned dataset saved as Cleaned_Stores.csv")
    print(df.head())