In [2]:
import pandas as pd

In [3]:
import pandas as pd
import numpy as np
# ---------- Step 1: Load the Dataset ----------
df = pd.read_csv("house_prices.csv")

print("=== BEFORE CLEANING ===")
print(df.head())
print("\nMissing Values:\n", df.isnull().sum())
print("\nDuplicate Rows:", df.duplicated().sum())

=== BEFORE CLEANING ===
   LotArea  OverallQual  YearBuilt  TotalBsmtSF  GrLivArea  GarageCars  \
0     8450            7       2003          856       1710           2   
1     9600            6       1976         1262       1262           2   
2    11250            7       2001          920       1786           2   
3     9550            7       1915          756       1717           3   
4    14260            8       2000         1145       2198           3   

   SalePrice  
0     208500  
1     181500  
2     223500  
3     140000  
4     250000  

Missing Values:
 LotArea        0
OverallQual    0
YearBuilt      0
TotalBsmtSF    0
GrLivArea      0
GarageCars     0
SalePrice      0
dtype: int64

Duplicate Rows: 0


In [4]:
# ---------- Step 2: Remove Duplicates ----------
df = df.drop_duplicates()


In [5]:
# ---------- Step 3: Handle Missing Values ----------
# Fill numeric columns with their median
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    df[col].fillna(df[col].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


In [6]:

# Fill categorical columns with mode (most frequent value)
categorical_cols = df.select_dtypes(exclude=[np.number]).columns
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

In [7]:

# ---------- Step 4: Treat Outliers ----------
# Example: Remove or cap outliers in 'Price' or 'Price_USD' column (if exists)
if 'Price' in df.columns:
    col = 'Price'
elif 'Price_USD' in df.columns:
    col = 'Price_USD'
else:
    col = None

if col:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    upper_limit = Q3 + 1.5 * IQR
    lower_limit = Q1 - 1.5 * IQR

    # Cap outliers
    df[col] = np.where(df[col] > upper_limit, upper_limit,
                       np.where(df[col] < lower_limit, lower_limit, df[col]))

# ---------- Step 5: Save Cleaned File ----------
df.to_csv("house_prediction_cleaned.csv", index=False)

print("\n=== AFTER CLEANING ===")
print(df.head())
print("\nMissing Values After Cleaning:\n", df.isnull().sum())
print("\nDuplicate Rows After Cleaning:", df.duplicated().sum())
print("\nCleaned dataset saved as: house_prediction_cleaned.csv")


=== AFTER CLEANING ===
   LotArea  OverallQual  YearBuilt  TotalBsmtSF  GrLivArea  GarageCars  \
0     8450            7       2003          856       1710           2   
1     9600            6       1976         1262       1262           2   
2    11250            7       2001          920       1786           2   
3     9550            7       1915          756       1717           3   
4    14260            8       2000         1145       2198           3   

   SalePrice  
0     208500  
1     181500  
2     223500  
3     140000  
4     250000  

Missing Values After Cleaning:
 LotArea        0
OverallQual    0
YearBuilt      0
TotalBsmtSF    0
GrLivArea      0
GarageCars     0
SalePrice      0
dtype: int64

Duplicate Rows After Cleaning: 0

Cleaned dataset saved as: house_prediction_cleaned.csv


In [8]:
summary = {
    "Before Duplicates": [df.duplicated().sum()],
    "After Duplicates": [0],
    "Before Missing": [df.isnull().sum().sum()],
    "After Missing": [0]
}
summary_df = pd.DataFrame(summary)
print(summary_df)


   Before Duplicates  After Duplicates  Before Missing  After Missing
0                  0                 0               0              0
