In [2]:
# Import Libraries
import pandas as pd
import numpy as np
from google.colab import files

# Upload and Load Data
uploaded = files.upload()
file_name = list(uploaded.keys())[0]
df = pd.read_csv(file_name)

Saving Housing dataset 1.1.csv to Housing dataset 1.1.csv
Saving Housing dataset 1.2.csv to Housing dataset 1.2.csv


In [3]:
# Shape and first 5 rows
print(f"Shape: {df.shape}")
print(df.head())

# Data types
print("\n Data Types ")
print(df.dtypes)

# Missing values
print("\n Missing Values ")
print(df.isnull().sum())

# Descriptive statistics
print("\n Descriptive Statistics ")
print(df.describe(include='all'))

Shape: (1459, 80)
     Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0  1461          20       RH         80.0    11622   Pave   NaN      Reg   
1  1462          20       RL         81.0    14267   Pave   NaN      IR1   
2  1463          60       RL         74.0    13830   Pave   NaN      IR1   
3  1464          60       RL         78.0     9978   Pave   NaN      IR1   
4  1465         120       RL         43.0     5005   Pave   NaN      IR1   

  LandContour Utilities  ... ScreenPorch PoolArea PoolQC  Fence MiscFeature  \
0         Lvl    AllPub  ...         120        0    NaN  MnPrv         NaN   
1         Lvl    AllPub  ...           0        0    NaN    NaN        Gar2   
2         Lvl    AllPub  ...           0        0    NaN  MnPrv         NaN   
3         Lvl    AllPub  ...           0        0    NaN    NaN         NaN   
4         HLS    AllPub  ...         144        0    NaN    NaN         NaN   

  MiscVal MoSold  YrSold  SaleType  SaleCondition 

In [4]:
df1 = pd.read_csv('Housing dataset 1.1.csv')
df2 = pd.read_csv('Housing dataset 1.2.csv')
df = pd.concat([df1, df2], axis=0).reset_index(drop=True)

print(f"Combined shape: {df.shape}")

Combined shape: (2919, 81)


In [5]:
print(df.isnull().sum().sort_values(ascending=False).head(20))

PoolQC          2909
MiscFeature     2814
Alley           2721
Fence           2348
MasVnrType      1766
SalePrice       1459
FireplaceQu     1420
LotFrontage      486
GarageCond       159
GarageYrBlt      159
GarageFinish     159
GarageQual       159
GarageType       157
BsmtExposure      82
BsmtCond          82
BsmtQual          81
BsmtFinType2      80
BsmtFinType1      79
MasVnrArea        23
MSZoning           4
dtype: int64


In [7]:
# Columns to drop
cols_to_drop = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
                'LotFrontage', 'Id']
df = df.drop(columns=cols_to_drop)

# Missing categorical data
categorical_cols = df.select_dtypes(include='object').columns
for col in categorical_cols:
    mode_val = df[col].mode()[0]
    df[col] = df[col].fillna(mode_val)

# Missing numeric data
numeric_cols = df.select_dtypes(include=['int64','float64']).columns
for col in numeric_cols:
    median_val = df[col].median()
    df[col] = df[col].fillna(median_val)


In [8]:
df = df[df['LotArea'] > 0]
df = df[df['YearBuilt'] > 1800]
df = df[df['GarageYrBlt'] > 1800]

# outliers
df['LotArea'] = df['LotArea'].clip(upper=30000)
df['GrLivArea'] = df['GrLivArea'].clip(upper=4000)

In [9]:

# 6. Clean Categorical Data
for col in categorical_cols:
    counts = df[col].value_counts()
    rare_categories = counts[counts < 10].index
    df[col] = df[col].replace(rare_categories, 'Other')

# Convert binary categories to 0/1
binary_cols = ['Street', 'CentralAir']
for col in binary_cols:
    df[col] = df[col].map({'Pave': 1, 'Grvl': 0, 'Y': 1, 'N': 0})

In [10]:
current_year = pd.to_datetime('now').year
df['HouseAge'] = current_year - df['YearBuilt']
df['TotalBath'] = df['FullBath'] + 0.5*df['HalfBath']
df['TotalSF'] = df['GrLivArea'] + df['TotalBsmtSF']

In [11]:
print("\nMissing Values After Cleaning:")
print(df.isnull().sum().sum())

print("\nFinal Data Types:")
print(df.dtypes.value_counts())


Missing Values After Cleaning:
0

Final Data Types:
object     36
int64      28
float64    13
Name: count, dtype: int64


In [12]:
# Save Cleaned Data
df.to_csv('cleaned combined housing data.csv', index=False)
files.download('cleaned combined housing data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>