In [1]:
import pandas as pd
# Load the raw training data from a CSV in the current working directory.
# Every later cell reads from and writes to this `train` frame.
train = pd.read_csv("train.csv")  

In [3]:
# Count NaNs per column, keep only columns that have at least one,
# and order them from most to fewest missing values.
missing_data = (
    train.isnull()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)

# Share of rows (in percent) that are missing for each of those columns.
missing_ratio = (missing_data / len(train)) * 100

# Side-by-side summary: absolute count and rounded percentage.
overview_columns = {
    'Missing Count': missing_data,
    'Missing Ratio (%)': missing_ratio.round(2),
}
missing_overview = pd.DataFrame(overview_columns)
missing_overview


Unnamed: 0,Missing Count,Missing Ratio (%)
PoolQC,1453,99.52
MiscFeature,1406,96.3
Alley,1369,93.77
Fence,1179,80.75
MasVnrType,872,59.73
FireplaceQu,690,47.26
LotFrontage,259,17.74
GarageType,81,5.55
GarageYrBlt,81,5.55
GarageFinish,81,5.55


In [5]:
# Categorical columns with a large share of missing values (see the overview
# table above). Their gaps are replaced by the literal label "None" so the
# columns no longer contain NaN.
# NOTE(review): presumably NaN here means "feature not present" — confirm
# against the data description.
high_missing_cols = [
    "PoolQC",
    "MiscFeature",
    "Alley",
    "Fence",
    "MasVnrType",
    "FireplaceQu",
]
train[high_missing_cols] = train[high_missing_cols].fillna(value="None")

In [8]:

# Columns we might group by when imputing LotFrontage.
grouping_candidates = ["Neighborhood", "Street", "MSZoning", "LotConfig"]


def _frontage_spread(column):
    """Summarise the per-group standard deviation of LotFrontage for one grouping column.

    Returns a dict with the column name, the number of groups, and the mean
    and maximum of the group-wise standard deviations (rounded to 2 decimals).
    """
    per_group_std = train.groupby(column)["LotFrontage"].std()
    return {
        "Grouping Column": column,
        "# of Groups": per_group_std.shape[0],
        "Avg Group Std Dev": round(per_group_std.mean(), 2),
        "Max Group Std Dev": round(per_group_std.max(), 2),
    }


# One summary record per candidate column.
comparison = [_frontage_spread(column) for column in grouping_candidates]

# Tabulate and rank: a lower average within-group spread means the grouping
# explains more of the LotFrontage variation.
comparison_df = pd.DataFrame(comparison).sort_values(by="Avg Group Std Dev")

print(comparison_df)


  Grouping Column  # of Groups  Avg Group Std Dev  Max Group Std Dev
0    Neighborhood           25              17.77              32.75
2        MSZoning            5              20.93              23.47
1          Street            2              26.81              29.37
3       LotConfig            5              27.61              40.52


In [None]:
# Visual check: distribution of LotFrontage within each Neighborhood.
import matplotlib.pyplot as plt
import seaborn as sns

# Explicit figure/axes so every styling call targets the same Axes object.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=train, x='Neighborhood', y='LotFrontage', ax=ax)
# Rotate the x tick labels so the Neighborhood names do not overlap.
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('LotFrontage Distribution by Neighborhood')
plt.show()


In [None]:
# Impute missing LotFrontage with the median LotFrontage of the row's Neighborhood.

# Step 1: median LotFrontage per Neighborhood (Series indexed by Neighborhood;
# NaN values are ignored when the median is computed).
neighborhood_medians = train.groupby("Neighborhood")["LotFrontage"].median()

# Step 2: look up each row's neighborhood median and use it only where
# LotFrontage is missing. This is a vectorised replacement for a row-wise
# `train.apply(..., axis=1)`: it avoids building a Series per row, and a
# Neighborhood that is missing or absent from `neighborhood_medians` yields NaN
# (the value simply stays missing) instead of raising KeyError.
# NOTE: a Neighborhood whose LotFrontage values are all missing has a NaN
# median, so its rows remain NaN after this step.
train["LotFrontage"] = train["LotFrontage"].fillna(
    train["Neighborhood"].map(neighborhood_medians)
)


In [10]:
# Keep a 0/1 indicator of which rows had no GarageYrBlt before the gaps are
# overwritten, then replace the missing years with 0.
# NOTE(review): presumably a missing GarageYrBlt means the house has no
# garage — confirm against the data description.
garage_year = train["GarageYrBlt"]
train["GarageYrBlt_missing"] = garage_year.isna().astype(int)
train["GarageYrBlt"] = garage_year.fillna(0)

In [None]:
# Garage-related categorical columns that contain missing values.
garage_cat_features = ["GarageType", "GarageFinish", "GarageQual", "GarageCond"]

# Replace the gaps in all of them at once with the label "None".
# NOTE(review): presumably NaN here means "no garage" — confirm.
train[garage_cat_features] = train[garage_cat_features].fillna("None")


In [11]:
# Keep a 0/1 indicator of which rows had no MasVnrArea, then replace the
# missing areas with 0. The indicator must be derived before the fill,
# otherwise the information about which rows were missing is lost.
veneer_area = train["MasVnrArea"]
train["MasVnrArea_missing"] = veneer_area.isna().astype(int)
train["MasVnrArea"] = veneer_area.fillna(0)


In [12]:
# The five categorical basement columns that are imputed together.
bsmt_features = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']

# Step 1: rows where all five basement columns are missing.
no_bsmt_mask = train[bsmt_features].isnull().all(axis=1)

# Step 2: label those rows "None" in every basement column.
# NOTE(review): presumably "all five missing" means the house has no
# basement — confirm against the data description.
train.loc[no_bsmt_mask, bsmt_features] = 'None'

# Step 3: the remaining gaps are isolated missing values in rows that do have
# basement information; fill each with the column's most common value.
# The mode is taken only from rows outside `no_bsmt_mask`, so the "None"
# placeholder written in Step 2 can never be selected as the fill value.
has_bsmt_mask = ~no_bsmt_mask
for col in bsmt_features:
    observed = train.loc[has_bsmt_mask, col].dropna()
    if observed.empty:
        # Nothing observed to take a mode from; leave the column as it is
        # instead of failing on `mode()[0]`.
        continue
    mode_val = observed.mode()[0]
    train[col] = train[col].fillna(mode_val)


In [13]:
# Electrical: replace missing entries with the column's most frequent value.
electrical = train["Electrical"]
mode_val = electrical.mode()[0]  # mode() returns a Series; [0] is its first entry
train["Electrical"] = electrical.fillna(mode_val)


In [14]:
# Persist the imputed frame (including the added *_missing indicator columns)
# to the working directory; index=False keeps the row index out of the file.
train.to_csv("train_filled.csv", index=False)