# In [None]:

# 📦 Import Packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Session-wide display configuration.
warnings.filterwarnings(action='ignore')  # suppress all warnings from here on
sns.set(style="whitegrid")  # seaborn theme used by the figures created below


def _format_two_decimals(value):
    """Return *value* rendered with exactly two decimal places."""
    return '%.2f' % value


# Have pandas render floats through the formatter above (1234.5 -> '1234.50').
pd.set_option('display.float_format', _format_two_decimals)

# 📥 Load Dataset
# Read the house-sales table from CSV into `df`, which every later section uses.
csv_path = "/content/kc_house_data.csv"
df = pd.read_csv(csv_path)

# 🧐 Initial Overview
# First look at the raw table: dimensions, column dtypes, per-column null
# counts, a preview of the leading rows, and summary statistics.
# NOTE(review): `display` is not imported in the visible code — presumably it
# is the IPython/Jupyter builtin; confirm before running as a plain script.
print("📌 Dataset shape:", df.shape)
print("\n📌 Data types:", df.dtypes, sep="\n")
print("\n📌 Missing values:", df.isna().sum(), sep="\n")
print("\n📌 First few rows:")
display(df.head())
print("\n📌 Descriptive statistics:")
display(df.describe())

# 🧼 CLEANING
# Drop exact duplicate rows, then keep only rows whose bedroom and bathroom
# counts are both positive (a 0 or NaN in either column fails the comparison).
df = df.drop_duplicates()
df = df[(df['bedrooms'] > 0) & (df['bathrooms'] > 0)]

# Compute the per-column null counts once so the printed table and the
# conclusion below are guaranteed to agree with each other.
remaining_missing = df.isnull().sum()
total_missing = int(remaining_missing.sum())

print("\n✅ Cleaning Completed")
print("Remaining missing values:")
print(remaining_missing)
print("\n--- Cleaning Conclusion ---")
print("- Removed duplicates")
print("- Removed rows with 0 bedrooms or 0 bathrooms")
# Only claim the data is complete when the counts above actually say so;
# previously this line was printed unconditionally.
if total_missing == 0:
    print("- No missing values found")
else:
    print(f"- {total_missing} missing values remain and still need handling")

# 📊 UNIVARIATE ANALYSIS
# One figure per numeric column: a 30-bin histogram with a KDE curve overlaid.
num_cols = df.select_dtypes(include="number").columns
for column in num_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(df[column], kde=True, bins=30, ax=ax)
    ax.set_title(f"Distribution of {column}")
    plt.show()

# The conclusion is fixed text; it is not derived from the plotted data.
for line in (
    "\n--- Univariate Conclusion ---",
    "- Most numerical features are right-skewed",
    "- 'price', 'sqft_living', and 'sqft_lot' have long tails",
    "- 'bedrooms' is concentrated between 2 and 4",
):
    print(line)

# 🔄 BIVARIATE ANALYSIS
# Pairwise correlations (pandas default method) between all numeric columns,
# drawn as an annotated heatmap.
corr_matrix = df.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(12,10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")
plt.show()

# Rank the other columns by their correlation with price and show the top 9.
# The 'price' entry (its self-correlation) is dropped by label rather than by
# slicing from position 1: the positional slice is only correct when price
# happens to sort first, and integer slicing on a label-indexed Series is
# easy to misread as label-based.
price_corr = corr_matrix['price'].drop(labels='price').sort_values(ascending=False)
print("\nTop correlations with price:")
print(price_corr.head(9))
# The conclusion is fixed text; it is not derived from the values above.
print("\n--- Bivariate Conclusion ---")
print("- Highest correlations: sqft_living, grade, bathrooms, sqft_above")
print("- view and waterfront also influence price positively")

# 🔁 MULTIVARIATE ANALYSIS
# One scatter plot per selected feature, each showing that single feature on
# the x-axis against price on the y-axis.
features = ['sqft_living', 'grade', 'bathrooms', 'sqft_above', 'view', 'floors']
for feature in features:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(data=df, x=feature, y='price', ax=ax)
    ax.set(title=f"Price vs {feature}", xlabel=feature, ylabel="Price")
    plt.show()

# The conclusion is fixed text; it is not derived from the plotted data.
for line in (
    "\n--- Multivariate Conclusion ---",
    "- Price increases with sqft_living, grade, and bathrooms",
    "- Some relationships are nonlinear — may benefit from transformation",
):
    print(line)

# ⚠️ OUTLIERS
# Horizontal boxplots for the two columns inspected for extreme values.
for column, plot_title in (
    ('price', "Boxplot - Price"),
    ('bedrooms', "Boxplot - Bedrooms"),
):
    plt.figure(figsize=(10, 5))
    sns.boxplot(x=df[column])
    plt.title(plot_title)
    plt.show()

# The conclusion is fixed text; the thresholds quoted are not computed here.
for line in (
    "\n--- Outliers Conclusion ---",
    "- Some extreme outliers: prices > $3M, bedrooms > 10",
    "- Outliers may skew modeling if not handled",
):
    print(line)

# 🗺️ MAPPING
# Plot every row at its (long, lat) position, coloured by price on the
# viridis palette; alpha < 1 keeps overlapping points distinguishable.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x="long",
    y="lat",
    hue="price",
    palette="viridis",
    alpha=0.6,
    ax=ax,
)
ax.set_title("House Locations (colored by price)")
plt.show()

# The conclusion is fixed text; it is not derived from the plotted data.
for line in (
    "\n--- Mapping Conclusion ---",
    "- High-priced houses are located near specific geographical areas",
    "- Some clustering visible in lat/long space",
):
    print(line)

# 🧠 FINAL INSIGHTS
# Closing summary. Only the first bullet is computed (row and column counts of
# the cleaned frame; the column count includes every column, not just model
# inputs). The remaining bullets are fixed text.
row_count, column_count = df.shape
for line in (
    "\n--- Key Insights ---",
    f"- Dataset contains {row_count} houses and {column_count} features",
    "- Most important variables for price: sqft_living, grade, bathrooms",
    "- Waterfront houses are rare and very expensive",
    "- Extreme outliers exist and need preprocessing",
    "- Geographical data (lat/long) can help with spatial analysis",
):
    print(line)
