In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the house data CSV into a DataFrame. The path is relative, so it is
# resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer='data/kc_house_data.csv')

# --- Initial Data Inspection ---
# Preview the leading rows. NOTE(review): display() is not imported in this
# file; presumably it is supplied by the notebook (IPython) session.
print("First 5 rows of the dataset:")
display(df.head(n=5))

# DataFrame.info() writes its summary (columns, non-null counts, dtypes)
# directly to stdout, so it is called bare rather than wrapped in print().
print("\nGeneral info about the dataset:")
df.info()

# --- Correlation Matrix ---
# Pairwise correlation between numeric columns only; non-numeric columns are
# filtered out first. DataFrame.corr() uses its default method (Pearson).
numeric_cols = df.select_dtypes(include=np.number).columns
corr_matrix = df[numeric_cols].corr()

# Plot the heatmap.
# 'coolwarm' is a diverging colormap, so the color scale is pinned to the full
# correlation range [-1, 1] and centered on 0. Otherwise seaborn takes the
# limits from the data, the neutral color stops meaning "no correlation", and
# positive/negative values of equal magnitude get different intensities.
plt.figure(figsize=(16, 12))
sns.heatmap(
    corr_matrix,
    annot=True,      # write the coefficient inside each cell
    fmt='.2f',       # two decimals keeps the annotations readable
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
)
plt.title('Correlation Matrix of Features')
plt.show()

# --- Visualizations ---
# Start with the target variable: a 50-bin histogram of 'price' with a KDE
# curve overlaid.
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x='price', bins=50, kde=True).set(
    title='Distribution of House Prices',
    xlabel='Price',
    ylabel='Frequency',
)
plt.show()

# Scatter plot of price against living area ('sqft_living' on x, 'price' on y).
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='sqft_living', y='price')
# Title and axis labels are applied to the axes seaborn just drew on.
plt.gca().set(
    title='Price vs. Living Area',
    xlabel='Living Area (sq. ft.)',
    ylabel='Price',
)
plt.show()

# Scatter plot of price against construction year ('yr_built' on x, 'price' on y).
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='yr_built', y='price').set(
    title='Price vs. Year Built',
    xlabel='Year Built',
    ylabel='Price',
)
plt.show()