In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 1. Load the Dataset
# The relative path is resolved against the current working directory.
df = pd.read_csv('house_prices.csv')

# Inspect the first few rows and overall info
print("Head of the dataset:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
df.info()
print("\nSummary statistics:")
print(df.describe())
print("\nMissing values per column:")
print(df.isnull().sum())

# 2. Exploratory Data Analysis
# Side-by-side histograms (20 bins, KDE overlay) of the Size and Price columns
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
histogram_panels = (
    ('Size', 'Distribution of Size'),
    ('Price', 'Distribution of Price'),
)
for ax, (column, title) in zip(axes, histogram_panels):
    # Draw on the explicit axes instead of relying on pyplot's "current axes"
    sns.histplot(df[column], kde=True, bins=20, ax=ax)
    ax.set_title(title)
plt.show()

# Horizontal boxplots of Size and Price, side by side, to spot potential outliers
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
boxplot_panels = (
    ('Size', 'Boxplot of Size'),
    ('Price', 'Boxplot of Price'),
)
for ax, (column, title) in zip(axes, boxplot_panels):
    # Draw on the explicit axes instead of relying on pyplot's "current axes"
    sns.boxplot(x=df[column], ax=ax)
    ax.set_title(title)
plt.show()

# 3. Data Preprocessing
# Separate predictors and target
X = df[['Size', 'Location', 'Number of Rooms']]
y = df['Price']

# Define which features are numerical and which are categorical
numerical_features = ['Size', 'Number of Rooms']
categorical_features = ['Location']

# Create a ColumnTransformer to:
# - Scale numerical features using StandardScaler
# - One-hot encode categorical features (drop one category to avoid multicollinearity)
#
# handle_unknown='ignore' keeps transform() from raising when it meets a
# Location value that was absent from the data the encoder was fitted on
# (possible for rare categories after a random split). Such rows are encoded
# as all zeros, i.e. they are treated like the dropped baseline category, and
# scikit-learn emits a warning. Combining `drop` with handle_unknown='ignore'
# requires scikit-learn >= 1.0.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        (
            'cat',
            OneHotEncoder(drop='first', handle_unknown='ignore'),
            categorical_features,
        ),
    ]
)

# 4. Model Training
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Chain preprocessing and ordinary least-squares regression into one estimator,
# so the preprocessing is fitted on the training rows only
model_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]

# fit() returns the fitted pipeline itself, so construction and training chain
pipeline = Pipeline(model_steps).fit(X_train, y_train)

# 5. Model Evaluation
# Score the fitted pipeline on the held-out test rows
y_pred = pipeline.predict(X_test)

# RMSE is the square root of the mean squared error; R² comes from r2_score
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\nModel Evaluation:")
# Each metric is reported with three decimal places
metric_report = (
    ("Root Mean Squared Error (RMSE): {:.3f}", rmse),
    ("R² Score: {:.3f}", r2),
)
for template, value in metric_report:
    print(template.format(value))

# 6. Feature Insights: Extracting model coefficients
# Pull the fitted one-hot encoder out of the pipeline to recover the names of
# the columns it generated for the categorical features.
fitted_preprocessor = pipeline.named_steps['preprocessor']
fitted_encoder = fitted_preprocessor.named_transformers_['cat']

# Numerical features keep their original names; the order below (numerical
# first, then encoded categorical) mirrors the transformer order declared in
# the ColumnTransformer, which is the column order the regressor was fitted on.
num_features = numerical_features
cat_features = fitted_encoder.get_feature_names_out(categorical_features)
feature_names = np.concatenate([num_features, cat_features])

# Pair every feature name with its fitted linear-regression coefficient
coefficients = pipeline.named_steps['regressor'].coef_
feature_coef = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

# Order rows from largest to smallest coefficient magnitude
magnitude_order = (
    feature_coef['Coefficient'].abs().sort_values(ascending=False).index
)
feature_coef = feature_coef.loc[magnitude_order]

print("\nFeature Insights (Coefficient Impact):")
print(feature_coef)
