In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

In [None]:
# Enable inline plots: IPython magic that renders matplotlib figures directly
# in the notebook output area instead of in a separate window.
%matplotlib inline

In [29]:
# Load dataset
# The CSV location can be overridden through the HOUSING_CSV environment
# variable so the notebook can run on machines other than the author's.
# When the variable is unset, the original local path is used unchanged.
DEFAULT_HOUSING_CSV = r"C:\Users\lione\OneDrive\Desktop\Academic city University\Level 300\Machine learning\Assignment\1553768847-housing.csv"
file_path = os.environ.get("HOUSING_CSV", DEFAULT_HOUSING_CSV)
housing = pd.read_csv(file_path)


In [31]:
# Preview the first five rows to sanity-check the columns that were loaded
print(housing.head(n=5))

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                  41          880           129.0   
1    -122.22     37.86                  21         7099          1106.0   
2    -122.24     37.85                  52         1467           190.0   
3    -122.25     37.85                  52         1274           235.0   
4    -122.25     37.85                  52         1627           280.0   

   population  households  median_income ocean_proximity  median_house_value  
0         322         126         8.3252        NEAR BAY              452600  
1        2401        1138         8.3014        NEAR BAY              358500  
2         496         177         7.2574        NEAR BAY              352100  
3         558         219         5.6431        NEAR BAY              341300  
4         565         259         3.8462        NEAR BAY              342200  


In [33]:
# Count the missing (NaN) entries in every column
print(housing.isna().sum())

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
median_house_value      0
dtype: int64


In [None]:
# Fill missing values in 'total_bedrooms' with the column median.
# The result is assigned back to the column explicitly: calling
# fillna(..., inplace=True) on a column selection is chained assignment,
# which warns in pandas 2.x and leaves the DataFrame unchanged when
# Copy-on-Write is active (the default from pandas 3).
housing['total_bedrooms'] = housing['total_bedrooms'].fillna(
    housing['total_bedrooms'].median()
)

In [None]:
# One-hot encode the categorical columns ('ocean_proximity'); the first
# level of each encoded column is dropped (drop_first=True).
housing = pd.get_dummies(data=housing, drop_first=True)

In [None]:
# Feature Engineering: derive ratio features from the raw count columns.
# The three new columns are appended in the order listed below.
housing = housing.assign(
    rooms_per_household=housing['total_rooms'].div(housing['households']),
    bedrooms_per_room=housing['total_bedrooms'].div(housing['total_rooms']),
    population_per_household=housing['population'].div(housing['households']),
)

In [41]:
# Remove the raw count columns from the working frame
housing = housing.drop(
    columns=['total_rooms', 'total_bedrooms', 'population', 'households']
)

In [43]:
# Separate the target ('median_house_value') from the predictor columns
y = housing['median_house_value']
X = housing.drop(columns=['median_house_value'])


In [45]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [47]:
# Standardize the features: the scaler is fitted on the training split only,
# and the fitted scaler is then applied to both the training and test splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [49]:
# Candidate regressors, keyed by the display name used when reporting scores.
# Estimators that accept a seed are given random_state=42.
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Decision Tree Regressor', DecisionTreeRegressor(random_state=42)),
    ('Random Forest Regressor', RandomForestRegressor(random_state=42)),
    ('Gradient Boosting Regressor', GradientBoostingRegressor(random_state=42)),
    ('Support Vector Regressor', SVR()),
])

In [51]:
# 5-fold cross-validation of every candidate on the scaled training data.
# The scorer returns negated MSE, so the sign is flipped before taking the
# square root to report RMSE as mean ± standard deviation across folds.
for name, model in models.items():
    cv_scores = cross_val_score(
        estimator=model,
        X=X_train_scaled,
        y=y_train,
        scoring='neg_mean_squared_error',
        cv=5,
    )
    rmse_scores = np.sqrt(np.negative(cv_scores))
    print(f"{name} - RMSE: {rmse_scores.mean():.2f} ± {rmse_scores.std():.2f}")

Linear Regression - RMSE: 70628.68 ± 729.29
Decision Tree Regressor - RMSE: 70078.34 ± 451.78
Random Forest Regressor - RMSE: 49863.19 ± 383.49
Gradient Boosting Regressor - RMSE: 52779.41 ± 725.34
Support Vector Regressor - RMSE: 118367.12 ± 1609.11


In [None]:
# Fit every candidate on the full scaled training split and report its RMSE
# on the held-out test split.
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
    print(f"{name} Test Set RMSE: {rmse:.2f}")


In [None]:
# Visualization
# Histogram (30 bins) with a KDE overlay of the target variable
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(housing['median_house_value'], bins=30, kde=True, color='blue', ax=ax)
ax.set_title('Distribution of Median House Value')
ax.set_xlabel('Median House Value')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()  # Ensure plot is displayed

# Annotated heatmap of the pairwise correlations between all columns
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(
    housing.corr(),
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()  # Ensure plot is displayed

# Scatter plot of median income against the target variable
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x=housing['median_income'],
    y=housing['median_house_value'],
    alpha=0.5,
    color='red',
    ax=ax,
)
ax.set_title('Median Income vs Median House Value')
ax.set_xlabel('Median Income')
ax.set_ylabel('Median House Value')
ax.grid(True)
plt.show()  # Ensure plot is displayed
