In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error , r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import IsolationForest
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
pip install xgboost

In [None]:
# Load the raw housing table from the notebook's working directory.
data = pd.read_csv("housing.csv")

# Preview only the first rows instead of emitting the whole frame as output.
data.head()

In [None]:
# Print the frame summary: column dtypes, non-null counts and memory usage.
data.info()

In [None]:
# How many rows carry each ocean_proximity label.
proximity_labels = data["ocean_proximity"]
proximity_labels.value_counts()

In [None]:
# Integer code assigned to each ocean_proximity label.
# NOTE(review): the numbering is arbitrary, not an ordering of the categories.
category_map = {
    '<1H OCEAN': 1,
    'INLAND': 2,
    'NEAR OCEAN': 3,
    'NEAR BAY': 4,
    'ISLAND': 5
}

# Encode only while the column still holds the raw labels. Re-running a plain
# .map() on an already-encoded column would turn every value into NaN, because
# the integer codes are not keys of category_map.
if not pd.api.types.is_numeric_dtype(data["ocean_proximity"]):
    encoded = data["ocean_proximity"].map(category_map)

    # A label missing from category_map would silently become NaN; fail loudly
    # instead so the mapping can be extended.
    unmapped = encoded.isna() & data["ocean_proximity"].notna()
    if unmapped.any():
        unknown_labels = sorted(data.loc[unmapped, "ocean_proximity"].astype(str).unique())
        raise ValueError(f"Unmapped ocean_proximity labels: {unknown_labels}")

    data["ocean_proximity"] = encoded


In [None]:
# Summary statistics of the numeric columns. ocean_proximity is included here
# because the mapping cell above replaced its labels with integer codes.
data.describe()

In [None]:
# Show the column labels, then how many columns there are.
column_labels = data.columns
print(column_labels)
print(data.shape[1])

In [None]:
# Number of missing values in each column.
missing_mask = data.isna()
missing_mask.sum()

In [None]:
# Box plot of every column of the unscaled frame on one shared axis.
# The frame is passed by keyword, as in the later box-plot cells, so it cannot
# be bound to a different positional parameter by another seaborn version.
sns.boxplot(data=data)

In [None]:
# Columns to rescale. This is every column of the table, including the
# integer-coded ocean_proximity and the target median_house_value.
scaled_columns = [
    'longitude', 'latitude', 'housing_median_age', 'total_rooms',
    'total_bedrooms', 'population', 'households', 'median_income',
    'median_house_value', 'ocean_proximity',
]

# NOTE(review): the scaler is fitted on the full table, before the train/test
# split made later in the notebook, and it also rescales the target.
scaler = MinMaxScaler()
n_data = scaler.fit_transform(data[scaled_columns])

# fit_transform returns a bare array; wrap it back into a labelled frame.
n_df = pd.DataFrame(n_data, columns=scaled_columns)
n_df

In [None]:
# Box plots of the min-max scaled columns, one box per column.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=n_df, ax=ax)

# Slant the column names so neighbouring tick labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Fill missing values in the scaled frame with k-nearest-neighbour imputation.
numeric_data = n_df.select_dtypes(include=['float64', 'int64'])

# Each gap is filled from the 3 most similar rows.
imputer = KNNImputer(n_neighbors=3)
imputed_array = imputer.fit_transform(numeric_data)

# Carry over the source index: the write-back below aligns rows by index label,
# so a fresh default index would inject NaN wherever the labels differ.
imputed_df = pd.DataFrame(
    imputed_array,
    columns=numeric_data.columns,
    index=numeric_data.index,
)

# Write the imputed values back into the working frame.
n_df[numeric_data.columns] = imputed_df

In [None]:

# Unsupervised outlier screen over every column of the scaled frame
# (the frame still contains the target median_house_value at this point).
# contamination=0.02 asks the forest to flag about 2% of the rows.
iso = IsolationForest(random_state=42, contamination=0.02)
outliers = iso.fit_predict(n_df)

# fit_predict marks inliers with 1 and outliers with -1; keep the inliers.
is_inlier = outliers == 1
nc_df = n_df[is_inlier]

In [None]:
# ocean_proximity is a nominal category stored as an arbitrary (scaled) code,
# so quartile/IQR fences on it are not meaningful: which categories fall above
# the fence depends only on how the labels happened to be numbered, and a
# common category can be discarded wholesale. Treat "outlier" as "rare
# category" instead and drop categories that cover too few rows.
MIN_CATEGORY_SHARE = 0.01  # categories below 1% of the rows are dropped

# Share of rows per category, then looked up for every row.
category_share = nc_df["ocean_proximity"].value_counts(normalize=True)
row_share = nc_df["ocean_proximity"].map(category_share)

# Keep only rows whose category is frequent enough.
nc_df = nc_df[row_share >= MIN_CATEGORY_SHARE]

In [None]:
# Box plots of the frame that remains after the outlier filters.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=nc_df, ax=ax)

# Slant the column names so neighbouring tick labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between all columns of the filtered frame.
corr_matrix = nc_df.corr()

# Annotated heatmap with a diverging palette centred on zero correlation.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', center=0, ax=ax)
ax.set_title("Correlation Heatmap (Features vs. Target)")
plt.show()

In [None]:
# One-hot encode the proximity code; drop_first omits the first level's column.
# The new column names are built from the scaled code values
# (e.g. "ocean_proximity_0.25").
categorical_columns = ["ocean_proximity"]
df_encoded = pd.get_dummies(data=nc_df, columns=categorical_columns, drop_first=True)

In [None]:
# Hand-picked numeric predictors.
base_features = [
    "median_income",
    "latitude",
    "housing_median_age",  # Optional: remove if it does not help the model
]

# Every one-hot column that get_dummies created for ocean_proximity.
one_hot_features = [name for name in df_encoded.columns if "ocean_proximity_" in name]

final_features = base_features + one_hot_features

# Independent copy, so adding columns below does not touch df_encoded.
df_final1 = df_encoded[final_features].copy()

# Attach the target; rows line up because both frames share nc_df's index.
df_final1["median_house_value"] = nc_df["median_house_value"]

In [None]:
# Correlation of each column with the target, strongest positive first.
target_column = df_final1.corr()["median_house_value"]
correlation = target_column.sort_values(ascending=False)
print(correlation)

In [None]:
# Baseline linear regression on income, latitude and ocean proximity.
# LinearRegression and train_test_split come from the notebook's import cell.

# Discover the one-hot proximity columns instead of hard-coding their names.
# The names are generated from the scaled category codes, so a hard-coded name
# such as "ocean_proximity_0.75" raises KeyError as soon as an upstream filter
# removes that category, and listing only some of the dummies silently merges
# the remaining categories into the baseline.
proximity_features = [
    col for col in df_final1.columns if col.startswith("ocean_proximity_")
]
feature_columns = ["median_income", "latitude"] + proximity_features

X = df_final1[feature_columns]
y = df_final1["median_house_value"]

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit on the training rows only.
model = LinearRegression()
model.fit(X_train, y_train)

# R² on the held-out rows.
print(f"R² Score: {model.score(X_test, y_test):.3f}")

In [None]:
# Interaction feature: element-wise product of income and latitude.
# NOTE(review): X was built in the previous cell, so the models fitted in this
# notebook do not see this column unless X is rebuilt.
df_final1["income_x_latitude"] = df_final1["median_income"].mul(df_final1["latitude"])

In [None]:
# Log-transformed income feature, stored on df_final1 (the frame built above;
# no frame named df_final is defined in this notebook).
# median_income was min-max scaled earlier, so its values lie in [0, 1] and may
# include 0, where a plain log gives -inf. log1p(x) = log(1 + x) stays finite.
df_final1["log_median_income"] = np.log1p(df_final1["median_income"])

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on the same split as the linear model; the seed makes the
# fitted forest reproducible.
rf = RandomForestRegressor(random_state=42)
rf = rf.fit(X_train, y_train)  # fit() returns the estimator itself

# R² on the held-out rows.
rf_r2 = rf.score(X_test, y_test)
print(f"R²: {rf_r2:.3f}")

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees on the same split as the other models.
xgb = XGBRegressor(random_state=42)
xgb = xgb.fit(X_train, y_train)  # fit() returns the estimator itself

# R² on the held-out rows.
xgb_r2 = xgb.score(X_test, y_test)
print(f"R²: {xgb_r2:.3f}")

In [None]:
# Residual diagnostics for the linear model (seaborn is already imported in the
# notebook's import cell).
predictions = model.predict(X_test)
residuals = y_test - predictions

fig, ax = plt.subplots()
sns.scatterplot(x=predictions, y=residuals, ax=ax)

# Zero line: points above it are under-predicted, points below over-predicted.
ax.axhline(y=0, color='r', linestyle='--')

# Label the axes explicitly. Without this the y axis inherits the Series name
# "median_house_value" even though it shows residuals. The target was min-max
# scaled earlier, so both axes are in scaled units.
ax.set_xlabel("Predicted median_house_value (scaled)")
ax.set_ylabel("Residual (actual - predicted)")
ax.set_title("Linear regression residuals")
plt.show()