<a href="https://colab.research.google.com/github/Siyaram322/numpy_project/blob/main/Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the dataset
df = pd.read_csv('housing.csv.zip')


# Identify numerical and categorical columns
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
categorical_cols = df.select_dtypes(include='object').columns.tolist()

# Assuming 'median_house_value' is the target column name, remove it from features if present
if 'median_house_value' in numerical_cols:
    numerical_cols.remove('median_house_value')

# 2. Handling Missing Values and Feature Engineering
# Create preprocessing pipelines for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define Features (X) and Target (y)
X = df.drop('median_house_value', axis=1) # Corrected column name
y = df['median_house_value'] # Corrected column name

# 3. Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Apply preprocessing (imputation, scaling, one-hot encoding)
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print("Data Preprocessing Complete.")

Data Preprocessing Complete.


In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd

# Define the models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "SVR": SVR(kernel='rbf')
}

# Dictionary to store results
results = []

# Loop through each model
for name, model in models.items():
    # Train
    model.fit(X_train_processed, y_train)

    # Predict
    predictions = model.predict(X_test_processed)

    # Evaluate
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    results.append({"Model": name, "MSE": mse, "MAE": mae, "R2": r2})

# Convert results to a DataFrame for comparison
results_df = pd.DataFrame(results).sort_values(by="R2", ascending=False)
print("\nModel Comparison Table:")
print(results_df)


Model Comparison Table:
               Model           MSE           MAE        R2
2      Random Forest  2.395290e+09  31628.407311  0.817210
3  Gradient Boosting  3.125159e+09  38278.148174  0.761513
1      Decision Tree  4.785287e+09  43604.014293  0.634825
0  Linear Regression  4.908291e+09  50670.489236  0.625438
4                SVR  1.366967e+10  87042.383432 -0.043161
