In [None]:
import pandas as pd

In [None]:
# Read the CIDCO dataset CSV into the notebook's working DataFrame.
dataset_path = '/content/CIDCO FINAL DATASET.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Preview the first rows without the applicant-identifying columns, so that
# personal data (names, application IDs) is not rendered into the saved
# notebook output. `errors='ignore'` keeps the preview working if either
# column is absent; `df` itself is left unchanged.
df.drop(columns=['application_ID', 'applicant_name'], errors='ignore').head()

Unnamed: 0,application_ID,applicant_name,category,Location,RERA_Number,Nearest_Railway_Station,Distance_from_Railway_Station_km,Type,Carpet_Area_sqft,No_of_Towers,...,distance_category,price_per_sqft,price_efficiency,has_rera,location_popularity,location_type,Nearest_Hospital,Hospital_Distance,Nearest_School,School_Distance
0,'240100828416,Sheetal Shamrao Patil,General,Kharghar (E) Taloja Sector 37,P52000028240,Taloje Panchnand Railway Station,3.0,Ews,322,43,...,moderate,9906.832298,affordable,1,64,standard,Sri Sathya Sai Sanjeevani Hospital,0.57 km,Arqam English School (CBSE),0.77 km
1,'240100139779,Smital Shubham Otari,ST (SCHEDULED TRIBES),Panvel (W) Bus Terminus,P52000028389,Panvel Railway Station,1.0,Lig,322,11,...,near,13012.42236,expensive,1,64,standard,LifeLine Hospital Panvel,1.5 km,St. Joseph’s High School,1.2 km
2,'240100855566,Amol Nana Barkade,ST (SCHEDULED TRIBES),Kharghar Station,P52000033542,Kharghar Railway Station,0.5,Lig A,398,4,...,near,8592.964824,affordable,1,180,popular,MGM Hospital Kharghar,1.0 km,Ryan International School,0.9 km
3,'240100960014,Saurabh Devidas Palve,General,Bamandongri,P52000028209,Bamandongri Railway Station,0.5,Ews,322,3,...,near,11987.57764,affordable,1,62,standard,Matoshree Multispeciality Hospital,2.0 km,Radcliffe School Ulwe,1.8 km
4,'240100640019,Amit Laxman Dhaigude,RM (RELIGIOUS MINORITIES),Kharkopar Plot 3 Sector 16,P52000028420,Kharkopar Railway Station,2.0,Lig,322,19,...,near,12515.52795,affordable,1,59,standard,Indravati Hospital,2.3 km,Radcliffe School,1.5 km


## Dataset preparation


In [None]:
from sklearn.model_selection import train_test_split

# Predictor columns for the price-per-square-foot regression; the two
# categorical ones are one-hot encoded below.
feature_columns = ['Carpet_Area_sqft', 'No_of_Towers', 'distance_category', 'location_popularity', 'location_type']
categorical_columns = ['distance_category', 'location_type']

y = df['price_per_sqft']
# drop_first=True removes one indicator column per categorical feature.
X = pd.get_dummies(df[feature_columns], columns=categorical_columns, drop_first=True)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the recorded results further down report a test R² of 1.00 for
# the polynomial and boosted-tree models. That may mean near-identical rows end
# up on both sides of this random split; confirm, and consider a group-aware split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the size of each partition.
for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Shape of {split_name}:", split_data.shape)

Shape of X_train: (800, 6)
Shape of X_test: (201, 6)
Shape of y_train: (800,)
Shape of y_test: (201,)


In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.decomposition import PCA

# Baseline regressors with fixed hyperparameters (no tuning in this cell).
lr = LinearRegression()
poly = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
ridge = Ridge(random_state=42)
lasso = Lasso(random_state=42)
elastic_net = ElasticNet(random_state=42)
# Both boosted-tree models are configured with the same settings.
boosting_settings = dict(random_state=42, n_estimators=100, learning_rate=0.1, max_depth=3)
xgb_model = XGBRegressor(objective='reg:squarederror', **boosting_settings)
gbr_model = GradientBoostingRegressor(**boosting_settings)
svr_model = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
# Principal component regression: standardise, project onto 5 components, then fit OLS.
pca = PCA(n_components=5)
pcr_model = make_pipeline(StandardScaler(), pca, LinearRegression())

# The order of this tuple fixes the order of fitting and predicting below.
baseline_models = (
    lr, poly, ridge, lasso, elastic_net,
    xgb_model, gbr_model, svr_model, pcr_model,
)

# Fit every model on the training split.
for model in baseline_models:
    model.fit(X_train, y_train)

# Predict the held-out split with every fitted model (same order as above).
(
    y_pred_lr, y_pred_poly, y_pred_ridge, y_pred_lasso, y_pred_elastic_net,
    y_pred_xgb, y_pred_gbr, y_pred_svr, y_pred_pcr,
) = (model.predict(X_test) for model in baseline_models)


def _baseline_scores(predictions):
    """Return ``(MSE, R²)`` of ``predictions`` measured against ``y_test``."""
    return mean_squared_error(y_test, predictions), r2_score(y_test, predictions)


# Score each model on the test split.
mse_lr, r2_lr = _baseline_scores(y_pred_lr)
mse_poly, r2_poly = _baseline_scores(y_pred_poly)
mse_ridge, r2_ridge = _baseline_scores(y_pred_ridge)
mse_lasso, r2_lasso = _baseline_scores(y_pred_lasso)
mse_elastic_net, r2_elastic_net = _baseline_scores(y_pred_elastic_net)
mse_xgb, r2_xgb = _baseline_scores(y_pred_xgb)
mse_gbr, r2_gbr = _baseline_scores(y_pred_gbr)
mse_svr, r2_svr = _baseline_scores(y_pred_svr)
mse_pcr, r2_pcr = _baseline_scores(y_pred_pcr)

# One summary line per model.
for model_label, model_mse, model_r2 in (
    ("Linear Regression", mse_lr, r2_lr),
    ("Polynomial Regression (degree=2)", mse_poly, r2_poly),
    ("Ridge Regression", mse_ridge, r2_ridge),
    ("Lasso Regression", mse_lasso, r2_lasso),
    ("Elastic Net Regression", mse_elastic_net, r2_elastic_net),
    ("XGBoost Regressor", mse_xgb, r2_xgb),
    ("Gradient Boosting Regressor", mse_gbr, r2_gbr),
    ("SVR", mse_svr, r2_svr),
    ("Principal Component Regression", mse_pcr, r2_pcr),
):
    print(f"{model_label} - MSE: {model_mse:.2f}, R²: {model_r2:.2f}")

Linear Regression - MSE: 8450487.22, R²: 0.42
Polynomial Regression (degree=2) - MSE: 0.00, R²: 1.00
Ridge Regression - MSE: 8459432.07, R²: 0.42
Lasso Regression - MSE: 8451248.23, R²: 0.42
Elastic Net Regression - MSE: 10253979.09, R²: 0.30
XGBoost Regressor - MSE: 380.75, R²: 1.00
Gradient Boosting Regressor - MSE: 466.49, R²: 1.00
SVR - MSE: 3254124.73, R²: 0.78
Principal Component Regression - MSE: 8800238.93, R²: 0.40


## Hyperparameter tuning using GridSearchCV



In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.decomposition import PCA


# Candidate hyperparameter values explored for each tunable model.
param_grid_ridge = dict(alpha=[0.1, 1.0, 10.0])
param_grid_lasso = dict(alpha=[0.1, 1.0, 10.0])
param_grid_elastic_net = dict(alpha=[0.1, 1.0, 10.0], l1_ratio=[0.2, 0.5, 0.8])
# Pipeline parameters are addressed as "<step name>__<parameter>".
param_grid_poly = dict(polynomialfeatures__degree=[1, 2, 3])
param_grid_xgb = dict(n_estimators=[50, 100, 200], learning_rate=[0.01, 0.1, 0.2], max_depth=[3, 5, 7])
param_grid_gbr = dict(n_estimators=[50, 100, 200], learning_rate=[0.01, 0.1, 0.2], max_depth=[3, 5, 7])
param_grid_svr = dict(C=[0.1, 1, 10, 100], gamma=['scale', 'auto', 0.1, 1], kernel=['rbf'])
param_grid_pcr = dict(pca__n_components=[2, 3, 4, 5])


def _mse_grid_search(estimator, param_grid):
    """Wrap ``estimator`` in a 5-fold grid search scored by negative MSE."""
    return GridSearchCV(estimator, param_grid, cv=5, scoring='neg_mean_squared_error')


# Plain linear regression has nothing to tune, so it is fitted directly.
lr = LinearRegression()
grid_ridge = _mse_grid_search(Ridge(random_state=42), param_grid_ridge)
grid_lasso = _mse_grid_search(Lasso(random_state=42), param_grid_lasso)
grid_elastic_net = _mse_grid_search(ElasticNet(random_state=42), param_grid_elastic_net)
grid_poly = _mse_grid_search(make_pipeline(PolynomialFeatures(), LinearRegression()), param_grid_poly)
grid_xgb = _mse_grid_search(XGBRegressor(objective='reg:squarederror', random_state=42), param_grid_xgb)
grid_gbr = _mse_grid_search(GradientBoostingRegressor(random_state=42), param_grid_gbr)
grid_svr = _mse_grid_search(SVR(), param_grid_svr)
grid_pcr = _mse_grid_search(make_pipeline(StandardScaler(), PCA(), LinearRegression()), param_grid_pcr)

# Fit the untuned baseline first, then run every search on the training split.
lr.fit(X_train, y_train)
for search in (
    grid_ridge, grid_lasso, grid_elastic_net, grid_poly,
    grid_xgb, grid_gbr, grid_svr, grid_pcr,
):
    search.fit(X_train, y_train)

# Predict the test split with the baseline and with each search's best estimator.
y_pred_lr = lr.predict(X_test)
(
    y_pred_ridge, y_pred_lasso, y_pred_elastic_net, y_pred_poly,
    y_pred_xgb, y_pred_gbr, y_pred_svr, y_pred_pcr,
) = (
    search.best_estimator_.predict(X_test)
    for search in (
        grid_ridge, grid_lasso, grid_elastic_net, grid_poly,
        grid_xgb, grid_gbr, grid_svr, grid_pcr,
    )
)


def _tuned_scores(predictions):
    """Return ``(MSE, R²)`` of ``predictions`` measured against ``y_test``."""
    return mean_squared_error(y_test, predictions), r2_score(y_test, predictions)


# Test-set metrics for every model.
mse_lr, r2_lr = _tuned_scores(y_pred_lr)
mse_ridge, r2_ridge = _tuned_scores(y_pred_ridge)
mse_lasso, r2_lasso = _tuned_scores(y_pred_lasso)
mse_elastic_net, r2_elastic_net = _tuned_scores(y_pred_elastic_net)
mse_poly, r2_poly = _tuned_scores(y_pred_poly)
mse_xgb, r2_xgb = _tuned_scores(y_pred_xgb)
mse_gbr, r2_gbr = _tuned_scores(y_pred_gbr)
mse_svr, r2_svr = _tuned_scores(y_pred_svr)
mse_pcr, r2_pcr = _tuned_scores(y_pred_pcr)

# Summary: the baseline first, then each tuned model with its selected parameters.
print(f"Linear Regression - MSE: {mse_lr:.2f}, R-squared: {r2_lr:.2f}")
for tuned_label, tuned_search, tuned_mse, tuned_r2 in (
    ("Ridge Regressor", grid_ridge, mse_ridge, r2_ridge),
    ("Lasso Regressor", grid_lasso, mse_lasso, r2_lasso),
    ("Elastic Net Regressor", grid_elastic_net, mse_elastic_net, r2_elastic_net),
    ("Polynomial Regression", grid_poly, mse_poly, r2_poly),
    ("XGBoost Regressor", grid_xgb, mse_xgb, r2_xgb),
    ("Gradient Boosting Regressor", grid_gbr, mse_gbr, r2_gbr),
    ("SVR", grid_svr, mse_svr, r2_svr),
    ("Principal Component Regression", grid_pcr, mse_pcr, r2_pcr),
):
    print(f"{tuned_label} (best params: {tuned_search.best_params_}) - MSE: {tuned_mse:.2f}, R-squared: {tuned_r2:.2f}")

Linear Regression - MSE: 8450487.22, R-squared: 0.42
Ridge Regressor (best params: {'alpha': 0.1}) - MSE: 8451249.36, R-squared: 0.42
Lasso Regressor (best params: {'alpha': 1.0}) - MSE: 8451248.23, R-squared: 0.42
Elastic Net Regressor (best params: {'alpha': 0.1, 'l1_ratio': 0.8}) - MSE: 8674252.63, R-squared: 0.41
Polynomial Regression (best params: {'polynomialfeatures__degree': 3}) - MSE: 0.00, R-squared: 1.00
XGBoost Regressor (best params: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 100}) - MSE: 0.00, R-squared: 1.00
Gradient Boosting Regressor (best params: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200}) - MSE: 0.00, R-squared: 1.00
SVR (best params: {'C': 100, 'gamma': 1, 'kernel': 'rbf'}) - MSE: 2391436.51, R-squared: 0.84
Principal Component Regression (best params: {'pca__n_components': 5}) - MSE: 8800238.93, R-squared: 0.40


## Experiment tracking with MLflow



In [None]:
!pip install mlflow --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m76.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m90.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.1/329.1 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.0/85.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.2/86.2 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import mlflow
import mlflow.sklearn

# Re-read the dataset. From this cell on, df, X, y and the train/test split are
# rebound: X now holds the raw (un-encoded) feature columns, because scaling
# and one-hot encoding are done inside each pipeline by `preprocessor`.
df = pd.read_csv("/content/CIDCO FINAL DATASET.csv")

# Regression target and the feature columns used to predict it.
target_col = "price_per_sqft"
X = df[['Carpet_Area_sqft', 'No_of_Towers', 'distance_category', 'location_popularity', 'location_type']]
y = df[target_col]

# Partition the features into numeric and non-numeric columns. Selecting by
# "number" / "not number" (instead of the exact dtypes int64/float64/object)
# guarantees every feature is routed to one of the two transformers; a column
# with any other dtype would otherwise be silently dropped by ColumnTransformer,
# whose default is remainder="drop".
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(exclude="number").columns

# Shared preprocessing: standardise numeric columns and one-hot encode the rest.
# drop='first' removes one indicator per categorical column, and
# handle_unknown="ignore" lets categories unseen during fit be transformed
# without raising an error.
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore", drop='first'), cat_cols)
])

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === Gradient Boosting Regressor Model Pipeline ===
# Only random_state is set, so the regressor runs with the library defaults.
gbr_model_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", GradientBoostingRegressor(random_state=42))
])

with mlflow.start_run(run_name="Gradient Boosting Regressor"):
    # Train on the training split and evaluate on the held-out split.
    gbr_model_pipeline.fit(X_train, y_train)
    y_pred = gbr_model_pipeline.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2  = r2_score(y_test, y_pred)

    # Record the hyperparameters the fitted regressor actually used.
    gbr = gbr_model_pipeline.named_steps["regressor"]
    mlflow.log_param("n_estimators", gbr.n_estimators)
    mlflow.log_param("learning_rate", gbr.learning_rate)
    mlflow.log_param("max_depth", gbr.max_depth)

    # Record the test-set metrics.
    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("R2", r2)

    # Store the whole pipeline (preprocessing + regressor) as the run's model artifact.
    mlflow.sklearn.log_model(gbr_model_pipeline, "Gradient_Boosting_Regressor_Model")

print("Gradient Boosting Regressor model with preprocessing logged to MLflow.")



Gradient Boosting Regressor model with preprocessing logged to MLflow.


In [None]:
# 1. Linear Regression
# Shared preprocessor followed by an ordinary least-squares regressor.
linear_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression()),
])

with mlflow.start_run(run_name="Linear Regression"):
    # Fit on the training split, then predict the held-out split.
    y_pred = linear_model.fit(X_train, y_train).predict(X_test)

    # Test-set error metrics.
    mse, mae, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )
    mlflow.log_metrics({"MSE": mse, "MAE": mae, "R2": r2})

    # Store the full pipeline as the run's model artifact.
    mlflow.sklearn.log_model(linear_model, "Linear_Regression_Model")

print(" Linear Regression model with preprocessing logged to MLflow.")



 Linear Regression model with preprocessing logged to MLflow.


In [None]:
# 2. Ridge Regression
# Shared preprocessor followed by a Ridge regressor with its default alpha.
ridge_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", Ridge(random_state=42)),
])

with mlflow.start_run(run_name="Ridge Regression"):
    # Fit on the training split, then predict the held-out split.
    y_pred = ridge_model.fit(X_train, y_train).predict(X_test)

    # Test-set error metrics.
    mse, mae, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )

    # Record the regularisation strength used by the fitted regressor.
    ridge_regressor = ridge_model.named_steps["regressor"]
    mlflow.log_param("alpha", ridge_regressor.alpha)

    mlflow.log_metrics({"MSE": mse, "MAE": mae, "R2": r2})

    # Store the full pipeline as the run's model artifact.
    mlflow.sklearn.log_model(ridge_model, "Ridge_Regression_Model")

print("✅ Ridge Regression model with preprocessing logged to MLflow.")



✅ Ridge Regression model with preprocessing logged to MLflow.


In [None]:
# 3. Lasso Regression
# Shared preprocessor followed by a Lasso regressor with its default alpha.
lasso_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", Lasso(random_state=42)),
])

with mlflow.start_run(run_name="Lasso Regression"):
    # Fit on the training split, then predict the held-out split.
    y_pred = lasso_model.fit(X_train, y_train).predict(X_test)

    # Test-set error metrics.
    mse, mae, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )

    # Record the regularisation strength used by the fitted regressor.
    lasso_regressor = lasso_model.named_steps["regressor"]
    mlflow.log_param("alpha", lasso_regressor.alpha)

    mlflow.log_metrics({"MSE": mse, "MAE": mae, "R2": r2})

    # Store the full pipeline as the run's model artifact.
    mlflow.sklearn.log_model(lasso_model, "Lasso_Regression_Model")

print("✅ Lasso Regression model with preprocessing logged to MLflow.")



✅ Lasso Regression model with preprocessing logged to MLflow.


In [None]:
# 4. Elastic Net Regression
# Shared preprocessor followed by an ElasticNet regressor with default penalties.
elastic_net_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", ElasticNet(random_state=42)),
])

with mlflow.start_run(run_name="Elastic Net Regression"):
    # Fit on the training split, then predict the held-out split.
    y_pred = elastic_net_model.fit(X_train, y_train).predict(X_test)

    # Test-set error metrics.
    mse, mae, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )

    # Record the penalty settings used by the fitted regressor.
    elastic_net_regressor = elastic_net_model.named_steps["regressor"]
    mlflow.log_params({
        "alpha": elastic_net_regressor.alpha,
        "l1_ratio": elastic_net_regressor.l1_ratio,
    })

    mlflow.log_metrics({"MSE": mse, "MAE": mae, "R2": r2})

    # Store the full pipeline as the run's model artifact.
    mlflow.sklearn.log_model(elastic_net_model, "Elastic_Net_Regression_Model")

print("✅ Elastic Net Regression model with preprocessing logged to MLflow.")



✅ Elastic Net Regression model with preprocessing logged to MLflow.


In [None]:
# 5. Polynomial Regression (degree=2)
# Preprocess, expand to degree-2 polynomial terms (no bias column), then fit OLS.
poly_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("poly_features", PolynomialFeatures(degree=2, include_bias=False)),
    ("regressor", LinearRegression()),
])

with mlflow.start_run(run_name="Polynomial Regression (Simple)"):
    # Fit on the training split, then predict the held-out split.
    y_pred = poly_model.fit(X_train, y_train).predict(X_test)

    # Test-set error metrics.
    mse, mae, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )

    # Record the polynomial degree used by the feature-expansion step.
    poly_step = poly_model.named_steps["poly_features"]
    mlflow.log_param("degree", poly_step.degree)

    mlflow.log_metrics({"MSE": mse, "MAE": mae, "R2": r2})

    # Store the full pipeline as the run's model artifact.
    mlflow.sklearn.log_model(poly_model, "Polynomial_Regression_Simple_Model")

print("✅ Simple Polynomial Regression model with preprocessing logged to MLflow.")



✅ Simple Polynomial Regression model with preprocessing logged to MLflow.


In [None]:
# === 6. Support Vector Regressor (SVR) ===
# Shared preprocessor followed by an SVR left at its default parameters.
svr_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", SVR()),
])

with mlflow.start_run(run_name="SVR"):
    # Fit on the training split, then predict the held-out split.
    y_pred = svr_model.fit(X_train, y_train).predict(X_test)

    # Test-set error metrics.
    mse, mae, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )

    # Record the kernel settings used by the fitted regressor.
    svr = svr_model.named_steps["regressor"]
    mlflow.log_params({"C": svr.C, "gamma": svr.gamma, "kernel": svr.kernel})

    mlflow.log_metrics({"MSE": mse, "MAE": mae, "R2": r2})

    # Store the full pipeline as the run's model artifact.
    mlflow.sklearn.log_model(svr_model, "SVR_Model")

print("✅ SVR model with preprocessing logged to MLflow.")



✅ SVR model with preprocessing logged to MLflow.


In [None]:
# === 7. XGBoost Regressor ===
# Shared preprocessor followed by an XGBoost regressor; verbosity=0 silences its logging.
xgb_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", XGBRegressor(random_state=42, verbosity=0)),
])

with mlflow.start_run(run_name="XGBoost Regressor"):
    # Fit on the training split, then predict the held-out split.
    y_pred = xgb_model.fit(X_train, y_train).predict(X_test)

    # Test-set error metrics.
    mse, mae, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )

    # Record the boosting settings as exposed by the regressor's attributes.
    xgb = xgb_model.named_steps["regressor"]
    mlflow.log_params({
        "n_estimators": xgb.n_estimators,
        "learning_rate": xgb.learning_rate,
        "max_depth": xgb.max_depth,
    })

    mlflow.log_metrics({"MSE": mse, "MAE": mae, "R2": r2})

    # Store the full pipeline as the run's model artifact.
    mlflow.sklearn.log_model(xgb_model, "XGBoost_Regressor_Model")

print("✅ XGBoost Regressor model with preprocessing logged to MLflow.")



✅ XGBoost Regressor model with preprocessing logged to MLflow.


In [None]:
# === 8. Principal Component Regression (PCA + LinearRegression) ===
# Preprocess, project onto 5 principal components, then fit OLS on the components.
pcr_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("pca", PCA(n_components=5)),   # adjust n_components as needed
    ("regressor", LinearRegression()),
])

with mlflow.start_run(run_name="Principal Component Regression"):
    # Fit on the training split, then predict the held-out split.
    y_pred = pcr_model.fit(X_train, y_train).predict(X_test)

    # Test-set error metrics.
    mse, mae, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )

    # Record how many components the PCA step keeps.
    pca_step = pcr_model.named_steps['pca']
    mlflow.log_param("n_components", pca_step.n_components)

    mlflow.log_metrics({"MSE": mse, "MAE": mae, "R2": r2})

    # Store the full pipeline as the run's model artifact.
    mlflow.sklearn.log_model(pcr_model, "Principal_Component_Regression_Model")

print("✅ Principal Component Regression model with preprocessing logged to MLflow.")



✅ Principal Component Regression model with preprocessing logged to MLflow.


In [None]:
!pip install pyngrok --quiet

In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# Never store the ngrok auth token in the notebook: take it from the
# NGROK_AUTHTOKEN environment variable, or prompt for it (input is not echoed)
# when the variable is unset or empty.
# NOTE: the token that was previously hardcoded in this cell should be treated
# as exposed and revoked in the ngrok dashboard.
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok auth token: ")
ngrok.set_auth_token(ngrok_auth_token)



In [None]:
import mlflow
import os
from pyngrok import ngrok

# Stop any running process whose command line matches "mlflow".
os.system("pkill -f mlflow")

# Launch the MLflow UI in the background, listening on all interfaces.
mlflow_ui_port = 5000
get_ipython().system_raw(f"mlflow ui --host 0.0.0.0 --port {mlflow_ui_port} &")

# Open an ngrok tunnel to the UI port and show the public endpoint.
# NOTE(review): this makes the tracking UI reachable from the internet,
# presumably without authentication; confirm that is acceptable.
public_url = ngrok.connect(mlflow_ui_port)
print("MLflow Tracking UI:", public_url)

MLflow Tracking UI: NgrokTunnel: "https://929a3bff04a6.ngrok-free.app" -> "http://localhost:5000"


In [None]:
import joblib

# Serialise `xgb_model` (the preprocessing + XGBoost pipeline built for the
# MLflow run) to a file in the current working directory.
model_filename = "xgb_model.pkl"
joblib.dump(xgb_model, model_filename)
print(f"Simple XGBoost Regressor model saved as {model_filename}")

Simple XGBoost Regressor model saved as xgb_model.pkl
