In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle
from sklearn.datasets import fetch_california_housing


In [2]:
# Load Dataset
# Fetch the California housing data and build one DataFrame holding the
# feature columns plus the target under the name "price".
data = fetch_california_housing()
df = (
    pd.DataFrame(data.data, columns=data.feature_names)
    .assign(price=data.target)
)


In [4]:
# Preview the first five rows of the assembled frame
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [7]:

# Exploratory Data Analysis
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()
# Summary statistics for every numeric column
print(df.describe())
# Missing-value count per column
print(df.isnull().sum())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   price       20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB
None
             MedInc      HouseAge      AveRooms     AveBedrms    Population  \
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000   
mean       3.870671     28.639486      5.429000      1.096675   1425.476744   
std        1.899822     12.585558      2.474173      0.473911   1132.462122   
min        0.499900      1.000000      0.846154      0.333333      3.000000   
2

In [9]:
# Feature Selection
# Use every feature column shipped with the dataset.
numerical_features = data.feature_names

# Transformer instances. Only `scaler` is wired into the ColumnTransformer
# below; `ohe` is created here but not referenced by it.
scaler = StandardScaler()
ohe = OneHotEncoder(handle_unknown="ignore")

# Apply the scaler to the listed feature columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", scaler, numerical_features),
    ]
)



In [11]:
# Train-Test Split
# Hold out 20% of the rows for the final evaluation; the split is seeded.
X = df.drop(columns=["price"])
y = df["price"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model Training
# The forest is seeded as well as the split: without random_state the
# bootstrap samples and per-split feature draws change on every run, so the
# selected hyper-parameters and the reported metrics would not be reproducible.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(random_state=42)),
])

# Small grid over forest size and tree depth, scored by R^2 with 5-fold CV.
params = {"model__n_estimators": [50, 100], "model__max_depth": [None, 10]}
gs = GridSearchCV(pipeline, param_grid=params, cv=5, scoring="r2")
gs.fit(X_train, y_train)



In [12]:
# Evaluation
# Score the best pipeline found by the grid search on the held-out test set.
best_model = gs.best_estimator_
y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("MAE:", mae)
print("RMSE:", rmse)
print("R2 Score:", r2)



MAE: 0.32677581722383736
RMSE: 0.5033209860351934
R2 Score: 0.8066771372983772


In [13]:
# Save Model
# Pickle the best pipeline from the grid search to model.pkl.
best_model = gs.best_estimator_
with open("model.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

In [38]:
!pip install flask





[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [23]:
from flask import Flask, request, jsonify
import pickle
import numpy as np


In [59]:
%%writefile app.py
from flask import Flask, request, jsonify
import pickle
import numpy as np
import pandas as pd

# Load the pickled model from model.pkl once, at import time
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# Feature names, in the order used for the request DataFrame
# (must match training features)
expected_columns = [
    "MedInc",
    "HouseAge",
    "AveRooms",
    "AveBedrms",
    "Population",
    "AveOccup",
    "Latitude",
    "Longitude",
]

# Create the Flask application object
app = Flask(__name__)

@app.route("/predict", methods=["POST"])
def predict():
    """Predict a price from a JSON body of the form ``{"features": ...}``.

    ``features`` is either a list of values in ``expected_columns`` order or
    an object keyed by those column names.

    Returns:
        200 with ``{"predicted_price": <float>}`` on success.
        400 with ``{"error": <message>}`` when the request body is invalid.
        500 with ``{"error": <message>}`` when the model call itself fails.
    """
    # silent=True yields None for a missing or malformed JSON body instead of
    # raising, so the problem can be reported as a client error below.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or "features" not in payload:
        return jsonify({"error": 'JSON body with a "features" field is required'}), 400

    values = payload["features"]
    if isinstance(values, dict):
        missing = [name for name in expected_columns if name not in values]
        if missing:
            return jsonify({"error": f"missing features: {missing}"}), 400
    elif isinstance(values, list):
        if len(values) != len(expected_columns):
            return jsonify({
                "error": f"expected {len(expected_columns)} features, got {len(values)}"
            }), 400
    else:
        return jsonify({"error": '"features" must be a list or an object'}), 400

    # Build a one-row frame with the training column names and force a numeric
    # dtype so non-numeric input is rejected here rather than inside the model.
    try:
        features = pd.DataFrame([values], columns=expected_columns).astype(float)
    except (TypeError, ValueError) as exc:
        return jsonify({"error": f"features must be numeric: {exc}"}), 400
    if not np.isfinite(features.to_numpy()).all():
        return jsonify({"error": "features must be finite numbers"}), 400

    try:
        prediction = model.predict(features)[0]
    except Exception as exc:
        # Input has already been validated, so a failure here is server-side.
        app.logger.exception("model.predict failed")
        return jsonify({"error": str(exc)}), 500

    # Cast to a built-in float so the value serializes whatever numpy dtype
    # the model returns.
    return jsonify({"predicted_price": float(prediction)})

if __name__ == "__main__":
    import os

    # Debug mode enables the interactive Werkzeug debugger, which lets anyone
    # who can reach the server execute code, so it is opt-in via FLASK_DEBUG
    # instead of being hard-coded on.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(debug=debug_enabled)



Overwriting app.py


In [40]:
# Environment check: print the path of the Python interpreter running this kernel
import sys
print(sys.executable)


C:\Users\user\anaconda3\python.exe


In [42]:
# Environment check: print the installation prefix of the running interpreter
import sys
print(sys.prefix)


C:\Users\user\anaconda3


In [44]:
# Environment check: look up the import spec for "flask" from this kernel;
# find_spec returns None when the module cannot be found.
import importlib.util
print(importlib.util.find_spec("flask"))


ModuleSpec(name='flask', loader=<_frozen_importlib_external.SourceFileLoader object at 0x00000158FD9A2050>, origin='C:\\Users\\user\\anaconda3\\Lib\\site-packages\\flask\\__init__.py', submodule_search_locations=['C:\\Users\\user\\anaconda3\\Lib\\site-packages\\flask'])


In [54]:
import requests

# Smoke-test the running Flask app with the first row of the dataset.
url = "http://127.0.0.1:5000/predict"
# Named `payload`, not `data`: `data` already holds the dataset object that
# the feature-selection cell reads (`data.feature_names`), and overwriting it
# with a dict breaks that cell when it is re-run.
payload = {
    "features": [8.3252, 41, 6.984127, 1.02381, 322, 2.555556, 37.88, -122.23]
}

# requests waits indefinitely by default; cap the wait so the cell cannot
# hang when the server is not running.
response = requests.post(url, json=payload, timeout=10)
print(response.json())


{'predicted_price': 4.096811900000001}
