<a href="https://colab.research.google.com/github/Victor-Mutuku/End-to-end-Machine-learning-workflow-Pipeline-model-/blob/main/Victor_Mutuku_MLOPs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import pickle
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Load the California housing data as pandas objects:
# X holds the feature columns, y holds the target.
X, y = fetch_california_housing(as_frame=True, return_X_y=True)

# Preview the first five feature rows.
print(X.head(5))

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  
0    -122.23  
1    -122.22  
2    -122.24  
3    -122.25  
4    -122.25  


In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Preprocessing for numerical features: median imputation, then standard scaling.
# Every column of X is treated as a numerical feature.
numerical_features = X.columns
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)


In [None]:
# Wrap the numerical pipeline in a ColumnTransformer so it is applied
# to the selected feature columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
    ]
)

In [None]:
# Build the end-to-end pipeline: the preprocessing step feeds a KNN regressor.
# Nothing is fitted here; this only defines the pipeline structure.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('knn', KNeighborsRegressor()),
    ]
)

In [None]:
# Hyperparameter grid for the 'knn' pipeline step
# (the "knn__" prefix routes each parameter to that step).
param_grid = dict(
    knn__n_neighbors=[3, 5, 7],
    knn__weights=['uniform', 'distance'],
    knn__p=[1, 2],
)

In [None]:
# Set up an exhaustive grid search over param_grid, scored by R2 with
# 5-fold cross-validation, using all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Training step of the workflow: run the grid search on the training split only,
# so the test split stays untouched until evaluation.
grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [None]:
#Evaluate on test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

#The metrics
r2_score = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse=mean_squared_error = mse**0.5

In [None]:
#Print results
print("Best Hyperparameters:", grid_search.best_params_)
print("Best CV R2 Score:", grid_search.best_score_)
print("R2 Score:", r2_score)
print("MSE:", mse)
print("RMSE:", rmse)

Best Hyperparameters: {'knn__n_neighbors': 7, 'knn__p': 1, 'knn__weights': 'distance'}
Best CV R2 Score: 0.7295904862889294
R2 Score: 0.7180472444331865
MSE: 0.36947342233451436
RMSE: 0.6078432547413143


In [None]:
# Persist the fitted best pipeline to disk with pickle.
# Only unpickle this file from a trusted source: loading a pickle can
# execute arbitrary code.
with open('california_knn_pipeline.pkl', mode='wb') as file:
    pickle.dump(best_model, file)