#### Installing weight and biases library

In [None]:
!pip install wandb



## Loading the dataset: Used Car Price Prediction

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import wandb
import os

In [None]:
student_df = pd.read_csv( "https://drive.google.com/uc?export=download&id=1KC2xE-sOy6JFpNlQlEFjudkVBfBLyNUX" )

In [None]:
student_df.head(5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [103]:
# Update column names
student_df.rename(columns={
    "race/ethnicity": "race_ethnicity",
    "parental level of education": "parental_level_of_education",
    "test preparation course": "test_preparation_course",
    "math score": "math_score",
    "reading score": "reading_score",
    "writing score": "writing_score"
}, inplace=True)


In [106]:
student_df.dtypes

Unnamed: 0,0
gender,object
race_ethnicity,object
parental_level_of_education,object
lunch,object
test_preparation_course,object
reading_score,int64
writing_score,int64
math_score,int64


In [107]:
x_columns = ['gender', 'race_ethnicity', 'parental_level_of_education',
             'lunch', 'test_preparation_course', 'reading_score',
             'writing_score']


In [108]:
student_df.shape

(1000, 8)

In [109]:
student_df = student_df[x_columns + ['math_score']].dropna()

In [111]:
student_df.shape
student_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race_ethnicity               1000 non-null   object
 2   parental_level_of_education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test_preparation_course      1000 non-null   object
 5   reading_score                1000 non-null   int64 
 6   writing_score                1000 non-null   int64 
 7   math_score                   1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


## Identifying numerical and categorical features

In [112]:
cat_features = ['gender', 'race_ethnicity', 'parental_level_of_education',
             'lunch', 'test_preparation_course']

In [113]:
num_features = list(set(x_columns) - set(cat_features))

## Utility method for preparing the data

- Splitting the dataset
- Encoding Catgorical Variables

In [114]:
X = student_df[x_columns]
y = student_df.math_score

In [115]:
from sklearn.model_selection import train_test_split

# Perform a 60-20-20 split
# First, split into train (60%) and temp (40%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

# Next, split the temp set into test (20%) and production (20%) sets
X_test, X_prod, y_test, y_prod = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Output the sizes of the splits
print(f"Train set: {X_train.shape}, {y_train.shape}")
print(f"Test set: {X_test.shape}, {y_test.shape}")
print(f"Production set: {X_prod.shape}, {y_prod.shape}")


Train set: (600, 7), (600,)
Test set: (200, 7), (200,)
Production set: (200, 7), (200,)


### Creating ML Pipeline

In [116]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [117]:
ohe_encoder = OneHotEncoder(handle_unknown='ignore')
scaler = StandardScaler()

## Creating the imputer for columns that have missing values
imputed_num_vars = ['writing_score']
non_imputed_num_vars = list(set(num_features) - set(imputed_num_vars))
mean_imputer = SimpleImputer(strategy='mean')


## Pipeline for the applying imputation and then scaling
imputed_num_transformer = Pipeline( steps = [
        ('imputation', mean_imputer),
        ('scaler', scaler)])

non_imputed_num_transformer = Pipeline( steps = [('scaler', scaler)])


## Pipeline for OHE encoding the categorical columns
cat_transformer = Pipeline( steps = [('ohencoder', ohe_encoder)])

## The complete pipeline for applying the required transformatinons to the respective columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num_imputed', imputed_num_transformer, imputed_num_vars),
        ('num_not_imputed', non_imputed_num_transformer, non_imputed_num_vars),
        ('catvars', cat_transformer, cat_features)])


## Initilializing Weights and Biases

In [118]:
os.environ["WANDB_API_KEY"] = "e3c34be454efd9eb6b608e015cebcc602e8dbc09"

## Baseline Model: Linear Regression

In [119]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from joblib import dump
import wandb
import numpy as np


# 60-20-20 split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_test, X_prod, y_test, y_prod = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Verify shapes
print(f"Train set: {X_train.shape}, {y_train.shape}")
print(f"Test set: {X_test.shape}, {y_test.shape}")
print(f"Production set: {X_prod.shape}, {y_prod.shape}")


ohe_encoder = OneHotEncoder(handle_unknown='ignore')
scaler = StandardScaler()

# Numerical and categorical transformers
num_transformer = Pipeline(steps=[('scaler', scaler)])
cat_transformer = Pipeline(steps=[('ohe', ohe_encoder)])

# Column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ]
)

# Define the pipeline with preprocessing and linear regression
linear_reg = LinearRegression()

linear_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('linear_model', linear_reg)
])

# Fit the model on the training data
linear_model.fit(X_train, y_train)

# Initialize WandB for tracking
wandb.init(project='mlops_student_performance_assignment_CT1',
           config=None,
           tags=['Linear Model1', 'baseline', 'OHE Encoding'])
wandb.run.name = "LinearModelv1"

# Evaluate the model on the test set
rmse = np.sqrt(mean_squared_error(y_test, linear_model.predict(X_test)))
r2 = linear_model.score(X_test, y_test)

# Log metrics to WandB
wandb.log({
    "rmse": rmse,
    "r2": r2
})

# Save the model as an artifact
artifact = wandb.Artifact(
    name="LinearModelv1",
    type='model',
    description="Linear Regression model for student performance prediction"
)

# Save model to a file and log it as an artifact
model_path = "linear_model_v1.pkl"
dump(linear_model, model_path)
artifact.add_file(model_path)

wandb.log_artifact(artifact)
wandb.finish()

# Print metrics
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R^2 Score: {r2}")


Train set: (600, 7), (600,)
Test set: (200, 7), (200,)
Production set: (200, 7), (200,)




0,1
r2,▁
rmse,▁

0,1
r2,0.85694
rmse,5.59064


Root Mean Squared Error (RMSE): 5.590638552733943
R^2 Score: 0.8569423314385066


In [120]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np
import wandb

# Define parameters for Decision Tree
params = {"max_depth": 10}

# Initialize the Decision Tree Regressor
dtree = DecisionTreeRegressor(**params)

# Create a pipeline with preprocessing and decision tree model
dtree_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('dt_model', dtree)
])

# Fit the model on the training data
dtree_model.fit(X_train, y_train)

# Initialize WandB for logging
wandb.init(project='mlops_student_performance_assignment_CT1', config=params,
           tags=['Decision Tree', 'OHE Encoding'])
wandb.run.name = "DecisionTree"

# Evaluate the model on the test set
rmse = np.sqrt(mean_squared_error(y_test, dtree_model.predict(X_test)))
r2 = dtree_model.score(X_test, y_test)

# Log metrics to WandB
wandb.log({
    "rmse": rmse,
    "r2": r2
})

# Save the model as an artifact
artifact = wandb.Artifact(
    name="DecisionTree",
    type='model',
    description=f"Decision Tree model with parameters: {params}"
)

# Save the model to a file and log as an artifact
model_path = "decision_tree_model.pkl"
dump(dtree_model, model_path)
artifact.add_file(model_path)

wandb.log_artifact(artifact)
wandb.finish()

# Print metrics
print(f"Decision Tree RMSE: {rmse}")
print(f"Decision Tree R^2: {r2}")




0,1
r2,▁
rmse,▁

0,1
r2,0.74172
rmse,7.5119


Decision Tree RMSE: 7.511901319258758
Decision Tree R^2: 0.741721615569931


## Manual Grid Search

In [121]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
import pandas as pd

# Define parameter grid for the Decision Tree
params = {"dt_model__max_depth": range(5, 10)}

# Initialize a Decision Tree Regressor
dtree = DecisionTreeRegressor()

# Create a pipeline with preprocessing and the Decision Tree model
dtree_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('dt_model', dtree)
])

# Set up GridSearchCV with 10-fold cross-validation and scoring based on R²
dt_grid = GridSearchCV(
    estimator=dtree_model,
    param_grid=params,
    cv=10,
    scoring='r2',
    verbose=1  # To show progress during the search
)


In [122]:

# Fit the GridSearchCV on the training data
dt_grid.fit(X_train, y_train)



Fitting 10 folds for each of 5 candidates, totalling 50 fits


In [123]:
# Fetch and print the best parameters and corresponding score
print(f"Best Parameters: {dt_grid.best_params_}")
print(f"Best R² Score (CV): {dt_grid.best_score_}")



Best Parameters: {'dt_model__max_depth': 5}
Best R² Score (CV): 0.7821367495727068


In [124]:
# Convert grid search results to a DataFrame for analysis
grid_results = pd.DataFrame(dt_grid.cv_results_)



In [125]:
# Display results
grid_results = grid_results.sort_values(by='mean_test_score', ascending=False)
print(grid_results[['params', 'mean_test_score', 'std_test_score']])


                       params  mean_test_score  std_test_score
0  {'dt_model__max_depth': 5}         0.782137        0.047172
1  {'dt_model__max_depth': 6}         0.778237        0.043222
2  {'dt_model__max_depth': 7}         0.752119        0.052676
3  {'dt_model__max_depth': 8}         0.724390        0.045646
4  {'dt_model__max_depth': 9}         0.691879        0.047241


In [126]:
dt_grid.best_score_

0.7821367495727068

In [127]:
grid_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_dt_model__max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.014079,0.004935,0.006421,0.001641,5,{'dt_model__max_depth': 5},0.840403,0.820684,0.816439,0.750073,0.759806,0.681223,0.814715,0.773734,0.741205,0.823086,0.782137,0.047172,1
1,0.011657,0.001672,0.005311,0.000658,6,{'dt_model__max_depth': 6},0.829183,0.779415,0.81352,0.726075,0.761228,0.687421,0.793382,0.795777,0.764656,0.831716,0.778237,0.043222,2
2,0.011987,0.000962,0.005562,0.00054,7,{'dt_model__max_depth': 7},0.810404,0.715648,0.768744,0.682499,0.730473,0.657631,0.810952,0.742211,0.79421,0.808419,0.752119,0.052676,3
3,0.011494,0.000484,0.005307,0.000338,8,{'dt_model__max_depth': 8},0.770144,0.709949,0.737818,0.635773,0.730527,0.656372,0.75108,0.708432,0.758598,0.785207,0.72439,0.045646,4
4,0.011493,0.000569,0.005226,0.000292,9,{'dt_model__max_depth': 9},0.736841,0.643419,0.730525,0.613457,0.655534,0.648084,0.753912,0.67674,0.724667,0.735608,0.691879,0.047241,5


### Using Sweep Features

In [128]:
import wandb
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np

# Define the training function for WandB sweep
def train_decision_tree(config=None):
    # Initialize WandB with the configuration
    with wandb.init(config=config):
        config = wandb.config

        # Initialize the Decision Tree with hyperparameters from the config
        dtree = DecisionTreeRegressor(max_depth=config.max_depth)

        # Create the pipeline with preprocessing and Decision Tree
        dtree_model = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('dt_model', dtree)
        ])

        # Fit the model on the training data
        dtree_model.fit(X_train, y_train)

        # Evaluate the model on the test set
        rmse = np.sqrt(mean_squared_error(y_test, dtree_model.predict(X_test)))
        r2 = dtree_model.score(X_test, y_test)

        # Log metrics and hyperparameters to WandB
        wandb.log({
            "rmse": rmse,
            "r2": r2,
            "max_depth": config.max_depth
        })


In [129]:

# Define the Sweep configuration
sweep_config = {
    "method": "grid",  # Can also be 'random' or 'bayes'
    "metric": {"name": "r2", "goal": "maximize"},
    "parameters": {
        "max_depth": {
            "values": [4, 6, 8, 10, 12]  # Depths to evaluate
        }
    },
}



In [130]:
# Initialize the sweep in WandB
sweep_id = wandb.sweep(sweep_config, project="mlops_student_performance_assignment_CT1")



Create sweep with ID: e190xk20
Sweep URL: https://wandb.ai/priyesh-jagtap91-tatatele/mlops_student_performance_assignment_CT1/sweeps/e190xk20


In [131]:

# Run the sweep agent
wandb.agent(sweep_id, function=train_decision_tree)


[34m[1mwandb[0m: Agent Starting Run: 0m66vz4g with config:
[34m[1mwandb[0m: 	max_depth: 4


0,1
max_depth,▁
r2,▁
rmse,▁

0,1
max_depth,4.0
r2,0.71967
rmse,7.82596


[34m[1mwandb[0m: Agent Starting Run: mqqfvd8z with config:
[34m[1mwandb[0m: 	max_depth: 6


0,1
max_depth,▁
r2,▁
rmse,▁

0,1
max_depth,6.0
r2,0.79432
rmse,6.70357


[34m[1mwandb[0m: Agent Starting Run: 8q6nh1au with config:
[34m[1mwandb[0m: 	max_depth: 8


0,1
max_depth,▁
r2,▁
rmse,▁

0,1
max_depth,8.0
r2,0.74123
rmse,7.51907


[34m[1mwandb[0m: Agent Starting Run: u12drrn5 with config:
[34m[1mwandb[0m: 	max_depth: 10


0,1
max_depth,▁
r2,▁
rmse,▁

0,1
max_depth,10.0
r2,0.71684
rmse,7.8654


[34m[1mwandb[0m: Agent Starting Run: ktjdxp42 with config:
[34m[1mwandb[0m: 	max_depth: 12


0,1
max_depth,▁
r2,▁
rmse,▁

0,1
max_depth,12.0
r2,0.73884
rmse,7.55372


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


## Get Experiment Details

### Storing the model into a file

In [132]:
from joblib import dump
import os
import wandb

# Define the directory to save the model
MODEL_DIR = "./student_model"

# Create the directory if it doesn't exist
if not os.path.exists(MODEL_DIR):
    os.mkdir(MODEL_DIR)

# Save the trained model to the specified directory
dump(linear_model, os.path.join(MODEL_DIR, 'students_linear_model.pkl'))



['./student_model/students_linear_model.pkl']

### Logging the model artifact in the tracking tools (weights and Biases)

In [133]:
# Log the model to WandB
wandb.init(
    project="mlops_student_performance_assignment_CT1",
    config=None,
    tags=['Final Model']
)
wandb.run.name = "FinalModel"




In [134]:
# Create a WandB artifact to log the model
model_artifact = wandb.Artifact(
    "Linear_Model_StudentsPerformance",
    type='model',
    description='Linear Model for predicting student math performance'
)

In [135]:
# Add the model directory to the artifact
model_artifact.add_dir(MODEL_DIR)


[34m[1mwandb[0m: Adding directory to artifact (./student_model)... Done. 0.0s


In [136]:
# Log the artifact to WandB
wandb.run.log_artifact(model_artifact)


<Artifact Linear_Model_StudentsPerformance>

In [137]:
!pip install ydata-profiling

from ydata_profiling import ProfileReport

# Regenerate the profiling report
profile = ProfileReport(student_df, title="Students Performance Profiling Report")
profile_file = "StudentsPerformance_Profile.html"
profile.to_file(profile_file)

print(f"Profiling report saved as {profile_file}")




Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling report saved as StudentsPerformance_Profile.html


In [138]:
from google.colab import files

# Download the profiling report
files.download("StudentsPerformance_Profile.html")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [139]:
from google.colab import files

# dataset variable names
student_df.to_parquet("students_data.parquet", index=False)
X_train.to_parquet("train_data.parquet", index=False)
X_test.to_parquet("test_data.parquet", index=False)
X_prod.to_parquet("prod_data.parquet", index=False)

# Save target variables (y values) to separate Parquet files
y_train.to_frame(name="math_score").to_parquet("y_train_data.parquet", index=False)
y_test.to_frame(name="math_score").to_parquet("y_test_data.parquet", index=False)
y_prod.to_frame(name="math_score").to_parquet("y_prod_data.parquet", index=False)


# Download files to local system
print("Downloading files to local system...")
files.download("students_data.parquet")
files.download("train_data.parquet")
files.download("test_data.parquet")
files.download("prod_data.parquet")
files.download("y_train_data.parquet")
files.download("y_test_data.parquet")
files.download("y_prod_data.parquet")


Downloading files to local system...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>