## Homework 12

https://scikit-learn.org/0.15/modules/scaling_strategies.html#incremental-learning

Implement a mini batch functionality to train a regressor.

(Optional) If anyone want to do this in a pipeline can do this: https://koaning.github.io/tokenwiser/api/pipeline.html

Save model, load the model again and test it on X_test Do NOT commit the pickle file

In [93]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [None]:
# Function to load and preprocess data
def test_df():
    df = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/car_prices/car_prices.csv', low_memory=False)

    df = df.sample(5000, random_state=100).reset_index(drop=True)
    
    y = df['sellingprice']
    X = df.drop('sellingprice', axis=1)
    
    return X, y

# Generator to yield mini-batches of the data
def partial_df():
    df = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/car_prices/car_prices.csv', low_memory=False)

    while True:
        yield df.sample(100).reset_index(drop=True)

# Initialize the generator
gen = partial_df()

In [80]:
X_test, y_test = test_df()

# Split the data into numerical and categorical features
categorical_features = X_test.select_dtypes(include=['object']).columns.tolist()
numerical_features = X_test.select_dtypes(exclude=['object']).columns.tolist()

# Create a preprocessor for the pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values by taking the mean
            ('pass', 'passthrough')  # Pass numerical features as is after imputation
        ]), numerical_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with the most frequent value
            ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-Hot Encoding to categorical features
        ]), categorical_features)
    ])

# Initialize the regressor pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Number of mini-batches
n_batches = 50  # For example, train on 50 mini-batches

for _ in range(n_batches):
    # Get a mini-batch
    mini_batch = next(gen)
    
    # Separate features and target
    y_batch = mini_batch['sellingprice']
    X_batch = mini_batch.drop('sellingprice', axis=1)
    
    # Fit the model on the mini-batch
    model.fit(X_batch, y_batch)

# After training on mini-batches, evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error on Test Set: {mse}')

Mean Squared Error on Test Set: 47302513.22288998


In [84]:
import joblib 

# Save the model to a file
joblib.dump(model, 'car_price_model.joblib')
print("Model saved as 'car_price_model.joblib'.")

# Load the model from the file
loaded_model = joblib.load('car_price_model.joblib')
print("Model loaded from 'car_price_model.joblib'.")

# Test the loaded model on X_test
y_pred = loaded_model.predict(X_test)

Model saved as 'car_price_model.joblib'.
Model loaded from 'car_price_model.joblib'.


In [86]:
# Load the test dataset
def load_test_data():
    df = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/car_prices/car_prices.csv', low_memory=False)
    df = df.sample(5000, random_state=100).reset_index(drop=True)
    y = df['sellingprice']
    X = df.drop('sellingprice', axis=1)
    return X, y

# Load the model
loaded_model = joblib.load('car_price_model.joblib')
print("Model loaded from 'car_price_model.joblib'.")

# Load the test data
X_test, y_test = load_test_data()

# Test the loaded model on X_test
y_pred = loaded_model.predict(X_test)

Model loaded from 'car_price_model.joblib'.


In [88]:
# Print an example of a prediction
example_index = 0  # Change this to see predictions for different examples
predicted_value = y_pred[example_index]
actual_value = y_test.iloc[example_index]

print(f"Example prediction: Predicted selling price = ${predicted_value:.2f}, Actual selling price = ${actual_value:.2f}")

Example prediction: Predicted selling price = $10753.01, Actual selling price = $8000.00
