# Homework 12

https://scikit-learn.org/0.15/modules/scaling_strategies.html#incremental-learning

* Implement a mini batch functionality to train a regressor.
    - (Optional) If anyone want to do this in a pipeline can do this: https://koaning.github.io/tokenwiser/api/pipeline.html

* Save model, load the model again and test it on `X_test` __Do NOT commit the pickle file__

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
def test_df():
    df = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/car_prices/car_prices.csv', low_memory=False)

    df = df.sample(5000, random_state=100).reset_index(drop=True)

    y = df['sellingprice']
    df.drop('sellingprice', axis=1, inplace=True)
    X = df

    return X,y

def partial_df():
    df = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/car_prices/car_prices.csv', low_memory=False)

    while(True):
        yield df.sample(100).reset_index(drop=True)

gen = partial_df()

In [3]:
X_test, y_test = test_df()

In [4]:
# each time you call this you will get a new slice of the dataframe.
next(gen)

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,1997,Nissan,Maxima,SE,Sedan,manual,jn1ca21d2vt842869,ca,2.8,190884.0,white,gray,trade in solutions irvine,850,1000,Wed Feb 04 2015 04:45:00 GMT-0800 (PST)
1,2012,Infiniti,G Sedan,G37x,G Sedan,automatic,jn1cv6ar3cm971668,mo,4.4,29477.0,black,black,nissan infiniti lt,20500,21800,Wed Feb 04 2015 02:30:00 GMT-0800 (PST)
2,2014,Ford,Fiesta,SE,Hatchback,automatic,3fadp4ej2em125503,ca,1,38683.0,gray,black,dollar thrifty damage liquidation,10100,7500,Wed Feb 25 2015 07:05:00 GMT-0800 (PST)
3,2014,Ford,F-250 Super Duty,Platinum,crew cab,automatic,1ft7w2btxeeb71767,mi,3.4,22583.0,black,brown,automobiles paille inc,49900,47000,Thu Jun 11 2015 02:30:00 GMT-0700 (PDT)
4,2014,Ford,Fusion,SE,Sedan,automatic,3fa6p0hd6er160054,mo,4.1,43258.0,—,black,ars/avis budget group,15050,14600,Tue Feb 03 2015 02:30:00 GMT-0800 (PST)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2012,Toyota,Camry,XLE,Sedan,,4t1bf1fk9cu164345,nj,4.2,38693.0,—,black,wheels exchange llc,12750,14500,Wed Jan 21 2015 01:30:00 GMT-0800 (PST)
96,2012,Ford,F-150,Lariat,SuperCrew,automatic,1ftfw1ct2ckd22569,tx,4.2,60336.0,white,tan,"ford motor credit company,llc",26100,25800,Wed May 27 2015 03:30:00 GMT-0700 (PDT)
97,2014,Dodge,Grand Caravan,SXT,Minivan,automatic,2c4rdgcg4er181825,fl,4.2,36437.0,white,black,avis corporation,16400,16500,Fri Mar 06 2015 02:00:00 GMT-0800 (PST)
98,2012,GMC,Acadia,SL,SUV,automatic,1gkkrned1cj366169,ga,4.3,33360.0,blue,gray,t. james motorsports llc,17600,18800,Tue Jan 20 2015 01:30:00 GMT-0800 (PST)


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDRegressor
import numpy as np
import pickle

### Loading the data

In [6]:
def load_data():
    df = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/car_prices/car_prices.csv', low_memory=False)
    df = df.sample(5000, random_state=100).reset_index(drop=True)
    X = df.drop('sellingprice', axis=1)
    y = df['sellingprice']
    return X, y

### Function to generate mini-batches

In [7]:
def generate_mini_batches(X, y, batch_size):
    num_samples = X.shape[0]
    indices = np.arange(num_samples)
    np.random.shuffle(indices)
    for start_idx in range(0, num_samples, batch_size):
        end_idx = min(start_idx + batch_size, num_samples)
        batch_indices = indices[start_idx:end_idx]
        yield X.iloc[batch_indices], y.iloc[batch_indices]

### Function to preprocess data and train the model

In [8]:
def train_model(X_train, y_train, batch_size=100, epochs=5):
   
    # preprocessing steps for numerical and categorical features
    categorical_cols = X_train.select_dtypes(include=['object']).columns
    numerical_cols = X_train.select_dtypes(exclude=['object']).columns

    numerical_transformer = StandardScaler()
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # linear regression model
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', SGDRegressor())
    ])

    # Training the model using mini-batch
    for epoch in range(epochs):
        for X_batch, y_batch in generate_mini_batches(X_train, y_train, batch_size):
            model.fit(X_batch, y_batch)
        print(f"Epoch {epoch+1}/{epochs} completed")
    return model


### Function to save the model

In [9]:
def save_model(model, filename):
    with open(filename, 'wb') as file:
        pickle.dump(model, file)
    print("Model saved successfully.")


### Function to load the saved model

In [10]:
def load_model(filename):
    with open(filename, 'rb') as file:
        model = pickle.load(file)
    print("Model loaded successfully.")
    return model

### Function to test the loaded model

In [11]:
def test_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    print("Predictions vs Actual:")
    print(pd.DataFrame({'Prediction': predictions[:5], 'Actual': y_test[:5]}))


In [12]:
#Loading Data
X, y = load_data()

In [13]:
# Train - Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Training the model

In [14]:
model = train_model(X_train, y_train)



Epoch 1/5 completed




Epoch 2/5 completed




Epoch 3/5 completed






Epoch 4/5 completed




Epoch 5/5 completed




In [15]:
save_model(model, 'trained_model_mini_batchesSGD.pkl')

Model saved successfully.


In [16]:
loaded_model = load_model('trained_model_mini_batchesSGD.pkl')


Model loaded successfully.


In [17]:
test_model(loaded_model, X_test, y_test)

Predictions vs Actual:
        Prediction  Actual
1501  11751.727884   11600
2586  15163.460856   17400
2653   7463.649794    7800
1055  13793.344529   14601
705    5626.174408    5400
