# Homework 12

https://scikit-learn.org/0.15/modules/scaling_strategies.html#incremental-learning

* Implement a mini batch functionality to train a regressor.
    - (Optional) If anyone want to do this in a pipeline can do this: https://koaning.github.io/tokenwiser/api/pipeline.html

* Save model, load the model again and test it on `X_test`

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
def test_df():
    df = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/car_prices/car_prices.csv', low_memory=False)

    df = df.sample(5000, random_state=100).reset_index(drop=True)
    
    y = df['sellingprice']
    df.drop('sellingprice', axis=1, inplace=True)
    X = df
    
    return X,y

def partial_df():
    df = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/car_prices/car_prices.csv', low_memory=False)
   
    while(True):
        yield df.sample(100).reset_index(drop=True)
        
gen = partial_df()

In [3]:
X_test, y_test = test_df()

In [4]:
# each time you call this you will get a new slice of the dataframe.
next(gen)

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2004,Toyota,Corolla,LE,Sedan,automatic,2t1br32e94c274690,nj,1.9,175029.0,beige,gray,united autoland inc,3175,2900,Wed Mar 04 2015 01:30:00 GMT-0800 (PST)
1,2005,Nissan,Murano,S,SUV,automatic,jn8az08t55w326071,fl,3.2,106466.0,beige,tan,q auto,5375,4800,Wed Feb 18 2015 08:40:00 GMT-0800 (PST)
2,2014,Chevrolet,Captiva Sport,LS Fleet,SUV,automatic,3gnal2ek7es652137,va,,12753.0,burgundy,gray,avis rac system/pv holding corp,14900,10000,Thu Dec 18 2014 11:50:00 GMT-0800 (PST)
3,2002,Mercury,Grand Marquis,LSE,Sedan,automatic,2mefm75w52x635254,fl,3.6,62508.0,blue,blue,east coast financial,3750,3400,Fri Feb 27 2015 01:25:00 GMT-0800 (PST)
4,2006,Chevrolet,TrailBlazer,LS,SUV,automatic,1gndt13s862187424,oh,3.4,87901.0,blue,gray,dt inventory,5050,5200,Thu May 21 2015 02:00:00 GMT-0700 (PDT)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2012,Ford,Fusion,SEL,Sedan,automatic,3fahp0ja2cr334353,nj,4.1,50773.0,—,beige,wheels exchange llc,11550,10400,Wed May 27 2015 02:30:00 GMT-0700 (PDT)
96,2012,Ford,Fusion,SPORT,Sedan,,3fahp0dc3cr221721,pa,3.8,47520.0,black,black,"ford motor credit company,llc",16350,15700,Fri May 22 2015 02:00:00 GMT-0700 (PDT)
97,2005,Dodge,Durango,Limited,SUV,automatic,1d8hb58d45f602326,mi,2.9,145185.0,blue,gray,lafontaine chevrolet inc,4300,4600,Thu Mar 05 2015 01:20:00 GMT-0800 (PST)
98,2013,Chevrolet,Malibu,Eco,Sedan,automatic,1g11f5rr4df105251,fl,1.9,27051.0,gray,black,fiserv/global lending services,13850,10000,Tue Jan 27 2015 01:30:00 GMT-0800 (PST)


In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline

In [6]:
# Define the missing value imputation strategies
# Preprocessing
numeric_features = ['year', 'odometer']
categorical_features = ['make', 'model', 'trim', 'body', 'transmission', 'vin', 'state', 'condition', 'color', 'interior', 'seller', 'saledate']
numeric_imputer = SimpleImputer(strategy='median')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Define the transformers
numeric_transformer = Pipeline(steps=[
    ('imputer', numeric_imputer),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', categorical_imputer),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Define the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Fit the preprocessor on the training data
preprocessor.fit(X_test)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['year', 'odometer']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['make', 'model', 'trim', 'body',
                                  'transmission', 'vin', 'state', 'condition',
                                  'color', 'interior', 'seller', 'saledate'])])

In [7]:
# Create the SGDRegressor
regressor = SGDRegressor(loss='squared_error', random_state=100)

In [8]:
# Mini-batch training loop
batch_size = 50
num_batches = 100

for _ in range(num_batches):
    batch_df = next(gen)
    X_batch = batch_df.drop('sellingprice', axis=1)
    y_batch = batch_df['sellingprice']
    
    # Apply preprocessor to the mini-batch
    X_batch_processed = preprocessor.transform(X_batch)
    
    # Handle missing values in the mini-batch
    X_batch_processed = np.nan_to_num(X_batch_processed)
    
    regressor.partial_fit(X_batch_processed, y_batch)

In [9]:
# Save the model
import joblib
joblib.dump(regressor, 'regressor_model.joblib')

# Load the model
loaded_regressor = joblib.load('regressor_model.joblib')

# Preprocess the test data
X_test_processed = preprocessor.transform(X_test)

# Test the model on X_test
y_pred = loaded_regressor.predict(X_test_processed)

# Print the predicted values
print(y_pred)

[11694.21052052 19245.51189901 20178.58898878 ... 18770.47259858
 18296.93736075 17866.2052536 ]
