# Homework 12

https://scikit-learn.org/0.15/modules/scaling_strategies.html#incremental-learning

* Implement a mini batch functionality to train a regressor.
    - (Optional) If anyone want to do this in a pipeline can do this: https://koaning.github.io/tokenwiser/api/pipeline.html

* Save model, load the model again and test it on `X_test` __Do NOT commit the pickle file__

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
def test_df():
    df = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/car_prices/car_prices.csv', low_memory=False)

    df = df.sample(5000, random_state=100).reset_index(drop=True)
    
    y = df['sellingprice']
    df.drop('sellingprice', axis=1, inplace=True)
    X = df
    
    return X,y

def partial_df():
    df = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/car_prices/car_prices.csv', low_memory=False)
   
    while(True):
        yield df.sample(100).reset_index(drop=True)
        
gen = partial_df()

In [3]:
X_test, y_test = test_df()

In [4]:
# each time you call this you will get a new slice of the dataframe.
sample_df = next(gen)
sample_df

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2012,Nissan,Altima,2.5 S,Sedan,automatic,1n4al2ap1cn400858,mo,3.3,39835.0,black,black,nissan-infiniti lt,11950,12800,Wed Dec 31 2014 10:30:00 GMT-0800 (PST)
1,2005,Nissan,Maxima,3.5 SL,Sedan,automatic,1n4ba41e55c837149,va,2,187631.0,silver,gray,credit acceptance corp/vrs/southfield,2225,2400,Thu Feb 12 2015 01:05:00 GMT-0800 (PST)
2,2011,Nissan,Altima,2.5 S,Coupe,automatic,1n4al2ep0bc128243,fl,3,70920.0,blue,black,santander consumer,9550,9700,Wed Jan 28 2015 01:05:00 GMT-0800 (PST)
3,2002,GMC,Envoy,SLT,SUV,automatic,1gkdt13s022414743,in,3.6,162858.0,blue,gray,select remarketing group llc/loan max title,2250,2600,Wed Jan 28 2015 05:15:00 GMT-0800 (PST)
4,2011,Toyota,Venza,Base,Wagon,,4t3bk3bbxbu054794,on,4,22375.0,black,gray,repo depo,22600,22200,Tue Mar 03 2015 02:00:00 GMT-0800 (PST)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2012,Lexus,ES 350,Base,Sedan,automatic,jthbk1eg3c2474983,ca,3.7,23006.0,silver,black,lexus financial services,22600,25250,Wed Feb 04 2015 04:30:00 GMT-0800 (PST)
96,2014,Ford,Taurus,Limited,Sedan,,1fahp2f85eg163612,pa,3.5,33394.0,silver,black,avis budget group,19900,16800,Fri Apr 17 2015 02:00:00 GMT-0700 (PDT)
97,2012,BMW,3 Series,335i,Coupe,automatic,wbakg7c56ce802657,ca,4.1,21462.0,blue,gray,financial services remarketing (lease),29300,28500,Thu May 21 2015 05:30:00 GMT-0700 (PDT)
98,2007,Chevrolet,Express Cargo,3500,Van,,1gchg35ux71223694,nc,2.4,273447.0,white,gray,fleet lease remarketing,1450,3800,Mon Mar 02 2015 01:30:00 GMT-0800 (PST)


In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline

# Defing Categorical and Numerical Columns

In [6]:
categorical_cols = ['make', 'model','condition', 'trim', 'body', 'transmission','vin','mmr','state','color', 'interior', 'seller','saledate']
numerical_cols = ['odometer','year'] 

# Defining Pipelines

In [7]:
numerical_pipeline = Pipeline([('impute_missing', SimpleImputer(strategy='median')),
                           ('standardize_num', StandardScaler())
                        ])
categorical_pipeline = Pipeline([('impute_missing_cats', SimpleImputer(strategy='most_frequent')),
                          ('create_dummies_cats', OneHotEncoder(handle_unknown='ignore',))])

processing_pipeline = ColumnTransformer(transformers=[('proc_numeric', numerical_pipeline, numerical_cols),
                                                      ('create_dummies', categorical_pipeline, categorical_cols)])

processing_pipeline.fit(X_test)

ColumnTransformer(transformers=[('proc_numeric',
                                 Pipeline(steps=[('impute_missing',
                                                  SimpleImputer(strategy='median')),
                                                 ('standardize_num',
                                                  StandardScaler())]),
                                 ['odometer', 'year']),
                                ('create_dummies',
                                 Pipeline(steps=[('impute_missing_cats',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('create_dummies_cats',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['make', 'model', 'condition', 'trim', 'body',
                                  'transmission', 'vin', 'mmr', 'state',
                                  'color', 'interior', 'seller', 's

# Defining Regressor

In [8]:
SGD = SGDRegressor(loss='squared_error', random_state=100)

# Mini batch functionality to train a regressor

In [9]:
batch_size = 50
num_batches = 100
n_epoch = 100
for epoch in range(n_epoch):
    for _ in range(num_batches):
        sample_df = next(gen)
        X_batch = sample_df.drop('sellingprice', axis=1)
        y_batch = sample_df['sellingprice']

        # Transform the mini batch features into required format
        X_batch_final = processing_pipeline.transform(X_batch)

        # Replacing thr nulls 
        X_batch_final = np.nan_to_num(X_batch_final)
        # fitting the data against the model
        SGD.partial_fit(X_batch_final, y_batch)

# Save the model

In [10]:

import joblib
joblib.dump(SGD, 'model.pkl')

# Load the model
loaded_SGD = joblib.load('model.pkl')

# Preprocess the test data
X_test_cleaned = processing_pipeline.transform(X_test)

# Test the model on X_test
y_pred = loaded_SGD.predict(X_test_cleaned)

# Print the predicted values
print(y_pred)

[13424.33193609 19702.21979142 14886.93071012 ... 20297.3584998
 19616.55606032 16599.98314149]


In [11]:
score = loaded_SGD.score(X_test_cleaned,y_test) 
print('The Score of the model is:',score)

The Score of the model is: 0.8322582834638906
