# **READING DATA**

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
cars = pd.read_csv('data/used_cars_data2.csv')
cars.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,New_Price,Price,Fuel_Type_Electric,Fuel_Type_Petrol,Transmission_Manual,Owner_Type_Fourth & Above,Owner_Type_Second,Owner_Type_Third
0,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,19.67,1582.0,126.2,5.0,16.06,12.5,False,False,True,False,False,False
1,Honda Jazz V,Chennai,2011,46000,18.2,1199.0,88.7,5.0,8.61,4.5,False,True,True,False,False,False
2,Maruti Ertiga VDI,Chennai,2012,87000,20.77,1248.0,88.76,7.0,11.27,6.0,False,False,True,False,False,False
3,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,15.2,1968.0,140.8,5.0,53.14,17.74,False,False,False,False,True,False
4,Nissan Micra Diesel XV,Jaipur,2013,86999,23.08,1461.0,63.1,5.0,9.47,3.5,False,False,True,False,False,False


# **SKLEARN STANDARD SCALER**

In [4]:
X = cars.drop(["Name", "Location", "Price"], axis=1)
y = cars["Price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

TO NOTE: BEFORE PREPROCESSING, TRAIN TEST SPLIT. YOU ARE LEARNING PREPROCESSING FROM TRAIN DATA

In [5]:
print(f"mean: {X_train['Power'].mean()}; std: {X_train['Power'].std()}")

mean: 112.73469645700636; std: 52.7607709047988


In [6]:
from sklearn.preprocessing import StandardScaler
car_Scaler = StandardScaler()
car_Scaler.fit(X_train)

In [7]:
X_train_scaled = car_Scaler.transform(X_train)
print(f"scaled mean: {X_train_scaled.mean()}; scaled std: {X_train_scaled.std()}")

scaled mean: 9.601987668144666e-16; scaled std: 1.0


In [8]:
type(X_train_scaled)

numpy.ndarray

# **ROBUST SCALERS**

Used when data has lots of outliers from the bell curve.
* Uses the median
* Uses IQR to figure out how to scale

In [9]:
from sklearn.preprocessing import RobustScaler, StandardScaler
# 20 normally distributed points with mean 5 and std 3
data = np.random.normal(5, 3, 20)
df1 = pd.DataFrame({"data": data})
print(df1.describe())

            data
count  20.000000
mean    5.506595
std     3.325190
min     0.149702
25%     2.790931
50%     6.174011
75%     7.953566
max    13.591146


In [10]:
# some outliers
outliers = np.array([150, 600, 900])
df2 = pd.DataFrame({
    "data2": np.append(data, outliers)
})
print(df2.describe())

            data2
count   23.000000
mean    76.527474
std    219.360443
min      0.149702
25%      2.808131
50%      6.359417
75%      8.223790
max    900.000000


In [11]:
robust_scaler = RobustScaler().fit(df2)
robust_scaled_data = robust_scaler.transform(df2)

In [12]:
robust_scaled_df = pd.DataFrame({"data": robust_scaled_data.reshape(-1)})
robust_scaled_df.describe()

Unnamed: 0,data
count,23.0
mean,12.956514
std,40.50485
min,-1.146622
25%,-0.655744
50%,0.0
75%,0.344256
max,165.010507


As we can see, it preserves outliers and does not crush
### Lets try regular standard scaler

In [13]:
standard_scaler = StandardScaler().fit(df2)
standard_scaled_data = standard_scaler.transform(df2)
standard_scaled_df = pd.DataFrame({"data": standard_scaled_data.reshape(-1)})
standard_scaled_df.describe()

Unnamed: 0,data
count,23.0
mean,-5.792468000000001e-17
std,1.022475
min,-0.3560092
25%,-0.3436179
50%,-0.3270647
75%,-0.3183746
max,3.838339


looks weird idk.
Also another scaler called MaxAbs Scaler, used for sparceness

# **PIPELINES**

In [14]:
data = pd.read_csv("./data/diabetes.csv")

In [15]:
# Glucose, BMI, Insulin, Skin Thickness, Blood Pressure contains values which are 0
data.loc[data.Glucose == 0, 'Glucose'] = data.Glucose.median()
data.loc[data.BMI == 0, 'BMI'] = data.BMI.median()
data.loc[data.Insulin == 0, 'Insulin'] = data.Insulin.median()
data.loc[data.SkinThickness == 0, 'SkinThickness'] = data.SkinThickness.median()
data.loc[data.BloodPressure == 0, 'BloodPressure'] = data.BloodPressure.median()

  data.loc[data.Insulin == 0, 'Insulin'] = data.Insulin.median()


In [16]:
# x are the dependent variables and y is the target variable
X = data.drop('Outcome',axis=1)
y = data['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=1)

In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline

In [18]:
pipe_line = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=13))

In [19]:
pipe_line.fit(X_train, y_train)

In [20]:
print(pipe_line.score(X_test, y_test))

0.7532467532467533


We call the score on pipe_line! It's a model score of the whole pipeline! Not just KNN or SS, all of it!

In [21]:
from sklearn.metrics import classification_report
print(classification_report(y_test, pipe_line.predict(X_test)))

              precision    recall  f1-score   support

           0       0.76      0.90      0.83       150
           1       0.72      0.48      0.58        81

    accuracy                           0.75       231
   macro avg       0.74      0.69      0.70       231
weighted avg       0.75      0.75      0.74       231



In [22]:
from sklearn.pipeline import Pipeline

p = Pipeline([
    ('scale', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])

**If we want to specify a parameter WITHIN the pipeline, we use a double underscore after the step, then the parameter you want to change.**

In [23]:
param_grid = {
    "knn__n_neighbors": np.arange(1, 100)
}

In [24]:
from sklearn.model_selection import GridSearchCV

search = GridSearchCV(p, param_grid, n_jobs=-1)
search.fit(X_train, y_train)
print(f"Score with best parameters: {search.best_score_}")
print(search.best_params_)

Score with best parameters: 0.7820872274143303
{'knn__n_neighbors': 19}


# **Pipeline With A Custom sklearn Model to Search Across Models**

In [33]:
from sklearn.base import BaseEstimator
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

class MultiModelClassifier(BaseEstimator):
    """
    A custom Estimator class that can be constructed with different model types.
    For details on implementing custom Estimators,
    see: https://scikit-learn.org/stable/developers/develop.html
    """

    def __init__(self, model=KNeighborsClassifier()):
        """
        A custom estimator parameterized by the model.
        Pass the result of an estimator constructor for `model`. By default,
        it uses the KNeighborsClassifier().
        """
        self.model = model

    def fit(self, X, y=None, **kwargs):
        self.model.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.model.predict(X)

    def predict_proba(self, X):
        return self.model.predict_proba(X)

    def score(self, X, y):
        return self.model.score(X, y)

In [34]:
p2 = Pipeline([
    ('scale', StandardScaler()),
    ('mmc', MultiModelClassifier()),
])

In [35]:
param_grid = [
    {
        "mmc__model": [KNeighborsClassifier()],
        "mmc__model__n_neighbors": np.arange(1, 100)
    },
    {
        "mmc__model": [RandomForestClassifier()],
        "mmc__model__n_estimators": np.arange(start=20, stop=150, step=3),
    },
]

In [36]:
from sklearn.metrics import accuracy_score
gscv2 = GridSearchCV(p2, param_grid, cv=5)
gscv2.fit(X_train, y_train)

In [37]:
print("scaling best params: ", gscv2.best_params_)
accuracy_test2 = accuracy_score(y_test, gscv2.best_estimator_.predict(X_test))
print(f'Accuracy of best estimator WITH SCALING on test data is: {accuracy_test2}')

scaling best params:  {'mmc__model': RandomForestClassifier(), 'mmc__model__n_estimators': 26}
Accuracy of best estimator WITH SCALING on test data is: 0.7575757575757576


### This is the coolest thing I have ever seen. Would have saved me so much time.....

# **Model Persistence**

Getting into ML ops here. We need a method for saving and quickly reconstituting it. PICKLE? Pickle only works with python, does NOT work when compiling in JAVA. (BTW 'wb' means 'writing bytes')

In [30]:
model = gscv2.best_estimator_

import pickle
with open('my_gscv2_model', 'wb') as f:
    pickle.dump(gscv2, f)

In [31]:
# load the model from disk:
with open('my_gscv2_model', 'rb') as f:
    model = pickle.load(f)

In [32]:
from sklearn.metrics import accuracy_score

accuracy_test=accuracy_score(y_test, model.predict(X_test))
print('Accuracy of loaded model from disk on test data is : {:.2}'.format(accuracy_test))

Accuracy of loaded model from disk on test data is : 0.74


# **Persisting Model Collections with Pickle and Checkpointing Large Searches**

For super large searches, like days at a time, you can do checkpoints! You can stop and restart searches!! You can save intermediate results.

1. Start a grid search program to begin the search.

2. As the search progresses, periodically write the intermediate results to a file using pickle. For example, if the search is considering multiple model types (Logistic Regression, Naive Bayes, Decision Trees, etc.), the program could write the best result for each model type to the file once it completes that search.

3. Stop the program at any time, for example, when computing resources are not available.

4. When the program starts back up again, the first thing it does is check the file to see what models have already been searched through. It then crafts the grid search to pick up where it last left off.

In [40]:
def get_state():
    # load the saved state from disk
    with open("state", 'rb') as f:
        try:
            current_state = pickle.load(f)
        except:
            current_state = None

def save_state(d):
    with open("state", 'wb') as f:
        pickle.dump(d, f)

In [41]:
def add_model_to_state(current_state, model_type, best_model):
    current_state[model_type] = best_model
    return current_state

In [42]:
def get_next_param_grid(current_state):

    # list of models we are interested in training
    models = ["knn", "rf", "nb", "lr"]

    # full param grid that we want to search...
    full_param_grid = {
      "knn":
        {
            "mmc__model": [KNeighborsClassifier()],
            "mmc__model__n_neighbors": np.arange(1, 100)
        },
      "rf":
        {
            "mmc__model": [RandomForestClassifier()],
            "mmc__model__n_estimators": np.arange(start=20, stop=150, step=3),
        },
        # additional entries here...
    }

    for model in models:
        # if the model is already in the current state, then skip it -- we've already
        # searched it previously.
        if model in current_state.keys():
            continue
        # otherwise, we've found the next param grid to search:
        return full_param_grid[model]

    # terminating condition -- if all models have been trained, we're done
    return None

Then, our main program is a loop where we iteratively:

1. Read the file

2. Get the next param grid

3. Train and save the best fit model using GridSearchCV and the save_state function

In [None]:
def main():
    while True:
        current_state = get_state()
        param_grid = get_next_param_grid(current_state)
        if param_grid is None:
            break
        train_and_save_param_grid(param_grid) # ToDo: implement...