# **READING DATA**

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [6]:
cars = pd.read_csv('data/used_cars_data2.csv')
cars.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,New_Price,Price,Fuel_Type_Electric,Fuel_Type_Petrol,Transmission_Manual,Owner_Type_Fourth & Above,Owner_Type_Second,Owner_Type_Third
0,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,19.67,1582.0,126.2,5.0,16.06,12.5,False,False,True,False,False,False
1,Honda Jazz V,Chennai,2011,46000,18.2,1199.0,88.7,5.0,8.61,4.5,False,True,True,False,False,False
2,Maruti Ertiga VDI,Chennai,2012,87000,20.77,1248.0,88.76,7.0,11.27,6.0,False,False,True,False,False,False
3,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,15.2,1968.0,140.8,5.0,53.14,17.74,False,False,False,False,True,False
4,Nissan Micra Diesel XV,Jaipur,2013,86999,23.08,1461.0,63.1,5.0,9.47,3.5,False,False,True,False,False,False


# **SKLEARN STANDARD SCALER**

In [8]:
X = cars.drop(["Name", "Location", "Price"], axis=1)
y = cars["Price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

TO NOTE: BEFORE PREPROCESSING, TRAIN TEST SPLIT. YOU ARE LEARNING PREPROCESSING FROM TRAIN DATA

In [9]:
print(f"mean: {X_train['Power'].mean()}; std: {X_train['Power'].std()}")

mean: 112.73469645700636; std: 52.7607709047988


In [10]:
from sklearn.preprocessing import StandardScaler
car_Scaler = StandardScaler()
car_Scaler.fit(X_train)

In [11]:
X_train_scaled = car_Scaler.transform(X_train)
print(f"scaled mean: {X_train_scaled.mean()}; scaled std: {X_train_scaled.std()}")

scaled mean: 9.601987668144666e-16; scaled std: 1.0


In [12]:
type(X_train_scaled)

numpy.ndarray

# **ROBUST SCALERS**

Used when data has lots of outliers from the bell curve.
* Uses the median
* Uses IQR to figure out how to scale

In [14]:
from sklearn.preprocessing import RobustScaler, StandardScaler
# 20 normally distributed points with mean 5 and std 3
data = np.random.normal(5, 3, 20)
df1 = pd.DataFrame({"data": data})
print(df1.describe())

            data
count  20.000000
mean    5.917915
std     2.886792
min    -0.044297
25%     5.041117
50%     5.756828
75%     6.918347
max    11.145857


In [15]:
# some outliers
outliers = np.array([150, 600, 900])
df2 = pd.DataFrame({
    "data2": np.append(data, outliers)
})
print(df2.describe())

            data2
count   23.000000
mean    76.885144
std    219.234027
min     -0.044297
25%      5.186099
50%      6.387760
75%      9.014813
max    900.000000


In [16]:
robust_scaler = RobustScaler().fit(df2)
robust_scaled_data = robust_scaler.transform(df2)

In [18]:
robust_scaled_df = pd.DataFrame({"data": robust_scaled_data.reshape(-1)})
robust_scaled_df.describe()

Unnamed: 0,data
count,23.0
mean,18.412811
std,57.26049
min,-1.679952
25%,-0.313855
50%,0.0
75%,0.686145
max,233.397505


As we can see, it preserves outliers and does not crush
### Lets try regular standard scaler

In [19]:
standard_scaler = StandardScaler().fit(df2)
standard_scaled_data = standard_scaler.transform(df2)
standard_scaled_df = pd.DataFrame({"data": standard_scaled_data.reshape(-1)})
standard_scaled_df.describe()

Unnamed: 0,data
count,23.0
mean,-1.930823e-17
std,1.022475
min,-0.3587874
25%,-0.3343936
50%,-0.3287893
75%,-0.3165371
max,3.838885


looks weird idk.
Also another scaler called MaxAbs Scaler, used for sparceness

# **PIPELINES**

In [20]:
data = pd.read_csv("./data/diabetes.csv")

In [21]:
# Glucose, BMI, Insulin, Skin Thickness, Blood Pressure contains values which are 0
data.loc[data.Glucose == 0, 'Glucose'] = data.Glucose.median()
data.loc[data.BMI == 0, 'BMI'] = data.BMI.median()
data.loc[data.Insulin == 0, 'Insulin'] = data.Insulin.median()
data.loc[data.SkinThickness == 0, 'SkinThickness'] = data.SkinThickness.median()
data.loc[data.BloodPressure == 0, 'BloodPressure'] = data.BloodPressure.median()

  data.loc[data.Insulin == 0, 'Insulin'] = data.Insulin.median()


In [22]:
# x are the dependent variables and y is the target variable
X = data.drop('Outcome',axis=1)
y = data['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=1)

In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline

In [24]:
pipe_line = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=13))

In [25]:
pipe_line.fit(X_train, y_train)

In [26]:
print(pipe_line.score(X_test, y_test))

0.7532467532467533


We call the score on pipe_line! It's a model score of the whole pipeline! Not just KNN or SS, all of it!

In [27]:
from sklearn.metrics import classification_report
print(classification_report(y_test, pipe_line.predict(X_test)))

              precision    recall  f1-score   support

           0       0.76      0.90      0.83       150
           1       0.72      0.48      0.58        81

    accuracy                           0.75       231
   macro avg       0.74      0.69      0.70       231
weighted avg       0.75      0.75      0.74       231

