<a href="https://colab.research.google.com/github/Sankytanky100/End_to_End_ML_Project/blob/main/ML_Pipeline_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

## Loading the dataset
# Column names are passed explicitly via `names=` when reading the CSV.
columns = ["sex","length","diam","height","whole","shucked","viscera","shell","age"]
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data",names=columns)
## Defining target and predictor variables
y = df.age
X = df.drop(columns=['age'])

## Numerical columns:
num_cols = X.select_dtypes(include=np.number).columns
## Categorical columns
cat_cols = X.select_dtypes(include=['object']).columns

## Create some missing values
# Blank out up to 1000 randomly chosen cells (row and column are drawn with
# replacement, so the same cell can be hit twice).
# NOTE: the random generator is not seeded, so the NaN positions -- and the
# numbers printed below -- differ from run to run.
for _ in range(1000):
    X.loc[np.random.choice(X.index),np.random.choice(X.columns)] = np.nan

## Perform train-test split
x_train, x_test, y_train, y_test = train_test_split(X,y, random_state=0, test_size=0.25)

#####-------Imputation and Scaling: Code base to transform -----------------#####
## Numerical training data
x_train_num = x_train[num_cols]
# Column means of the TRAINING data only; they are reused to fill the test set
# so that no statistic is learned from test rows.
train_num_means = x_train_num.mean()
# Filling in missing values with mean on numeric features only
x_train_fill_missing = x_train_num.fillna(train_num_means)
## Fitting standard scaler on x_train_fill_missing
scale = StandardScaler().fit(x_train_fill_missing)
## Scaling data after filling in missing values
x_train_fill_missing_scale = scale.transform(x_train_fill_missing)
## Same steps as above, but on the test set (train means, train-fitted scaler):
x_test_fill_missing = x_test[num_cols].fillna(train_num_means)
x_test_fill_missing_scale = scale.transform(x_test_fill_missing)
#####-------Imputation and Scaling: Code base to transform -----------------#####

#1. Rewrite using Pipelines!
pipeline = Pipeline([("imputer",SimpleImputer(strategy='mean')), ("scale",StandardScaler())])

#2. Fit the pipeline on the training data, then transform the test data
pipeline.fit(x_train[num_cols])
x_transform = pipeline.transform(x_test[num_cols])

# 3. Verify the pipeline output on the test set matches the manual result.
# np.allclose is used instead of np.array_equal: the manual path (pandas mean)
# and SimpleImputer may sum the columns in a different order, so the two
# results can differ in the last floating-point bits while being equivalent.
array_diff = np.allclose(x_transform, x_test_fill_missing_scale)
print(array_diff)

#4. Change imputer strategy to median
pipeline_median = Pipeline([("imputer",SimpleImputer(strategy='median')), ("scale",StandardScaler())])
pipeline_median.fit(x_train[num_cols])

# 5 Compare results between the two pipelines
# Sum of absolute element-wise differences between the mean-imputed and
# median-imputed transforms of the test set.
x_transform_median = pipeline_median.transform(x_test[num_cols])
new_array_diff = np.abs(x_transform - x_transform_median).sum()
print(new_array_diff)

In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


# Column names are passed explicitly via `names=` when reading the CSV.
columns = ["sex","length","diam","height","whole","shucked","viscera","shell","age"]
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data",names=columns)

# Target and predictors
y = df.age
X = df.drop(columns=['age'])
num_cols = X.select_dtypes(include=np.number).columns
cat_cols = X.select_dtypes(include=['object']).columns
#create some missing values
# NOTE: the random generator is not seeded, so NaN positions differ per run.
for _ in range(1000):
    X.loc[np.random.choice(X.index),np.random.choice(X.columns)] = np.nan

x_train, x_test, y_train, y_test = train_test_split(X,y, random_state=0, test_size=0.25)
x_train_cat = x_train[cat_cols]
# First mode of the first categorical column in the TRAINING data; it is
# reused to fill the test set so no statistic is learned from test rows.
train_mode = x_train_cat.mode().values[0][0]
#fill missing values with mode on categorical features only
x_train_fill_missing = x_train_cat.fillna(train_mode)
#apply one hot encoding on x_train_fill_missing
# `sparse_output` replaces the `sparse` keyword, which was renamed in
# scikit-learn 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
ohe = OneHotEncoder(sparse_output=False, drop='first').fit(x_train_fill_missing)
#transform data after filling in missing values
x_train_fill_missing_ohe = ohe.transform(x_train_fill_missing)

#Now want to do the same thing on the test set!
x_test_fill_missing = x_test[cat_cols].fillna(train_mode)
x_test_fill_missing_ohe = ohe.transform(x_test_fill_missing)

#1. Rewrite using Pipelines!
pipeline = Pipeline([("imputer",SimpleImputer(strategy='most_frequent')), ("ohe",OneHotEncoder(sparse_output=False, drop='first'))])


#2. Fit the pipeline and transform the test data (categorical columns only!)
pipeline.fit(x_train[cat_cols])
x_transform = pipeline.transform(x_test[cat_cols])

#3. Check if the two arrays are the same using np.array_equal()
check_arrays = np.array_equal(x_transform,x_test_fill_missing_ohe)
print('Are the arrays equal?')
print(check_arrays)


In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Column names are passed explicitly via `names=` when reading the CSV.
columns = ["sex","length","diam","height","whole","shucked","viscera","shell","age"]
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data",names=columns)

# Target and predictors
y = df.age
X = df.drop(columns=['age'])
num_cols = X.select_dtypes(include=np.number).columns
cat_cols = X.select_dtypes(include=['object']).columns
#create some missing values
# NOTE: the random generator is not seeded, so NaN positions differ per run.
for _ in range(1000):
    X.loc[np.random.choice(X.index),np.random.choice(X.columns)] = np.nan

x_train, x_test, y_train, y_test = train_test_split(X,y, random_state=0, test_size=0.25)

#1. Create a pipeline `num_vals` to process numerical data
# SimpleImputer() is used with its default strategy, followed by standard scaling.
num_vals = Pipeline([("imputer",SimpleImputer()), ("scale",StandardScaler())])

#2. Create a pipeline `cat_vals` to process categorical data
# `sparse_output` replaces the `sparse` keyword, which was renamed in
# scikit-learn 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
cat_vals = Pipeline([("imputer",SimpleImputer(strategy='most_frequent')), ("ohe",OneHotEncoder(drop='first', sparse_output=False))])


#3. Create a column transformer, `preprocess` with the numerical and categorical pipelines
# Each pipeline is applied only to its own column subset.
preprocess = ColumnTransformer(
    transformers=[
        ("num_preprocess", num_vals, num_cols),
        ("cat_preprocess", cat_vals, cat_cols),
    ]
)


#4. Fit the preprocess transformer to training data
preprocess.fit(x_train)
#Transform the test data
x_transform = preprocess.transform(x_test)




