In [1]:
import seaborn as sns

# Load seaborn's example "tips" dataset into a DataFrame.
DATASET_NAME = 'tips'
df = sns.load_dataset(DATASET_NAME)

# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [7]:
from sklearn.preprocessing import LabelEncoder

# Encode the target column `time` as integer class labels.
# The fitted encoder is kept so the integer codes can be mapped back to the
# original labels later (see `encoder.classes_` / `encoder.inverse_transform`).
encoder = LabelEncoder().fit(df['time'])

# NOTE: this overwrites the original string labels in `df` in place.
df['time'] = encoder.transform(df['time'])

In [8]:
# Target: the (already label-encoded) `time` column.
y = df['time']

# Features: every remaining column of the frame.
X = df.drop(columns=['time'])

In [10]:
from sklearn.model_selection import train_test_split as tts

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = tts(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Pipeline
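A pipeline automates the machine learning workflow by chaining the data-transformation steps (imputing missing values, encoding categorical features, scaling numerical features) so that they are applied consistently to the data before it is fed into a model.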

In [12]:
from sklearn.impute import SimpleImputer # Handle missing values
from sklearn.preprocessing import OneHotEncoder # Handle categorical values
from sklearn.preprocessing import StandardScaler # Feature scaling

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [14]:
# Feature columns routed to the categorical preprocessing pipeline.
categorical_cols = [
    'sex',
    'smoker',
    'day',
]

# Feature columns routed to the numerical preprocessing pipeline.
numerical_cols = [
    'total_bill',
    'tip',
    'size',
]

In [15]:
# Feature Engineering Automation

# Numerical features: impute missing values, then scale.
num_pipeline = Pipeline([
    # Replace missing values with the column median.
    ('imputer', SimpleImputer(strategy='median')),
    # Standardize each feature.
    ('scaler', StandardScaler()),
])

# Categorical features: impute missing values, then one-hot encode.
cat_pipeline = Pipeline(
    steps = [
        # Replace missing values with the most frequent category of the column.
        ('imputer', SimpleImputer(strategy = 'most_frequent')),
        # One-hot encode the categories. `handle_unknown='ignore'` makes a
        # category that was not present when the encoder was fitted encode as
        # all zeros instead of raising an error at transform time (e.g. when a
        # rare category only appears in the test split).
        ('oneHotEncoder', OneHotEncoder(handle_unknown = 'ignore'))
    ]
)

In [16]:
# Route each group of columns through its own preprocessing pipeline and
# concatenate the results into a single feature matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)

In [18]:
# Fit the preprocessor on the training split only, then apply the already
# fitted preprocessor to the test split, so the test data is never used for
# fitting.
# NOTE(review): both names are rebound to the preprocessor's output, so
# re-running this cell without first re-running the train/test split cell feeds
# already-transformed data back into the preprocessor. Consider separate names
# for the transformed data if later cells allow it.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Random Forest Classifier

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Candidate models keyed by a human-readable display name.
# This must be a dict literal (curly braces): `key : value` pairs inside
# parentheses are a SyntaxError.
models = {
    # Seeded so results are reproducible, matching the seed used for the
    # train/test split.
    'Random Forest' : RandomForestClassifier(random_state = 42)
}