In [1]:
import seaborn as sns

# Load seaborn's "tips" example dataset and preview its first five rows.
df = sns.load_dataset(name = 'tips')
df.head(n = 5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [7]:
from sklearn.preprocessing import LabelEncoder

# Replace the labels in `time` with integer class codes so the column can be
# used as the classification target.
encoder = LabelEncoder()
time_labels = df['time']
df['time'] = encoder.fit_transform(time_labels)

In [8]:
# Target is the encoded `time` column; every other column is a feature.
y = df['time']
X = df.drop(columns = 'time')

In [10]:
from sklearn.model_selection import train_test_split as tts

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = tts(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)

## Pipeline
A pipeline automates the machine learning workflow by chaining the data-transformation steps so that they are applied in a fixed order before the data is fed into a model.

In [12]:
from sklearn.impute import SimpleImputer # Handle missing values
from sklearn.preprocessing import OneHotEncoder # Handle categorical values
from sklearn.preprocessing import StandardScaler # Feature scaling

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [14]:
# Feature columns grouped by the kind of preprocessing they receive.
numerical_cols = [
    'total_bill',
    'tip',
    'size',
]
categorical_cols = [
    'sex',
    'smoker',
    'day',
]

In [15]:
# Feature Engineering Automation

# Numeric features: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy = 'median')), # Handling missing values
        ('scaler', StandardScaler()) # Feature scaling
    ]
)

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown = 'ignore' makes a category that was not
# seen during fit encode as all zeros at transform time instead of raising.
cat_pipeline = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy = 'most_frequent')), # Handling missing values
        ('oneHotEncoder', OneHotEncoder(handle_unknown = 'ignore')) # Handling categorical values
    ]
)

In [16]:
# Route each group of columns through its matching sub-pipeline; the results
# are concatenated in the order listed (numeric first, then categorical).
preprocessor = ColumnTransformer(
    transformers = [
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)

In [18]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the test split.
# NOTE: both names are rebound to the transformed output, so re-run the
# train/test split cell before re-running this one.
X_train, X_test = (
    preprocessor.fit_transform(X_train),
    preprocessor.transform(X_test),
)

# Model Training and Comparison

In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Candidate classifiers to compare, keyed by display name. The tree-based
# models are seeded so that repeated runs produce the same scores.
models = {
    'Random Forest' : RandomForestClassifier(random_state = 42),
    'Decision Tree' : DecisionTreeClassifier(random_state = 42),
    'Support Vector Machine' : SVC()
}

In [45]:
from sklearn.metrics import accuracy_score

In [46]:
def evaluate_model(X_train, y_train, X_test, y_test, models) :
    """Fit every model on the training data and score it on the test data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to each model's ``fit``.
    X_test, y_test
        Test features passed to ``predict`` and the true labels the
        predictions are compared against.
    models : dict
        Mapping of display name to an estimator exposing ``fit`` and
        ``predict``. Each estimator is fitted in place.

    Returns
    -------
    dict
        Mapping of model name to its accuracy on the test data, in the same
        order as ``models``. Empty when ``models`` is empty.
    """
    report = {}
    # Fit and score each estimator under its display name.
    for name, model in models.items() :
        # Train model
        model.fit(X_train, y_train)

        # Predict testing data
        y_test_pred = model.predict(X_test)

        # Accuracy of the test-data predictions
        report[name] = accuracy_score(y_test, y_test_pred)

    return report

In [47]:
# Score every candidate model on the held-out test split.
evaluate_model(
    X_train = X_train,
    y_train = y_train,
    X_test = X_test,
    y_test = y_test,
    models = models,
)

{'Random Forest': 0.9591836734693877,
 'Decision Tree': 0.9387755102040817,
 'Support Vector Machine': 0.9591836734693877}

## Hyperparameter Tuning

In [49]:
# Base estimator for the search, seeded so each candidate's fit is reproducible.
classifier = RandomForestClassifier(random_state = 42)

# Hyperparameter values the search samples from.
params = {
    'criterion' : ['gini', 'entropy'],
    'n_estimators' : [100, 200, 300],
    'max_depth' : [3, 5, 10, None],
}

In [51]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search over `params` with 5-fold cross-validation, scored by
# accuracy. random_state fixes which parameter combinations are sampled so
# the search is repeatable.
clf = RandomizedSearchCV(
    classifier,
    param_distributions = params,
    cv = 5,
    scoring = 'accuracy',
    verbose = 3,
    random_state = 42,
)
clf.fit(X_train, y_train)

# With the default refit setting, predict uses the best estimator refitted on
# the whole training set.
y_test_prediction = clf.predict(X_test)

print("Best Parameters:", clf.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=gini, max_depth=None, n_estimators=100;, score=0.974 total time=   0.1s
[CV 2/5] END criterion=gini, max_depth=None, n_estimators=100;, score=0.923 total time=   0.1s
[CV 3/5] END criterion=gini, max_depth=None, n_estimators=100;, score=0.974 total time=   0.1s
[CV 4/5] END criterion=gini, max_depth=None, n_estimators=100;, score=0.923 total time=   0.1s
[CV 5/5] END criterion=gini, max_depth=None, n_estimators=100;, score=0.923 total time=   0.1s
[CV 1/5] END criterion=entropy, max_depth=5, n_estimators=200;, score=0.974 total time=   0.4s
[CV 2/5] END criterion=entropy, max_depth=5, n_estimators=200;, score=0.923 total time=   0.3s
[CV 3/5] END criterion=entropy, max_depth=5, n_estimators=200;, score=1.000 total time=   0.3s
[CV 4/5] END criterion=entropy, max_depth=5, n_estimators=200;, score=0.923 total time=   0.2s
[CV 5/5] END criterion=entropy, max_depth=5, n_estimators=200;, score=0.949 total ti