In [2]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_selector

from sklearn.model_selection import cross_validate

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

import numpy as np
import os
import pandas as pd

# Data


In [3]:
data_path = "../data"
DATA = []
for file in os.listdir(data_path):
    DATA.append(pd.read_csv(os.path.join(data_path,file)))

In [4]:
for data in DATA:
    print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           5000 non-null   object 
 1   Location       5000 non-null   object 
 2   MinTemp        4975 non-null   float64
 3   MaxTemp        4993 non-null   float64
 4   Rainfall       4955 non-null   float64
 5   Evaporation    2853 non-null   float64
 6   Sunshine       2612 non-null   float64
 7   WindGustDir    4665 non-null   object 
 8   WindGustSpeed  4667 non-null   float64
 9   WindDir9am     4608 non-null   object 
 10  WindDir3pm     4868 non-null   object 
 11  WindSpeed9am   4961 non-null   float64
 12  WindSpeed3pm   4914 non-null   float64
 13  Humidity9am    4943 non-null   float64
 14  Humidity3pm    4882 non-null   float64
 15  Pressure9am    4493 non-null   float64
 16  Pressure3pm    4496 non-null   float64
 17  Cloud9am       3103 non-null   float64
 18  Cloud3pm

# Pipeline

In [5]:
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer()),
    ('scale', MinMaxScaler())
])

cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy="most_frequent")),
    ('one-hot', OneHotEncoder(handle_unknown='ignore'))
])


col_trans = ColumnTransformer([
    ('num_pipeline', num_pipeline, make_column_selector(dtype_include = np.number)),
    ('cat_pipeline', cat_pipeline, make_column_selector(dtype_include = np.object_))
])


In [6]:
classifiers = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    XGBClassifier(),
]

pipelines = []

In [7]:
for classifier in classifiers:
    pipelines.append((type(classifier), Pipeline([("transformer", col_trans), ("model", classifier)])))

In [8]:
from sklearn.calibration import LabelEncoder


scores = []

for pipe in pipelines:
    pipe_score = []
    for data in DATA:
        score = cross_validate(pipe[1], data.iloc[:, :-1], LabelEncoder().fit_transform(data.iloc[:,-1]), cv = 5, scoring="accuracy")
        pipe_score.append(score["test_score"].mean())

    scores.append((pipe[0], pipe_score))

In [9]:
os.listdir("../data")

['weather_final.csv',
 'flights_final.csv',
 'banking_final.csv',
 'mushrooms_final.csv']

In [10]:
scores

[(sklearn.tree._classes.DecisionTreeClassifier,
  [0.7956000000000001, 0.9077999999999999, 0.8692, 0.9795999999999999]),
 (sklearn.ensemble._forest.RandomForestClassifier,
  [0.8336, 0.9428000000000001, 0.9034000000000001, 0.9996]),
 (xgboost.sklearn.XGBClassifier,
  [0.8368, 0.9469999999999998, 0.9016, 0.9974000000000001])]

# Random Searching

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

best_model = []
best_score = []

param_distributions = [
    {
        "model__max_depth": randint(1,30),
        "model__min_samples_split":randint(1,60),
        "model__criterion":["gini","entropy"],
        "model__min_samples_leaf":randint(1,60)
    },
    {
        "model__n_estimators":randint(1,2000),
        "model__min_samples_leaf":uniform(0,1),
        "model__min_samples_split":uniform(0,1),
        "model__max_features":uniform(0,1)
    }
]
#best_params_decisionTree = [[],[],[],[]]
for i,pipe in enumerate(pipelines):
    pipe_best_models = []
    pipe_best_scores = []
    for j,data in enumerate(DATA):
        #data.iloc[:, :-1],LabelEncoder().fit_transform(data.iloc[:,-1]),
        rs = RandomizedSearchCV(pipe[1], 
                                param_distributions= param_distributions[i],
                                verbose=True,
                                )
        rs.fit(data.iloc[:, :-1],LabelEncoder().fit_transform(data.iloc[:,-1]))
        pipe_best_scores.append(rs.best_score_)
        pipe_best_models.append(rs.best_estimator_)
       #best_params_decisionTree[j].append(rs.best_params_)
    best_score.append((pipe[0],pipe_best_scores))
    best_model.append((pipe[0],pipe_best_models))
        

In [19]:
#best_params_decisionTree

[[{'model__criterion': 'entropy',
   'model__max_depth': 5,
   'model__min_samples_leaf': 59,
   'model__min_samples_split': 27}],
 [{'model__criterion': 'gini',
   'model__max_depth': 14,
   'model__min_samples_leaf': 11,
   'model__min_samples_split': 43}],
 [{'model__criterion': 'entropy',
   'model__max_depth': 27,
   'model__min_samples_leaf': 1,
   'model__min_samples_split': 15}],
 [{'model__criterion': 'gini',
   'model__max_depth': 7,
   'model__min_samples_leaf': 43,
   'model__min_samples_split': 44}]]

In [15]:
best_score

[(sklearn.tree._classes.DecisionTreeClassifier,
  [0.827, 0.9206, 0.8966000000000001, 0.9663999999999999]),
 (sklearn.ensemble._forest.RandomForestClassifier,
  [0.7758, 0.836, 0.883, 0.6804])]

In [16]:
best_model

[(sklearn.tree._classes.DecisionTreeClassifier,
  [Pipeline(steps=[('transformer',
                    ColumnTransformer(transformers=[('num_pipeline',
                                                     Pipeline(steps=[('impute',
                                                                      SimpleImputer()),
                                                                     ('scale',
                                                                      MinMaxScaler())]),
                                                     <sklearn.compose._column_transformer.make_column_selector object at 0x1313cad10>),
                                                    ('cat_pipeline',
                                                     Pipeline(steps=[('impute',
                                                                      SimpleImputer(strategy='most_frequent')),
                                                                     ('one-hot',
                                  