In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OrdinalEncoder

In [2]:
data = pd.read_csv('diabetic_data.csv')

data.describe()
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [3]:
def print_unique_elements(df, columns):
    for column in columns:
        try:
            print(f"\033[94m{column}\033[0m:")
            unique_counts = df[column].value_counts()
            i = 0
            for value, count in unique_counts.items():
                i = i + 1
                print(f"        \033[91m{value}\033[0m {count}")
        except KeyError:
            print(f"    Column '{column}' not found in the DataFrame.")

In [4]:
print (data.columns)

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')


In [13]:
data['admission_type_id'].replace([5,6,8], np.nan, inplace=True)
data['discharge_disposition_id'].replace([18,25,26], np.nan, inplace=True)
data['admission_source_id'].replace([9,15,17,20, 21], np.nan, inplace=True)

In [14]:

def print_missing_counts(df, null_values):
    for col in df.select_dtypes(include=['object']).columns:
        
        for val in null_values :
            count = df[col].value_counts().get(val, 0)
            if count > 0:
                print(f"The count of {val} in the column '{col}' is: {count} equals to {((count/df.shape[0])*100):.2f}% missing values.")

    nas = df.isna().sum()
    nas = nas[nas > 0]
    nas_percentage = nas / len(df) * 100
    for column in nas.index:
        print(f"{column}: Count = {nas[column]}, Percentage = {nas_percentage[column]:.2f}%")

print_missing_counts(data, ['?', 'Unknown/Invalid'])

The count of ? in the column 'race' is: 2273 equals to 2.23% missing values.
The count of Unknown/Invalid in the column 'gender' is: 3 equals to 0.00% missing values.
The count of ? in the column 'weight' is: 98569 equals to 96.86% missing values.
The count of ? in the column 'payer_code' is: 40256 equals to 39.56% missing values.
The count of ? in the column 'medical_specialty' is: 49949 equals to 49.08% missing values.
The count of ? in the column 'diag_1' is: 21 equals to 0.02% missing values.
The count of ? in the column 'diag_2' is: 358 equals to 0.35% missing values.
The count of ? in the column 'diag_3' is: 1423 equals to 1.40% missing values.
admission_type_id: Count = 10396, Percentage = 10.22%
discharge_disposition_id: Count = 4680, Percentage = 4.60%
admission_source_id: Count = 7067, Percentage = 6.94%
max_glu_serum: Count = 96420, Percentage = 94.75%
A1Cresult: Count = 84748, Percentage = 83.28%


In [15]:
print_unique_elements(data, ['tolbutamide'])

[94mtolbutamide[0m:
        [91mNo[0m 101743
        [91mSteady[0m 23


In [85]:
no_missing = data[data['gender'] != 'Unknown/Invalid']
no_missing['gender'].unique()

array(['Female', 'Male'], dtype=object)

this fields have significant number of missing values and dosen't have high importance in data so we can ignore them .

In [86]:
no_missing = no_missing.drop(columns=['max_glu_serum', 'A1Cresult', 'payer_code', 'weight',  'medical_specialty'])

but medical_specialty has high importance in data so we imputed it despite significant number of missing values .

In [87]:
def drop_columns(df, distinct_percentage, value_proportion_percentage):
    for col in df.columns:
        if df[col].nunique() / len(df) > distinct_percentage:
            print(f'column {col} dropped cause it has more than {distinct_percentage * 100}% unique values!')
            df = df.drop(col, axis=1)

    for col in df.columns:
        if any((df[col].value_counts() / len(df)) > value_proportion_percentage):
            print(f'column {col} dropped cause it has a value by more than {value_proportion_percentage * 100}% proportion!')
            df = df.drop(col, axis=1)

    return df

no_missing = drop_columns(no_missing, 0.90, 0.99)
no_missing.info()

column encounter_id dropped cause it has more than 90.0% unique values!
column nateglinide dropped cause it has a value by more than 99.0% proportion!
column chlorpropamide dropped cause it has a value by more than 99.0% proportion!
column acetohexamide dropped cause it has a value by more than 99.0% proportion!
column tolbutamide dropped cause it has a value by more than 99.0% proportion!
column acarbose dropped cause it has a value by more than 99.0% proportion!
column miglitol dropped cause it has a value by more than 99.0% proportion!
column troglitazone dropped cause it has a value by more than 99.0% proportion!
column tolazamide dropped cause it has a value by more than 99.0% proportion!
column examide dropped cause it has a value by more than 99.0% proportion!
column citoglipton dropped cause it has a value by more than 99.0% proportion!
column glyburide-metformin dropped cause it has a value by more than 99.0% proportion!
column glipizide-metformin dropped cause it has a value 

In [88]:
no_missing = no_missing.replace('?', np.nan)

In [89]:
print_missing_counts(no_missing, ['?', 'Unknown/Invalid'])

race: Count = 2271, Percentage = 2.23%
admission_type_id: Count = 10396, Percentage = 10.22%
discharge_disposition_id: Count = 4680, Percentage = 4.60%
admission_source_id: Count = 7067, Percentage = 6.94%
diag_1: Count = 21, Percentage = 0.02%
diag_2: Count = 358, Percentage = 0.35%
diag_3: Count = 1423, Percentage = 1.40%


In [90]:
print_missing_counts(no_missing, ['?'])

race: Count = 2271, Percentage = 2.23%
admission_type_id: Count = 10396, Percentage = 10.22%
discharge_disposition_id: Count = 4680, Percentage = 4.60%
admission_source_id: Count = 7067, Percentage = 6.94%
diag_1: Count = 21, Percentage = 0.02%
diag_2: Count = 358, Percentage = 0.35%
diag_3: Count = 1423, Percentage = 1.40%


In [91]:
import gc

def label_encode(df):
    label_encoders = {}

    object_columns = df.select_dtypes(include=['object']).columns

    for column in object_columns:
        le = LabelEncoder()
        
        non_null_mask = df[column].notnull()
        non_null_values = df[column][non_null_mask]
        
        encoded_values = le.fit_transform(non_null_values)
        
        full_encoded = pd.Series(index=df[column].index, data=np.nan, dtype=float)
        
        full_encoded[non_null_mask] = encoded_values
        
        df[column] = full_encoded
        
        del full_encoded
        
        gc.collect()
        
        print(f"{column} labels: ", dict(zip(le.classes_, le.transform(le.classes_))))
        label_encoders[column] = le

    return df, label_encoders


In [92]:
no_missing , label_encoders = label_encode(no_missing)

race labels:  {'AfricanAmerican': 0, 'Asian': 1, 'Caucasian': 2, 'Hispanic': 3, 'Other': 4}
gender labels:  {'Female': 0, 'Male': 1}
age labels:  {'[0-10)': 0, '[10-20)': 1, '[20-30)': 2, '[30-40)': 3, '[40-50)': 4, '[50-60)': 5, '[60-70)': 6, '[70-80)': 7, '[80-90)': 8, '[90-100)': 9}
diag_1 labels:  {'10': 0, '11': 1, '110': 2, '112': 3, '114': 4, '115': 5, '117': 6, '131': 7, '133': 8, '135': 9, '136': 10, '141': 11, '142': 12, '143': 13, '145': 14, '146': 15, '147': 16, '148': 17, '149': 18, '150': 19, '151': 20, '152': 21, '153': 22, '154': 23, '155': 24, '156': 25, '157': 26, '158': 27, '160': 28, '161': 29, '162': 30, '163': 31, '164': 32, '170': 33, '171': 34, '172': 35, '173': 36, '174': 37, '175': 38, '179': 39, '180': 40, '182': 41, '183': 42, '184': 43, '185': 44, '187': 45, '188': 46, '189': 47, '191': 48, '192': 49, '193': 50, '194': 51, '195': 52, '196': 53, '197': 54, '198': 55, '199': 56, '200': 57, '201': 58, '202': 59, '203': 60, '204': 61, '205': 62, '207': 63, '208

In [93]:
cop = no_missing.copy()

In [94]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

def impute_missing_values(df, columns_with_missing):
    for column in columns_with_missing:
        print (f'Imputing {column}')
        gc.collect()
        target_column = column
        not_null_df = df[df[target_column].notnull()]
        null_df = df[df[target_column].isnull()]

        X_train = not_null_df.drop(target_column, axis=1)
        y_train = not_null_df[target_column]
        X_test = null_df.drop(target_column, axis=1)
        
        if not X_test.empty:
            model = RandomForestClassifier()
            model.fit(X_train, y_train)

            predictions = model.predict(X_test)
            df.loc[df[target_column].isnull(), target_column] = predictions
    return df

no_missing = impute_missing_values(no_missing, ['race', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id'])
print_missing_counts(no_missing, ['?'])


Imputing race
Imputing admission_type_id
Imputing discharge_disposition_id
Imputing admission_source_id
diag_1: Count = 21, Percentage = 0.02%
diag_2: Count = 358, Percentage = 0.35%
diag_3: Count = 1423, Percentage = 1.40%


In [96]:
from sklearn.impute import KNNImputer

def impute_categorical_missing_values_knn(df, n_neighbors=5):
    imputer = KNNImputer(n_neighbors=n_neighbors)

    df_imputed = imputer.fit_transform(df)

    df_imputed = pd.DataFrame(df_imputed, columns=df.columns)

    return df_imputed

no_missing = impute_categorical_missing_values_knn(no_missing , 1)
print_missing_counts(no_missing, ['?'])


In [98]:
no_missing.drop_duplicates(subset=None, keep='first', inplace=True)

In [99]:
no_missing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101763 entries, 0 to 101762
Data columns (total 29 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   patient_nbr               101763 non-null  float64
 1   race                      101763 non-null  float64
 2   gender                    101763 non-null  float64
 3   age                       101763 non-null  float64
 4   admission_type_id         101763 non-null  float64
 5   discharge_disposition_id  101763 non-null  float64
 6   admission_source_id       101763 non-null  float64
 7   time_in_hospital          101763 non-null  float64
 8   num_lab_procedures        101763 non-null  float64
 9   num_procedures            101763 non-null  float64
 10  num_medications           101763 non-null  float64
 11  number_outpatient         101763 non-null  float64
 12  number_emergency          101763 non-null  float64
 13  number_inpatient          101763 non-null  f

this is an alternative way to impute the missing values for categorical features

no_missing.loc[:, 'race'] = no_missing['race'].replace('?', np.nan)

df_encoded = pd.get_dummies(no_missing)

imputer = KNNImputer(n_neighbors=3)
df_imputed = imputer.fit_transform(df_encoded)

df_imputed = pd.DataFrame(df_imputed, columns = df_encoded.columns)

no_missing.loc[:, 'race'] = df_imputed[['race_Caucasian', 'race_AfricanAmerican', 'race_Hispanic', 'race_Other', 'race_Asian']].idxmax(axis=1).str.replace('race_', '')
mode_value = no_missing['race'].mode()[0]
no_missing.loc[:, 'race'] = no_missing['race'].fillna(mode_value)
print_unique_elements(no_missing , ['race'])

now we try to detect outliers

In [124]:
from sklearn.ensemble import IsolationForest
import pandas as pd
import numpy as np

def detect_outliers(df, n_estimators, contamination, random_state):
    print (f'in outlier detection n_estimators is {n_estimators} and the contamination is {contamination}')
    df_copy = df.copy()

    iso = IsolationForest(n_estimators=n_estimators, contamination=contamination ,random_state=random_state)

    outlier_label = iso.fit_predict(df_copy)

    df_copy['outlier'] = outlier_label

    df_outliers = df_copy[df_copy['outlier'] == -1]

    df_copy = df_copy[df_copy['outlier'] == 1]
    df_copy.drop('outlier', axis=1, inplace=True)

    return df_copy, df_outliers

df_no_outliers, df_outliers = detect_outliers(no_missing, 50, 0.05,42)


in outlier detection n_estimators is 50 and the contamination is 0.05


lets normalize the data.

In [101]:
from sklearn.preprocessing import StandardScaler

def normalize_features(df):
    df_copy = df.copy()

    scaler = StandardScaler()

    df_normalized = pd.DataFrame(scaler.fit_transform(df_copy), columns=df_copy.columns)

    return df_normalized

df_normalized = normalize_features(df_no_outliers)


now lets go for dimentionality reduction step !

In [140]:
from sklearn.decomposition import PCA

def apply_pca(df, variance_ratio, random_state):
    print(f'in pca the variance ratio is {variance_ratio}')
    df_copy = df.copy()

    pca = PCA(n_components=variance_ratio, random_state=random_state)

    df_pca = pd.DataFrame(pca.fit_transform(df_copy))
    
    if (variance_ratio < 1):
        print (f'pca dimontions after pca : {df_pca.shape[1]}')
    else: 
        print(f'Total explained variance: {sum(pca.explained_variance_ratio_)}')
    
    return df_pca

variance_ratio = 0.95  
df_pca = apply_pca(df_normalized, 5, 42)


in pca the variance ratio is 5
Total explained variance: 0.314868627517442


In [118]:
num_dimensions = df_pca.shape[1]
print(f"The dimensionality of the data after PCA is: {num_dimensions}")

The dimensionality of the data after PCA is: 5


now let's implement the K-means clustering function.

In [158]:
from sklearn.cluster import KMeans

def fit_kmeans(df, n_clusters, random_state):
    print (f'in k-means the n_clusters is {n_clusters}')
    df_copy = df.copy()

    kmeans = KMeans(n_clusters=n_clusters, n_init='auto', init='k-means++', random_state=random_state)

    kmeans.fit(df_copy)

    df_copy['cluster'] = kmeans.labels_

    return df_copy, kmeans

n_clusters = 10
K_means_df , K_means_modle = fit_kmeans(df_pca , n_clusters, 42)

in k-means the n_clusters is 10


and also let's implement the DBScan clustering function

In [157]:
from sklearn.cluster import DBSCAN

def fit_dbscan(df, eps, min_samples):
    df_copy = df.copy()

    dbscan = DBSCAN(eps=eps, min_samples=min_samples)

    dbscan.fit(df_copy)

    df_copy['cluster'] = dbscan.labels_

    return df_copy, dbscan

eps = 0.1
min_samples = 10
dbscan_df , dbscan_modle = fit_dbscan(df_pca , eps, min_samples)


now lets fine best hyper parameters

In [119]:
from tqdm import tqdm
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import silhouette_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import IsolationForest

class kmeans_pipe_hyper_parameter_tuner:
    
    def __init__(self, n_estimators, contamination, variance_ratio, n_clusters, df, random_state):
        self.n_estimators = n_estimators
        self.contamination = contamination
        self.variance_ratio = variance_ratio
        self.n_clusters = n_clusters
        self.df = df
        self.random_state=random_state
        
    def fit(self):
        df_no_outliers, df_outliers = detect_outliers(self.df, self.n_estimators, self.contamination, self.random_state)
        df_normalized = normalize_features(df_no_outliers)
        df_pca = apply_pca(df_normalized, self.variance_ratio, self.random_state)
        K_means_df , K_means_modle = fit_kmeans(df_pca , n_clusters, self.random_state)
        self.model = K_means_modle
        self.PCA_df = df_pca
        
    


In [122]:
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import silhouette_score
from tqdm import tqdm

param_grid = {
    'n_estimators': [10, 25, 50],
    'contamination': [0.01, 0.05, 0.1],
    'variance_ratio': [0.9, 0.95],
    'n_clusters': [3, 5, 6, 8, 9, 10]
}

grid = ParameterGrid(param_grid)

best_score = -1
best_params = None

pbar = tqdm(total=len(grid), desc='Hyperparameter Tuning', ncols=80)

for params in grid:
    tuner = kmeans_pipe_hyper_parameter_tuner(**params, df=no_missing, random_state=42)
    tuner.fit()
    score = silhouette_score(tuner.PCA_df, tuner.model.labels_)
    
    print (f'for params {params} got the score of : {score}')
    
    if score > best_score:
        best_score = score
        best_params = params

    pbar.update()

pbar.close()

print(f"Best score: {best_score}")
print(f"Best parameters: {best_params}")

Hyperparameter Tuning:   0%|                            | 0/108 [00:09<?, ?it/s]
Hyperparameter Tuning:   1%|▏                | 1/108 [02:19<4:08:58, 139.62s/it]

for params {'contamination': 0.01, 'n_clusters': 3, 'n_estimators': 10, 'variance_ratio': 0.9} got the score of : 0.03759970116863179


Hyperparameter Tuning:   2%|▎                | 2/108 [04:39<4:06:48, 139.70s/it]

for params {'contamination': 0.01, 'n_clusters': 3, 'n_estimators': 10, 'variance_ratio': 0.95} got the score of : 0.03759970116863179


Hyperparameter Tuning:   3%|▍                | 3/108 [07:02<4:07:06, 141.21s/it]

for params {'contamination': 0.01, 'n_clusters': 3, 'n_estimators': 25, 'variance_ratio': 0.9} got the score of : 0.026418468769621666


Hyperparameter Tuning:   4%|▋                | 4/108 [09:21<4:03:21, 140.39s/it]

for params {'contamination': 0.01, 'n_clusters': 3, 'n_estimators': 25, 'variance_ratio': 0.95} got the score of : 0.026418468769621666


Hyperparameter Tuning:   5%|▊                | 5/108 [11:42<4:01:16, 140.54s/it]

for params {'contamination': 0.01, 'n_clusters': 3, 'n_estimators': 50, 'variance_ratio': 0.9} got the score of : 0.02497604306045994


Hyperparameter Tuning:   6%|▉                | 6/108 [14:02<3:58:42, 140.42s/it]

for params {'contamination': 0.01, 'n_clusters': 3, 'n_estimators': 50, 'variance_ratio': 0.95} got the score of : 0.02497604306045994


Hyperparameter Tuning:   6%|█                | 7/108 [16:20<3:55:17, 139.78s/it]

for params {'contamination': 0.01, 'n_clusters': 5, 'n_estimators': 10, 'variance_ratio': 0.9} got the score of : 0.03759970116863179


Hyperparameter Tuning:   7%|█▎               | 8/108 [18:39<3:52:20, 139.41s/it]

for params {'contamination': 0.01, 'n_clusters': 5, 'n_estimators': 10, 'variance_ratio': 0.95} got the score of : 0.03759970116863179


Hyperparameter Tuning:   8%|█▍               | 9/108 [20:58<3:49:52, 139.32s/it]

for params {'contamination': 0.01, 'n_clusters': 5, 'n_estimators': 25, 'variance_ratio': 0.9} got the score of : 0.026418468769621666


Hyperparameter Tuning:   9%|█▍              | 10/108 [23:17<3:47:27, 139.26s/it]

for params {'contamination': 0.01, 'n_clusters': 5, 'n_estimators': 25, 'variance_ratio': 0.95} got the score of : 0.026418468769621666


Hyperparameter Tuning:  10%|█▋              | 11/108 [25:38<3:45:45, 139.65s/it]

for params {'contamination': 0.01, 'n_clusters': 5, 'n_estimators': 50, 'variance_ratio': 0.9} got the score of : 0.02497604306045994


Hyperparameter Tuning:  11%|█▊              | 12/108 [27:58<3:43:41, 139.81s/it]

for params {'contamination': 0.01, 'n_clusters': 5, 'n_estimators': 50, 'variance_ratio': 0.95} got the score of : 0.02497604306045994


Hyperparameter Tuning:  12%|█▉              | 13/108 [30:16<3:40:36, 139.33s/it]

for params {'contamination': 0.01, 'n_clusters': 6, 'n_estimators': 10, 'variance_ratio': 0.9} got the score of : 0.03759970116863179


Hyperparameter Tuning:  13%|██              | 14/108 [32:35<3:37:51, 139.06s/it]

for params {'contamination': 0.01, 'n_clusters': 6, 'n_estimators': 10, 'variance_ratio': 0.95} got the score of : 0.03759970116863179


Hyperparameter Tuning:  14%|██▏             | 15/108 [34:54<3:35:42, 139.16s/it]

for params {'contamination': 0.01, 'n_clusters': 6, 'n_estimators': 25, 'variance_ratio': 0.9} got the score of : 0.026418468769621666


Hyperparameter Tuning:  15%|██▎             | 16/108 [37:14<3:33:35, 139.30s/it]

for params {'contamination': 0.01, 'n_clusters': 6, 'n_estimators': 25, 'variance_ratio': 0.95} got the score of : 0.026418468769621666


Hyperparameter Tuning:  16%|██▌             | 17/108 [39:34<3:31:41, 139.57s/it]

for params {'contamination': 0.01, 'n_clusters': 6, 'n_estimators': 50, 'variance_ratio': 0.9} got the score of : 0.02497604306045994


Hyperparameter Tuning:  17%|██▋             | 18/108 [41:55<3:29:59, 139.99s/it]

for params {'contamination': 0.01, 'n_clusters': 6, 'n_estimators': 50, 'variance_ratio': 0.95} got the score of : 0.02497604306045994


Hyperparameter Tuning:  18%|██▊             | 19/108 [44:14<3:27:03, 139.59s/it]

for params {'contamination': 0.01, 'n_clusters': 8, 'n_estimators': 10, 'variance_ratio': 0.9} got the score of : 0.03759970116863179


Hyperparameter Tuning:  19%|██▉             | 20/108 [46:32<3:24:18, 139.30s/it]

for params {'contamination': 0.01, 'n_clusters': 8, 'n_estimators': 10, 'variance_ratio': 0.95} got the score of : 0.03759970116863179


Hyperparameter Tuning:  19%|███             | 21/108 [48:55<3:23:27, 140.31s/it]

for params {'contamination': 0.01, 'n_clusters': 8, 'n_estimators': 25, 'variance_ratio': 0.9} got the score of : 0.026418468769621666


Hyperparameter Tuning:  20%|███▎            | 22/108 [51:14<3:20:38, 139.98s/it]

for params {'contamination': 0.01, 'n_clusters': 8, 'n_estimators': 25, 'variance_ratio': 0.95} got the score of : 0.026418468769621666


Hyperparameter Tuning:  21%|███▍            | 23/108 [53:34<3:18:27, 140.09s/it]

for params {'contamination': 0.01, 'n_clusters': 8, 'n_estimators': 50, 'variance_ratio': 0.9} got the score of : 0.02497604306045994


Hyperparameter Tuning:  22%|███▌            | 24/108 [55:55<3:16:21, 140.26s/it]

for params {'contamination': 0.01, 'n_clusters': 8, 'n_estimators': 50, 'variance_ratio': 0.95} got the score of : 0.02497604306045994


Hyperparameter Tuning:  23%|███▋            | 25/108 [58:14<3:13:22, 139.79s/it]

for params {'contamination': 0.01, 'n_clusters': 9, 'n_estimators': 10, 'variance_ratio': 0.9} got the score of : 0.03759970116863179


Hyperparameter Tuning:  24%|███▎          | 26/108 [1:00:32<3:10:31, 139.41s/it]

for params {'contamination': 0.01, 'n_clusters': 9, 'n_estimators': 10, 'variance_ratio': 0.95} got the score of : 0.03759970116863179


Hyperparameter Tuning:  25%|███▌          | 27/108 [1:02:51<3:08:05, 139.33s/it]

for params {'contamination': 0.01, 'n_clusters': 9, 'n_estimators': 25, 'variance_ratio': 0.9} got the score of : 0.026418468769621666


Hyperparameter Tuning:  26%|███▋          | 28/108 [1:05:11<3:05:52, 139.40s/it]

for params {'contamination': 0.01, 'n_clusters': 9, 'n_estimators': 25, 'variance_ratio': 0.95} got the score of : 0.026418468769621666


Hyperparameter Tuning:  27%|███▊          | 29/108 [1:07:33<3:04:35, 140.20s/it]

for params {'contamination': 0.01, 'n_clusters': 9, 'n_estimators': 50, 'variance_ratio': 0.9} got the score of : 0.02497604306045994


Hyperparameter Tuning:  28%|███▉          | 30/108 [1:09:53<3:02:17, 140.22s/it]

for params {'contamination': 0.01, 'n_clusters': 9, 'n_estimators': 50, 'variance_ratio': 0.95} got the score of : 0.02497604306045994


Hyperparameter Tuning:  29%|████          | 31/108 [1:12:12<2:59:24, 139.80s/it]

for params {'contamination': 0.01, 'n_clusters': 10, 'n_estimators': 10, 'variance_ratio': 0.9} got the score of : 0.03759970116863179


Hyperparameter Tuning:  30%|████▏         | 32/108 [1:14:31<2:56:36, 139.42s/it]

for params {'contamination': 0.01, 'n_clusters': 10, 'n_estimators': 10, 'variance_ratio': 0.95} got the score of : 0.03759970116863179


Hyperparameter Tuning:  31%|████▎         | 33/108 [1:16:50<2:54:12, 139.36s/it]

for params {'contamination': 0.01, 'n_clusters': 10, 'n_estimators': 25, 'variance_ratio': 0.9} got the score of : 0.026418468769621666


Hyperparameter Tuning:  31%|████▍         | 34/108 [1:19:09<2:51:56, 139.41s/it]

for params {'contamination': 0.01, 'n_clusters': 10, 'n_estimators': 25, 'variance_ratio': 0.95} got the score of : 0.026418468769621666


Hyperparameter Tuning:  32%|████▌         | 35/108 [1:21:30<2:49:57, 139.69s/it]

for params {'contamination': 0.01, 'n_clusters': 10, 'n_estimators': 50, 'variance_ratio': 0.9} got the score of : 0.02497604306045994


Hyperparameter Tuning:  33%|████▋         | 36/108 [1:23:51<2:48:03, 140.05s/it]

for params {'contamination': 0.01, 'n_clusters': 10, 'n_estimators': 50, 'variance_ratio': 0.95} got the score of : 0.02497604306045994


Hyperparameter Tuning:  34%|████▊         | 37/108 [1:26:00<2:41:53, 136.81s/it]

for params {'contamination': 0.05, 'n_clusters': 3, 'n_estimators': 10, 'variance_ratio': 0.9} got the score of : 0.02413556449601564


Hyperparameter Tuning:  35%|████▉         | 38/108 [1:28:09<2:36:55, 134.51s/it]

for params {'contamination': 0.05, 'n_clusters': 3, 'n_estimators': 10, 'variance_ratio': 0.95} got the score of : 0.02413556449601564


Hyperparameter Tuning:  36%|█████         | 39/108 [1:30:18<2:32:44, 132.82s/it]

for params {'contamination': 0.05, 'n_clusters': 3, 'n_estimators': 25, 'variance_ratio': 0.9} got the score of : 0.044316143284190826


Hyperparameter Tuning:  37%|█████▏        | 40/108 [1:32:27<2:29:13, 131.66s/it]

for params {'contamination': 0.05, 'n_clusters': 3, 'n_estimators': 25, 'variance_ratio': 0.95} got the score of : 0.044316143284190826


Hyperparameter Tuning:  38%|█████▎        | 41/108 [1:34:36<2:26:15, 130.98s/it]

for params {'contamination': 0.05, 'n_clusters': 3, 'n_estimators': 50, 'variance_ratio': 0.9} got the score of : 0.0428097932666039


Hyperparameter Tuning:  39%|█████▍        | 42/108 [1:36:46<2:23:40, 130.61s/it]

for params {'contamination': 0.05, 'n_clusters': 3, 'n_estimators': 50, 'variance_ratio': 0.95} got the score of : 0.0428097932666039


Hyperparameter Tuning:  40%|█████▌        | 43/108 [1:38:55<2:21:01, 130.18s/it]

for params {'contamination': 0.05, 'n_clusters': 5, 'n_estimators': 10, 'variance_ratio': 0.9} got the score of : 0.02413556449601564


Hyperparameter Tuning:  41%|█████▋        | 44/108 [1:41:04<2:18:28, 129.82s/it]

for params {'contamination': 0.05, 'n_clusters': 5, 'n_estimators': 10, 'variance_ratio': 0.95} got the score of : 0.02413556449601564


Hyperparameter Tuning:  42%|█████▊        | 45/108 [1:43:13<2:16:02, 129.57s/it]

for params {'contamination': 0.05, 'n_clusters': 5, 'n_estimators': 25, 'variance_ratio': 0.9} got the score of : 0.044316143284190826


Hyperparameter Tuning:  43%|█████▉        | 46/108 [1:45:22<2:13:40, 129.37s/it]

for params {'contamination': 0.05, 'n_clusters': 5, 'n_estimators': 25, 'variance_ratio': 0.95} got the score of : 0.044316143284190826


Hyperparameter Tuning:  44%|██████        | 47/108 [1:47:31<2:11:31, 129.37s/it]

for params {'contamination': 0.05, 'n_clusters': 5, 'n_estimators': 50, 'variance_ratio': 0.9} got the score of : 0.0428097932666039


Hyperparameter Tuning:  44%|██████▏       | 48/108 [1:49:41<2:09:25, 129.42s/it]

for params {'contamination': 0.05, 'n_clusters': 5, 'n_estimators': 50, 'variance_ratio': 0.95} got the score of : 0.0428097932666039


Hyperparameter Tuning:  45%|██████▎       | 49/108 [1:51:50<2:07:08, 129.30s/it]

for params {'contamination': 0.05, 'n_clusters': 6, 'n_estimators': 10, 'variance_ratio': 0.9} got the score of : 0.02413556449601564


Hyperparameter Tuning:  46%|██████▍       | 50/108 [1:53:59<2:04:51, 129.17s/it]

for params {'contamination': 0.05, 'n_clusters': 6, 'n_estimators': 10, 'variance_ratio': 0.95} got the score of : 0.02413556449601564


Hyperparameter Tuning:  47%|██████▌       | 51/108 [1:56:08<2:02:43, 129.19s/it]

for params {'contamination': 0.05, 'n_clusters': 6, 'n_estimators': 25, 'variance_ratio': 0.9} got the score of : 0.044316143284190826


Hyperparameter Tuning:  48%|██████▋       | 52/108 [1:58:17<2:00:31, 129.13s/it]

for params {'contamination': 0.05, 'n_clusters': 6, 'n_estimators': 25, 'variance_ratio': 0.95} got the score of : 0.044316143284190826


Hyperparameter Tuning:  49%|██████▊       | 53/108 [2:00:26<1:58:23, 129.16s/it]

for params {'contamination': 0.05, 'n_clusters': 6, 'n_estimators': 50, 'variance_ratio': 0.9} got the score of : 0.0428097932666039


Hyperparameter Tuning:  50%|███████       | 54/108 [2:02:36<1:56:16, 129.19s/it]

for params {'contamination': 0.05, 'n_clusters': 6, 'n_estimators': 50, 'variance_ratio': 0.95} got the score of : 0.0428097932666039


Hyperparameter Tuning:  51%|███████▏      | 55/108 [2:04:45<1:54:09, 129.23s/it]

for params {'contamination': 0.05, 'n_clusters': 8, 'n_estimators': 10, 'variance_ratio': 0.9} got the score of : 0.02413556449601564


Hyperparameter Tuning:  52%|███████▎      | 56/108 [2:06:55<1:52:18, 129.59s/it]

for params {'contamination': 0.05, 'n_clusters': 8, 'n_estimators': 10, 'variance_ratio': 0.95} got the score of : 0.02413556449601564


Hyperparameter Tuning:  53%|███████▍      | 57/108 [2:09:04<1:49:57, 129.37s/it]

for params {'contamination': 0.05, 'n_clusters': 8, 'n_estimators': 25, 'variance_ratio': 0.9} got the score of : 0.044316143284190826


Hyperparameter Tuning:  54%|███████▌      | 58/108 [2:11:14<1:47:47, 129.36s/it]

for params {'contamination': 0.05, 'n_clusters': 8, 'n_estimators': 25, 'variance_ratio': 0.95} got the score of : 0.044316143284190826


Hyperparameter Tuning:  55%|███████▋      | 59/108 [2:13:23<1:45:38, 129.36s/it]

for params {'contamination': 0.05, 'n_clusters': 8, 'n_estimators': 50, 'variance_ratio': 0.9} got the score of : 0.0428097932666039


Hyperparameter Tuning:  56%|███████▊      | 60/108 [2:15:32<1:43:28, 129.35s/it]

for params {'contamination': 0.05, 'n_clusters': 8, 'n_estimators': 50, 'variance_ratio': 0.95} got the score of : 0.0428097932666039


Hyperparameter Tuning:  56%|███████▉      | 61/108 [2:17:41<1:41:12, 129.19s/it]

for params {'contamination': 0.05, 'n_clusters': 9, 'n_estimators': 10, 'variance_ratio': 0.9} got the score of : 0.02413556449601564


Hyperparameter Tuning:  57%|████████      | 62/108 [2:19:50<1:38:58, 129.11s/it]

for params {'contamination': 0.05, 'n_clusters': 9, 'n_estimators': 10, 'variance_ratio': 0.95} got the score of : 0.02413556449601564


Hyperparameter Tuning:  58%|████████▏     | 63/108 [2:21:59<1:36:47, 129.06s/it]

for params {'contamination': 0.05, 'n_clusters': 9, 'n_estimators': 25, 'variance_ratio': 0.9} got the score of : 0.044316143284190826


Hyperparameter Tuning:  59%|████████▎     | 64/108 [2:24:08<1:34:42, 129.15s/it]

for params {'contamination': 0.05, 'n_clusters': 9, 'n_estimators': 25, 'variance_ratio': 0.95} got the score of : 0.044316143284190826


Hyperparameter Tuning:  60%|████████▍     | 65/108 [2:26:28<1:34:52, 132.38s/it]

for params {'contamination': 0.05, 'n_clusters': 9, 'n_estimators': 50, 'variance_ratio': 0.9} got the score of : 0.0428097932666039


Hyperparameter Tuning:  61%|████████▌     | 66/108 [2:28:47<1:34:02, 134.35s/it]

for params {'contamination': 0.05, 'n_clusters': 9, 'n_estimators': 50, 'variance_ratio': 0.95} got the score of : 0.0428097932666039


Hyperparameter Tuning:  62%|████████▋     | 67/108 [2:31:12<1:33:54, 137.43s/it]

for params {'contamination': 0.05, 'n_clusters': 10, 'n_estimators': 10, 'variance_ratio': 0.9} got the score of : 0.02413556449601564


Hyperparameter Tuning:  63%|████████▊     | 68/108 [2:33:37<1:33:06, 139.66s/it]

for params {'contamination': 0.05, 'n_clusters': 10, 'n_estimators': 10, 'variance_ratio': 0.95} got the score of : 0.02413556449601564


Hyperparameter Tuning:  64%|████████▉     | 69/108 [2:35:57<1:30:54, 139.85s/it]

for params {'contamination': 0.05, 'n_clusters': 10, 'n_estimators': 25, 'variance_ratio': 0.9} got the score of : 0.044316143284190826


Hyperparameter Tuning:  65%|█████████     | 70/108 [2:38:13<1:27:46, 138.60s/it]

for params {'contamination': 0.05, 'n_clusters': 10, 'n_estimators': 25, 'variance_ratio': 0.95} got the score of : 0.044316143284190826


Hyperparameter Tuning:  66%|█████████▏    | 71/108 [2:40:28<1:24:55, 137.72s/it]

for params {'contamination': 0.05, 'n_clusters': 10, 'n_estimators': 50, 'variance_ratio': 0.9} got the score of : 0.0428097932666039


Hyperparameter Tuning:  67%|█████████▎    | 72/108 [2:42:44<1:22:20, 137.24s/it]

for params {'contamination': 0.05, 'n_clusters': 10, 'n_estimators': 50, 'variance_ratio': 0.95} got the score of : 0.0428097932666039


Hyperparameter Tuning:  68%|█████████▍    | 73/108 [2:44:44<1:16:56, 131.89s/it]

for params {'contamination': 0.1, 'n_clusters': 3, 'n_estimators': 10, 'variance_ratio': 0.9} got the score of : 0.03848485324111219


Hyperparameter Tuning:  69%|█████████▌    | 74/108 [2:46:49<1:13:39, 129.99s/it]

for params {'contamination': 0.1, 'n_clusters': 3, 'n_estimators': 10, 'variance_ratio': 0.95} got the score of : 0.03848485324111219


Hyperparameter Tuning:  69%|█████████▋    | 75/108 [2:49:00<1:11:33, 130.12s/it]

for params {'contamination': 0.1, 'n_clusters': 3, 'n_estimators': 25, 'variance_ratio': 0.9} got the score of : 0.04277768598735495


Hyperparameter Tuning:  70%|█████████▊    | 76/108 [2:51:05<1:08:36, 128.65s/it]

for params {'contamination': 0.1, 'n_clusters': 3, 'n_estimators': 25, 'variance_ratio': 0.95} got the score of : 0.04277768598735495


Hyperparameter Tuning:  71%|█████████▉    | 77/108 [2:53:12<1:06:16, 128.26s/it]

for params {'contamination': 0.1, 'n_clusters': 3, 'n_estimators': 50, 'variance_ratio': 0.9} got the score of : 0.057534251634545786


Hyperparameter Tuning:  72%|██████████    | 78/108 [2:55:28<1:05:18, 130.63s/it]

for params {'contamination': 0.1, 'n_clusters': 3, 'n_estimators': 50, 'variance_ratio': 0.95} got the score of : 0.057534251634545786


Hyperparameter Tuning:  73%|██████████▏   | 79/108 [2:57:37<1:02:54, 130.14s/it]

for params {'contamination': 0.1, 'n_clusters': 5, 'n_estimators': 10, 'variance_ratio': 0.9} got the score of : 0.03848485324111219


Hyperparameter Tuning:  74%|███████████▊    | 80/108 [2:59:39<59:29, 127.50s/it]

for params {'contamination': 0.1, 'n_clusters': 5, 'n_estimators': 10, 'variance_ratio': 0.95} got the score of : 0.03848485324111219


Hyperparameter Tuning:  75%|████████████    | 81/108 [3:01:36<55:58, 124.39s/it]

for params {'contamination': 0.1, 'n_clusters': 5, 'n_estimators': 25, 'variance_ratio': 0.9} got the score of : 0.04277768598735495


Hyperparameter Tuning:  76%|████████████▏   | 82/108 [3:03:29<52:27, 121.06s/it]

for params {'contamination': 0.1, 'n_clusters': 5, 'n_estimators': 25, 'variance_ratio': 0.95} got the score of : 0.04277768598735495


Hyperparameter Tuning:  77%|████████████▎   | 83/108 [3:05:22<49:21, 118.47s/it]

for params {'contamination': 0.1, 'n_clusters': 5, 'n_estimators': 50, 'variance_ratio': 0.9} got the score of : 0.057534251634545786


Hyperparameter Tuning:  78%|████████████▍   | 84/108 [3:07:16<46:50, 117.11s/it]

for params {'contamination': 0.1, 'n_clusters': 5, 'n_estimators': 50, 'variance_ratio': 0.95} got the score of : 0.057534251634545786


Hyperparameter Tuning:  79%|████████████▌   | 85/108 [3:09:06<44:08, 115.17s/it]

for params {'contamination': 0.1, 'n_clusters': 6, 'n_estimators': 10, 'variance_ratio': 0.9} got the score of : 0.03848485324111219


Hyperparameter Tuning:  80%|████████████▋   | 86/108 [3:10:57<41:46, 113.94s/it]

for params {'contamination': 0.1, 'n_clusters': 6, 'n_estimators': 10, 'variance_ratio': 0.95} got the score of : 0.03848485324111219


Hyperparameter Tuning:  81%|████████████▉   | 87/108 [3:12:49<39:41, 113.40s/it]

for params {'contamination': 0.1, 'n_clusters': 6, 'n_estimators': 25, 'variance_ratio': 0.9} got the score of : 0.04277768598735495


Hyperparameter Tuning:  81%|█████████████   | 88/108 [3:14:41<37:34, 112.71s/it]

for params {'contamination': 0.1, 'n_clusters': 6, 'n_estimators': 25, 'variance_ratio': 0.95} got the score of : 0.04277768598735495


Hyperparameter Tuning:  82%|█████████████▏  | 89/108 [3:16:33<35:40, 112.64s/it]

for params {'contamination': 0.1, 'n_clusters': 6, 'n_estimators': 50, 'variance_ratio': 0.9} got the score of : 0.057534251634545786


Hyperparameter Tuning:  83%|█████████████▎  | 90/108 [3:18:26<33:47, 112.65s/it]

for params {'contamination': 0.1, 'n_clusters': 6, 'n_estimators': 50, 'variance_ratio': 0.95} got the score of : 0.057534251634545786


Hyperparameter Tuning:  84%|█████████████▍  | 91/108 [3:20:16<31:45, 112.08s/it]

for params {'contamination': 0.1, 'n_clusters': 8, 'n_estimators': 10, 'variance_ratio': 0.9} got the score of : 0.03848485324111219


Hyperparameter Tuning:  85%|█████████████▋  | 92/108 [3:22:07<29:47, 111.69s/it]

for params {'contamination': 0.1, 'n_clusters': 8, 'n_estimators': 10, 'variance_ratio': 0.95} got the score of : 0.03848485324111219


Hyperparameter Tuning:  86%|█████████████▊  | 93/108 [3:23:59<27:55, 111.72s/it]

for params {'contamination': 0.1, 'n_clusters': 8, 'n_estimators': 25, 'variance_ratio': 0.9} got the score of : 0.04277768598735495


Hyperparameter Tuning:  87%|█████████████▉  | 94/108 [3:25:50<26:02, 111.58s/it]

for params {'contamination': 0.1, 'n_clusters': 8, 'n_estimators': 25, 'variance_ratio': 0.95} got the score of : 0.04277768598735495


Hyperparameter Tuning:  88%|██████████████  | 95/108 [3:27:43<24:15, 111.97s/it]

for params {'contamination': 0.1, 'n_clusters': 8, 'n_estimators': 50, 'variance_ratio': 0.9} got the score of : 0.057534251634545786


Hyperparameter Tuning:  89%|██████████████▏ | 96/108 [3:29:35<22:24, 112.03s/it]

for params {'contamination': 0.1, 'n_clusters': 8, 'n_estimators': 50, 'variance_ratio': 0.95} got the score of : 0.057534251634545786


Hyperparameter Tuning:  90%|██████████████▎ | 97/108 [3:31:26<20:28, 111.65s/it]

for params {'contamination': 0.1, 'n_clusters': 9, 'n_estimators': 10, 'variance_ratio': 0.9} got the score of : 0.03848485324111219


Hyperparameter Tuning:  91%|██████████████▌ | 98/108 [3:33:17<18:33, 111.36s/it]

for params {'contamination': 0.1, 'n_clusters': 9, 'n_estimators': 10, 'variance_ratio': 0.95} got the score of : 0.03848485324111219


Hyperparameter Tuning:  92%|██████████████▋ | 99/108 [3:35:09<16:43, 111.51s/it]

for params {'contamination': 0.1, 'n_clusters': 9, 'n_estimators': 25, 'variance_ratio': 0.9} got the score of : 0.04277768598735495


Hyperparameter Tuning:  93%|█████████████▉ | 100/108 [3:37:00<14:52, 111.52s/it]

for params {'contamination': 0.1, 'n_clusters': 9, 'n_estimators': 25, 'variance_ratio': 0.95} got the score of : 0.04277768598735495


Hyperparameter Tuning:  94%|██████████████ | 101/108 [3:38:53<13:02, 111.83s/it]

for params {'contamination': 0.1, 'n_clusters': 9, 'n_estimators': 50, 'variance_ratio': 0.9} got the score of : 0.057534251634545786


Hyperparameter Tuning:  94%|██████████████▏| 102/108 [3:40:45<11:12, 112.02s/it]

for params {'contamination': 0.1, 'n_clusters': 9, 'n_estimators': 50, 'variance_ratio': 0.95} got the score of : 0.057534251634545786


Hyperparameter Tuning:  95%|██████████████▎| 103/108 [3:42:36<09:18, 111.71s/it]

for params {'contamination': 0.1, 'n_clusters': 10, 'n_estimators': 10, 'variance_ratio': 0.9} got the score of : 0.03848485324111219


Hyperparameter Tuning:  96%|██████████████▍| 104/108 [3:44:27<07:25, 111.45s/it]

for params {'contamination': 0.1, 'n_clusters': 10, 'n_estimators': 10, 'variance_ratio': 0.95} got the score of : 0.03848485324111219


Hyperparameter Tuning:  97%|██████████████▌| 105/108 [3:46:19<05:34, 111.53s/it]

for params {'contamination': 0.1, 'n_clusters': 10, 'n_estimators': 25, 'variance_ratio': 0.9} got the score of : 0.04277768598735495


Hyperparameter Tuning:  98%|██████████████▋| 106/108 [3:48:11<03:43, 111.67s/it]

for params {'contamination': 0.1, 'n_clusters': 10, 'n_estimators': 25, 'variance_ratio': 0.95} got the score of : 0.04277768598735495


Hyperparameter Tuning:  99%|██████████████▊| 107/108 [3:50:03<01:51, 111.91s/it]

for params {'contamination': 0.1, 'n_clusters': 10, 'n_estimators': 50, 'variance_ratio': 0.9} got the score of : 0.057534251634545786


Hyperparameter Tuning: 100%|███████████████| 108/108 [3:51:56<00:00, 128.86s/it]

for params {'contamination': 0.1, 'n_clusters': 10, 'n_estimators': 50, 'variance_ratio': 0.95} got the score of : 0.057534251634545786
Best score: 0.057534251634545786
Best parameters: {'contamination': 0.1, 'n_clusters': 3, 'n_estimators': 50, 'variance_ratio': 0.9}





In [123]:
tuner = kmeans_pipe_hyper_parameter_tuner(50, 0.1,0.9,3 , df=no_missing, random_state=42)
tuner.fit()
best_k_means_model = tuner.model

now lets tune dbscan hyper parameters

In [133]:
class dbscan_pipe_hyper_parameter_tuner:
    
    def __init__(self, n_estimators, contamination, variance_ratio, eps, mean_samples, df, random_state):
        self.n_estimators = n_estimators
        self.contamination = contamination
        self.variance_ratio = variance_ratio
        self.eps = eps
        self.mean_samples = mean_samples
        self.df = df
        self.random_state=random_state
        
    def fit(self):
        df_no_outliers, df_outliers = detect_outliers(self.df, self.n_estimators, self.contamination, self.random_state)
        df_normalized = normalize_features(df_no_outliers)
        df_pca = apply_pca(df_normalized, self.variance_ratio, self.random_state)
        dbscan_df , dbscan_model = fit_dbscan(df_pca , self.eps, self.mean_samples)
        self.model = dbscan_model
        self.PCA_df = df_pca

In [None]:
param_grid = {
    'n_estimators': [25, 50],
    'contamination': [0.05, 0.1],
    'variance_ratio': [0.9, 0.95],
    'eps': [0.2, 0.3, 0.4],
    'mean_samples' : [10, 15, 20]
}

grid = ParameterGrid(param_grid)

best_score = -1
best_params = None

pbar = tqdm(total=len(grid), desc='Hyperparameter Tuning', ncols=80)

for params in grid:
    tuner = dbscan_pipe_hyper_parameter_tuner(**params, df=no_missing, random_state=42)
    tuner.fit()
    score = silhouette_score(tuner.PCA_df, tuner.model.labels_)
    
    print (f'for params {params} got the score of : {score}')
    
    if score > best_score:
        best_score = score
        best_params = params

    pbar.update()

pbar.close()

print(f"Best score: {best_score}")
print(f"Best parameters: {best_params}")

In [172]:
from tqdm import tqdm
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import silhouette_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_samples
import numpy as np

class kmeans_pipe_hyper_parameter_tuner:
    
    def __init__(self, n_estimators, contamination, variance_ratio, n_clusters, df, random_state):
        self.n_estimators = n_estimators
        self.contamination = contamination
        self.variance_ratio = variance_ratio
        self.n_clusters = n_clusters
        self.df = df
        self.random_state=random_state
        
    def fit(self):
        df_no_outliers, df_outliers = detect_outliers(self.df, self.n_estimators, self.contamination, self.random_state)
        df_normalized = normalize_features(df_no_outliers)
        df_pca = apply_pca(df_normalized, variance_ratio = self.variance_ratio, random_state = self.random_state)
        K_means_df , K_means_modle = fit_kmeans(df_pca , n_clusters = self.n_clusters, random_state = self.random_state)
        self.model = K_means_modle
        self.PCA_df = df_pca
        self.labels_ = self.model.labels_

    def sample_silhouette_score(self, sample_percentage):
        unique_labels = np.unique(self.labels_)
        sample_indices = []

        for label in unique_labels:
            cluster_indices = np.where(self.labels_ == label)[0]
            sample_size = int(len(cluster_indices) * sample_percentage)
            if cluster_indices.size > 0:
                sample_indices.extend(np.random.choice(cluster_indices, size=min(sample_size, cluster_indices.size), replace=False))

        sample_labels = self.labels_[sample_indices]
        sample_data = self.PCA_df.reset_index(drop=True).iloc[sample_indices]

        return silhouette_score(sample_data, sample_labels)

    def cv_silhouette_score(self, sample_percentage, cv):
        scores = []
        for _ in range(cv):
            score = self.sample_silhouette_score(sample_percentage)
            scores.append(score)
        return np.mean(scores)

from sklearn.model_selection import ParameterGrid
from sklearn.metrics import silhouette_score
from tqdm import tqdm

param_grid = {
    'n_estimators': [50,100],
    'contamination': [0.1, 0.15, 0.20],
    'variance_ratio': [3, 5],
    'n_clusters': [3, 4, 5, 6]
}

grid = ParameterGrid(param_grid)

best_score = -1
best_params = None

pbar = tqdm(total=len(grid), desc='Hyperparameter Tuning', ncols=80)

for params in grid:
    tuner = kmeans_pipe_hyper_parameter_tuner(**params, df=no_missing, random_state=1)
    tuner.fit()
    score = tuner.cv_silhouette_score(sample_percentage=0.1, cv=3)  # Adjust sample percentage and cv as needed

    print (f'for params {params} got the score of : {score}')

    if score > best_score:
        best_score = score
        best_params = params

    pbar.update()

pbar.close()

print(f"Best score: {best_score}")
print(f"Best parameters: {best_params}")


Hyperparameter Tuning:  38%|███████▌            | 18/48 [02:14<03:43,  7.46s/it]


in outlier detection n_estimators is 50 and the contamination is 0.1
in pca the variance ratio is 3
Total explained variance: 0.9999999999850945
in k-means the n_clusters is 3




for params {'contamination': 0.1, 'n_clusters': 3, 'n_estimators': 50, 'variance_ratio': 3} got the score of : 0.273004092007024
in outlier detection n_estimators is 50 and the contamination is 0.1
in pca the variance ratio is 5
Total explained variance: 0.9999999999999462
in k-means the n_clusters is 3




for params {'contamination': 0.1, 'n_clusters': 3, 'n_estimators': 50, 'variance_ratio': 5} got the score of : 0.16071018048520405
in outlier detection n_estimators is 100 and the contamination is 0.1
in pca the variance ratio is 3
Total explained variance: 0.9999999999852016
in k-means the n_clusters is 3




for params {'contamination': 0.1, 'n_clusters': 3, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.2589098171806015
in outlier detection n_estimators is 100 and the contamination is 0.1
in pca the variance ratio is 5
Total explained variance: 0.9999999999999438
in k-means the n_clusters is 3




for params {'contamination': 0.1, 'n_clusters': 3, 'n_estimators': 100, 'variance_ratio': 5} got the score of : 0.17124345212367556
in outlier detection n_estimators is 50 and the contamination is 0.1
in pca the variance ratio is 3
Total explained variance: 0.9999999999850945
in k-means the n_clusters is 4




for params {'contamination': 0.1, 'n_clusters': 4, 'n_estimators': 50, 'variance_ratio': 3} got the score of : 0.3016934227195018
in outlier detection n_estimators is 50 and the contamination is 0.1
in pca the variance ratio is 5
Total explained variance: 0.9999999999999462
in k-means the n_clusters is 4




for params {'contamination': 0.1, 'n_clusters': 4, 'n_estimators': 50, 'variance_ratio': 5} got the score of : 0.1689294695039616
in outlier detection n_estimators is 100 and the contamination is 0.1
in pca the variance ratio is 3
Total explained variance: 0.9999999999852016
in k-means the n_clusters is 4




for params {'contamination': 0.1, 'n_clusters': 4, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.3042435942000006
in outlier detection n_estimators is 100 and the contamination is 0.1
in pca the variance ratio is 5
Total explained variance: 0.9999999999999438
in k-means the n_clusters is 4




for params {'contamination': 0.1, 'n_clusters': 4, 'n_estimators': 100, 'variance_ratio': 5} got the score of : 0.16849893823962978
in outlier detection n_estimators is 50 and the contamination is 0.1
in pca the variance ratio is 3
Total explained variance: 0.9999999999850945
in k-means the n_clusters is 5




for params {'contamination': 0.1, 'n_clusters': 5, 'n_estimators': 50, 'variance_ratio': 3} got the score of : 0.3017918716514542
in outlier detection n_estimators is 50 and the contamination is 0.1
in pca the variance ratio is 5
Total explained variance: 0.9999999999999462
in k-means the n_clusters is 5




for params {'contamination': 0.1, 'n_clusters': 5, 'n_estimators': 50, 'variance_ratio': 5} got the score of : 0.1700567425825276
in outlier detection n_estimators is 100 and the contamination is 0.1
in pca the variance ratio is 3
Total explained variance: 0.9999999999852016
in k-means the n_clusters is 5




for params {'contamination': 0.1, 'n_clusters': 5, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.27946691671647556
in outlier detection n_estimators is 100 and the contamination is 0.1
in pca the variance ratio is 5
Total explained variance: 0.9999999999999438
in k-means the n_clusters is 5




for params {'contamination': 0.1, 'n_clusters': 5, 'n_estimators': 100, 'variance_ratio': 5} got the score of : 0.1727978721545422
in outlier detection n_estimators is 50 and the contamination is 0.1
in pca the variance ratio is 3
Total explained variance: 0.9999999999850945
in k-means the n_clusters is 6




for params {'contamination': 0.1, 'n_clusters': 6, 'n_estimators': 50, 'variance_ratio': 3} got the score of : 0.2723681557401814
in outlier detection n_estimators is 50 and the contamination is 0.1
in pca the variance ratio is 5
Total explained variance: 0.9999999999999462
in k-means the n_clusters is 6




for params {'contamination': 0.1, 'n_clusters': 6, 'n_estimators': 50, 'variance_ratio': 5} got the score of : 0.17656635532971743
in outlier detection n_estimators is 100 and the contamination is 0.1
in pca the variance ratio is 3
Total explained variance: 0.9999999999852016
in k-means the n_clusters is 6




for params {'contamination': 0.1, 'n_clusters': 6, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.2855173693091499
in outlier detection n_estimators is 100 and the contamination is 0.1
in pca the variance ratio is 5
Total explained variance: 0.9999999999999438
in k-means the n_clusters is 6




for params {'contamination': 0.1, 'n_clusters': 6, 'n_estimators': 100, 'variance_ratio': 5} got the score of : 0.1749376634502625
in outlier detection n_estimators is 50 and the contamination is 0.15
in pca the variance ratio is 3
Total explained variance: 0.9999999999853221
in k-means the n_clusters is 3




for params {'contamination': 0.15, 'n_clusters': 3, 'n_estimators': 50, 'variance_ratio': 3} got the score of : 0.2736586177983071
in outlier detection n_estimators is 50 and the contamination is 0.15
in pca the variance ratio is 5
Total explained variance: 0.9999999999999452
in k-means the n_clusters is 3




for params {'contamination': 0.15, 'n_clusters': 3, 'n_estimators': 50, 'variance_ratio': 5} got the score of : 0.14259322566343147
in outlier detection n_estimators is 100 and the contamination is 0.15
in pca the variance ratio is 3
Total explained variance: 0.9999999999854868
in k-means the n_clusters is 3




for params {'contamination': 0.15, 'n_clusters': 3, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.29075773250693954
in outlier detection n_estimators is 100 and the contamination is 0.15


KeyboardInterrupt: 

In [156]:
gc.collect()
tuner = kmeans_pipe_hyper_parameter_tuner(contamination=0.2, n_clusters=4, n_estimators= 100, variance_ratio=3, df=no_missing, random_state=1)
tuner.fit()
score = silhouette_score(tuner.PCA_df, tuner.model.labels_)
print (score)

in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714
in k-means the n_clusters is 4
0.2839700722573416


In [132]:
print (tuner.PCA_df.shape)

(81410, 26)


In [169]:
from tqdm import tqdm
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import silhouette_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_samples
import numpy as np

class dbscan_pipe_hyper_parameter_tuner:
    
    def __init__(self, n_estimators, contamination, variance_ratio, eps , min_samples, df, random_state):
        self.n_estimators = n_estimators
        self.contamination = contamination
        self.variance_ratio = variance_ratio
        self.eps = eps
        self.df = df
        self.random_state=random_state
        self.min_samples = min_samples
        
    def fit(self):
        df_no_outliers, df_outliers = detect_outliers(self.df, self.n_estimators, self.contamination, self.random_state)
        df_normalized = normalize_features(df_no_outliers)
        df_pca = apply_pca(df_normalized, variance_ratio = self.variance_ratio, random_state = self.random_state)
        K_means_df , K_means_modle = fit_dbscan(eps= self.eps, min_samples= self.min_samples, df = df_pca)
        self.model = K_means_modle
        self.PCA_df = df_pca
        self.labels_ = self.model.labels_

    def sample_silhouette_score(self, sample_percentage):
        unique_labels = np.unique(self.labels_)
        if len(unique_labels) < 2:
            return -1

        sample_indices = []
        for label in unique_labels:
            cluster_indices = np.where(self.labels_ == label)[0]
            sample_size = int(len(cluster_indices) * sample_percentage)
            if cluster_indices.size > 0:
                sample_indices.extend(np.random.choice(cluster_indices, size=min(sample_size, cluster_indices.size), replace=False))

        sample_labels = self.labels_[sample_indices]
        sample_data = self.PCA_df.reset_index(drop=True).iloc[sample_indices]

        return silhouette_score(sample_data, sample_labels)

    def cv_silhouette_score(self, sample_percentage, cv):
        scores = []
        for _ in range(cv):
            score = self.sample_silhouette_score(sample_percentage)
            scores.append(score)
        return np.mean(scores)

from sklearn.model_selection import ParameterGrid
from sklearn.metrics import silhouette_score
from tqdm import tqdm

param_grid = {
    'n_estimators': [100],
    'contamination': [0.20],
    'variance_ratio': [3, 5],
    'eps': [0.6, 0.7, 0.8, 0.9],
    'min_samples': [5, 10, 15, 20, 25]
}

grid = ParameterGrid(param_grid)

best_score = -1
best_params = None

pbar = tqdm(total=len(grid), desc='Hyperparameter Tuning', ncols=80)

for params in grid:
    tuner = dbscan_pipe_hyper_parameter_tuner(**params, df=no_missing, random_state=1)
    tuner.fit()
    score = tuner.cv_silhouette_score(sample_percentage=0.1, cv=3)  # Adjust sample percentage and cv as needed

    print (f'for params {params} got the score of : {score}')

    if score > best_score:
        best_score = score
        best_params = params

    pbar.update()

pbar.close()

print(f"Best score: {best_score}")
print(f"Best parameters: {best_params}")


Hyperparameter Tuning:   0%|                             | 0/40 [00:00<?, ?it/s]

in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714


Hyperparameter Tuning:   2%|▌                    | 1/40 [00:14<09:29, 14.60s/it]

for params {'contamination': 0.2, 'eps': 0.6, 'min_samples': 5, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.502369925505771
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 5
Total explained variance: 0.305490242784654


Hyperparameter Tuning:   5%|█                    | 2/40 [00:26<08:22, 13.23s/it]

for params {'contamination': 0.2, 'eps': 0.6, 'min_samples': 5, 'n_estimators': 100, 'variance_ratio': 5} got the score of : 0.04072724439792431
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714


Hyperparameter Tuning:   8%|█▌                   | 3/40 [00:40<08:09, 13.24s/it]

for params {'contamination': 0.2, 'eps': 0.6, 'min_samples': 10, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.4098299120326645
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 5
Total explained variance: 0.305490242784654


Hyperparameter Tuning:  10%|██                   | 4/40 [00:52<07:49, 13.05s/it]

for params {'contamination': 0.2, 'eps': 0.6, 'min_samples': 10, 'n_estimators': 100, 'variance_ratio': 5} got the score of : -0.1254590916520769
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714


Hyperparameter Tuning:  12%|██▋                  | 5/40 [01:05<07:37, 13.07s/it]

for params {'contamination': 0.2, 'eps': 0.6, 'min_samples': 15, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.40054761444783255
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 5
Total explained variance: 0.305490242784654


Hyperparameter Tuning:  15%|███▏                 | 6/40 [01:17<07:11, 12.69s/it]

for params {'contamination': 0.2, 'eps': 0.6, 'min_samples': 15, 'n_estimators': 100, 'variance_ratio': 5} got the score of : -0.09299426962964115
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714


Hyperparameter Tuning:  18%|███▋                 | 7/40 [01:32<07:16, 13.21s/it]

for params {'contamination': 0.2, 'eps': 0.6, 'min_samples': 20, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.3944708682112128
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 5
Total explained variance: 0.305490242784654


Hyperparameter Tuning:  20%|████▏                | 8/40 [01:43<06:43, 12.62s/it]

for params {'contamination': 0.2, 'eps': 0.6, 'min_samples': 20, 'n_estimators': 100, 'variance_ratio': 5} got the score of : 0.006252123156307334
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714


Hyperparameter Tuning:  22%|████▋                | 9/40 [01:55<06:27, 12.51s/it]

for params {'contamination': 0.2, 'eps': 0.6, 'min_samples': 25, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.38625776899242287
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 5
Total explained variance: 0.305490242784654


Hyperparameter Tuning:  25%|█████               | 10/40 [02:07<06:10, 12.34s/it]

for params {'contamination': 0.2, 'eps': 0.6, 'min_samples': 25, 'n_estimators': 100, 'variance_ratio': 5} got the score of : -0.1350772354351658
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714


Hyperparameter Tuning:  28%|█████▌              | 11/40 [02:22<06:15, 12.96s/it]

for params {'contamination': 0.2, 'eps': 0.7, 'min_samples': 5, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.4343153501292452
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 5
Total explained variance: 0.305490242784654


Hyperparameter Tuning:  30%|██████              | 12/40 [02:35<06:09, 13.21s/it]

for params {'contamination': 0.2, 'eps': 0.7, 'min_samples': 5, 'n_estimators': 100, 'variance_ratio': 5} got the score of : 0.09196484001298207
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714


Hyperparameter Tuning:  32%|██████▌             | 13/40 [02:50<06:10, 13.71s/it]

for params {'contamination': 0.2, 'eps': 0.7, 'min_samples': 10, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.4555785776792069
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 5
Total explained variance: 0.305490242784654


Hyperparameter Tuning:  35%|███████             | 14/40 [03:03<05:50, 13.47s/it]

for params {'contamination': 0.2, 'eps': 0.7, 'min_samples': 10, 'n_estimators': 100, 'variance_ratio': 5} got the score of : 0.2369726372018819
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714


Hyperparameter Tuning:  38%|███████▌            | 15/40 [03:18<05:43, 13.76s/it]

for params {'contamination': 0.2, 'eps': 0.7, 'min_samples': 15, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.3916100301977034
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 5
Total explained variance: 0.305490242784654


Hyperparameter Tuning:  40%|████████            | 16/40 [03:31<05:24, 13.52s/it]

for params {'contamination': 0.2, 'eps': 0.7, 'min_samples': 15, 'n_estimators': 100, 'variance_ratio': 5} got the score of : 0.16467526012573785
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714


Hyperparameter Tuning:  42%|████████▌           | 17/40 [03:45<05:18, 13.85s/it]

for params {'contamination': 0.2, 'eps': 0.7, 'min_samples': 20, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.41797265339782635
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 5
Total explained variance: 0.305490242784654


Hyperparameter Tuning:  45%|█████████           | 18/40 [03:59<05:01, 13.71s/it]

for params {'contamination': 0.2, 'eps': 0.7, 'min_samples': 20, 'n_estimators': 100, 'variance_ratio': 5} got the score of : -0.006151850502148566
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714


Hyperparameter Tuning:  48%|█████████▌          | 19/40 [04:15<05:02, 14.41s/it]

for params {'contamination': 0.2, 'eps': 0.7, 'min_samples': 25, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.392101500890945
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 5
Total explained variance: 0.305490242784654


Hyperparameter Tuning:  50%|██████████          | 20/40 [04:30<04:54, 14.74s/it]

for params {'contamination': 0.2, 'eps': 0.7, 'min_samples': 25, 'n_estimators': 100, 'variance_ratio': 5} got the score of : 0.13432665726011161
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714


Hyperparameter Tuning:  52%|██████████▌         | 21/40 [04:48<04:59, 15.74s/it]

for params {'contamination': 0.2, 'eps': 0.8, 'min_samples': 5, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.4004509006054031
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 5
Total explained variance: 0.305490242784654


Hyperparameter Tuning:  55%|███████████         | 22/40 [05:04<04:42, 15.70s/it]

for params {'contamination': 0.2, 'eps': 0.8, 'min_samples': 5, 'n_estimators': 100, 'variance_ratio': 5} got the score of : 0.3056434528508012
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714


Hyperparameter Tuning:  57%|███████████▌        | 23/40 [05:23<04:42, 16.63s/it]

for params {'contamination': 0.2, 'eps': 0.8, 'min_samples': 10, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.41164883075301634
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 5
Total explained variance: 0.305490242784654


Hyperparameter Tuning:  60%|████████████        | 24/40 [05:37<04:17, 16.08s/it]

for params {'contamination': 0.2, 'eps': 0.8, 'min_samples': 10, 'n_estimators': 100, 'variance_ratio': 5} got the score of : 0.28510102737392273
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714


Hyperparameter Tuning:  62%|████████████▌       | 25/40 [05:55<04:08, 16.55s/it]

for params {'contamination': 0.2, 'eps': 0.8, 'min_samples': 15, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.470391136315433
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 5
Total explained variance: 0.305490242784654


Hyperparameter Tuning:  65%|█████████████       | 26/40 [06:10<03:46, 16.16s/it]

for params {'contamination': 0.2, 'eps': 0.8, 'min_samples': 15, 'n_estimators': 100, 'variance_ratio': 5} got the score of : 0.2657004678172216
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714


Hyperparameter Tuning:  68%|█████████████▌      | 27/40 [06:28<03:36, 16.65s/it]

for params {'contamination': 0.2, 'eps': 0.8, 'min_samples': 20, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.48840663593523875
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 5
Total explained variance: 0.305490242784654


Hyperparameter Tuning:  70%|██████████████      | 28/40 [06:45<03:20, 16.69s/it]

for params {'contamination': 0.2, 'eps': 0.8, 'min_samples': 20, 'n_estimators': 100, 'variance_ratio': 5} got the score of : 0.2453241481011185
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714


Hyperparameter Tuning:  72%|██████████████▌     | 29/40 [07:04<03:11, 17.40s/it]

for params {'contamination': 0.2, 'eps': 0.8, 'min_samples': 25, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.44518061469764053
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 5
Total explained variance: 0.305490242784654


Hyperparameter Tuning:  75%|███████████████     | 30/40 [07:22<02:55, 17.51s/it]

for params {'contamination': 0.2, 'eps': 0.8, 'min_samples': 25, 'n_estimators': 100, 'variance_ratio': 5} got the score of : 0.23030244702892166
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714


Hyperparameter Tuning:  78%|███████████████▌    | 31/40 [07:42<02:45, 18.36s/it]

for params {'contamination': 0.2, 'eps': 0.9, 'min_samples': 5, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.47505266775154326
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 5
Total explained variance: 0.305490242784654


Hyperparameter Tuning:  80%|████████████████    | 32/40 [08:00<02:25, 18.16s/it]

for params {'contamination': 0.2, 'eps': 0.9, 'min_samples': 5, 'n_estimators': 100, 'variance_ratio': 5} got the score of : 0.34959830636099204
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714


Hyperparameter Tuning:  82%|████████████████▌   | 33/40 [08:19<02:10, 18.58s/it]

for params {'contamination': 0.2, 'eps': 0.9, 'min_samples': 10, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.45287313256911466
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 5
Total explained variance: 0.305490242784654


Hyperparameter Tuning:  85%|█████████████████   | 34/40 [08:37<01:50, 18.39s/it]

for params {'contamination': 0.2, 'eps': 0.9, 'min_samples': 10, 'n_estimators': 100, 'variance_ratio': 5} got the score of : 0.32118695829519495
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714


Hyperparameter Tuning:  88%|█████████████████▌  | 35/40 [08:56<01:33, 18.64s/it]

for params {'contamination': 0.2, 'eps': 0.9, 'min_samples': 15, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.4093233741752455
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 5
Total explained variance: 0.305490242784654


Hyperparameter Tuning:  90%|██████████████████  | 36/40 [09:14<01:12, 18.23s/it]

for params {'contamination': 0.2, 'eps': 0.9, 'min_samples': 15, 'n_estimators': 100, 'variance_ratio': 5} got the score of : 0.3033996500785667
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714


Hyperparameter Tuning:  92%|██████████████████▌ | 37/40 [09:34<00:56, 18.79s/it]

for params {'contamination': 0.2, 'eps': 0.9, 'min_samples': 20, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.4960497792451874
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 5
Total explained variance: 0.305490242784654


Hyperparameter Tuning:  95%|███████████████████ | 38/40 [09:51<00:36, 18.39s/it]

for params {'contamination': 0.2, 'eps': 0.9, 'min_samples': 20, 'n_estimators': 100, 'variance_ratio': 5} got the score of : 0.2502740654173078
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714


Hyperparameter Tuning:  98%|███████████████████▌| 39/40 [10:11<00:18, 18.66s/it]

for params {'contamination': 0.2, 'eps': 0.9, 'min_samples': 25, 'n_estimators': 100, 'variance_ratio': 3} got the score of : 0.4739176663764398
in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 5
Total explained variance: 0.305490242784654


Hyperparameter Tuning: 100%|████████████████████| 40/40 [10:28<00:00, 15.72s/it]

for params {'contamination': 0.2, 'eps': 0.9, 'min_samples': 25, 'n_estimators': 100, 'variance_ratio': 5} got the score of : 0.27872941144193697
Best score: 0.502369925505771
Best parameters: {'contamination': 0.2, 'eps': 0.6, 'min_samples': 5, 'n_estimators': 100, 'variance_ratio': 3}





In [168]:
gc.collect()
tuner = dbscan_pipe_hyper_parameter_tuner(contamination=0.2, min_samples=10, n_estimators= 100, variance_ratio=3, eps=0.6, df=no_missing, random_state=1)
tuner.fit()
score = silhouette_score(tuner.PCA_df, tuner.model.labels_)
print (score)

in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714
0.41858500490492595


In [170]:
gc.collect()
tuner = dbscan_pipe_hyper_parameter_tuner(contamination=0.2, min_samples=5, n_estimators= 100, variance_ratio=3, eps=0.6, df=no_missing, random_state=1)
tuner.fit()
score = silhouette_score(tuner.PCA_df, tuner.model.labels_)
print (score)

in outlier detection n_estimators is 100 and the contamination is 0.2
in pca the variance ratio is 3
Total explained variance: 0.21385297021064714
0.4346444778621236
