In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import joblib

In [4]:
#Installing Packages
#pip install xgboost
#!pip install joblib

In [5]:
data = pd.read_csv('data_clusters.csv')
data

Unnamed: 0,Sensor-1,Sensor-2,Sensor-3,Sensor-4,Sensor-5,Sensor-7,Sensor-8,Sensor-9,Sensor-10,Sensor-11,...,Sensor-583,Sensor-584,Sensor-585,Sensor-586,Sensor-587,Sensor-588,Sensor-589,Sensor-590,Clusters,Labels
0,3045.98,2544.85,2253.3444,1873.8678,1.1691,94.9722,0.1210,1.4752,0.0084,0.0009,...,0.4948,0.0123,0.0033,2.4804,0.0291,0.0102,0.0033,35.0279,0,-1.0
1,3151.98,2563.75,2175.2556,1022.1660,1.2833,100.6222,0.1250,1.4536,-0.0110,-0.0035,...,0.5063,0.0113,0.0031,2.2284,0.0291,0.0102,0.0033,35.0279,2,-1.0
2,3071.18,2489.86,2195.3000,1151.8233,0.9220,103.5467,0.1232,1.4826,0.0136,0.0009,...,0.5033,0.0112,0.0029,2.2320,0.0291,0.0102,0.0033,35.0279,0,-1.0
3,2958.46,2523.78,2171.8556,1156.6018,1.4025,100.1367,0.1243,1.4645,0.0001,0.0067,...,0.4954,0.0136,0.0033,2.7511,0.0291,0.0102,0.0033,35.0279,0,-1.0
4,3196.21,2413.39,2255.5222,1763.0739,1.2226,101.5878,0.1200,1.4845,-0.0037,0.0049,...,0.5058,0.0094,0.0026,1.8673,0.0053,0.0188,0.0057,353.8319,0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
832,3169.00,2265.60,2187.9889,1096.3790,0.9065,97.6567,0.1221,1.4109,-0.0239,-0.0121,...,0.4912,0.0127,0.0039,2.5801,0.0166,0.0122,0.0036,73.6335,1,-1.0
833,2923.19,2516.40,2180.8889,1084.7221,0.9085,94.2467,0.1226,1.3137,0.0345,-0.0137,...,0.4939,0.0163,0.0037,3.2914,0.0166,0.0122,0.0036,73.6335,0,-1.0
834,3003.43,2448.34,2205.5000,1287.3538,2.3842,111.7644,0.1241,1.4339,0.0020,-0.0036,...,0.4966,0.0084,0.0020,1.6866,0.0166,0.0122,0.0036,73.6335,2,-1.0
835,2953.63,2500.05,2195.9778,1388.2869,1.5605,103.2400,0.1234,1.5177,0.0183,-0.0137,...,0.5009,0.0135,0.0035,2.7038,0.0178,0.0305,0.0106,171.3183,0,-1.0


In [6]:
list_of_clusters = data['Clusters'].unique()
list_of_clusters

array([0, 2, 1], dtype=int64)

In [7]:
# Getting best paramaters for Random Forest
def get_best_param_for_random_forest(x_train, y_train):
    
    #initialise the model
    clf = RandomForestClassifier()
    
    param_grid = {
            'n_estimators': [10,50,100,150],
            "criterion" : ['gini' , 'entropy'],
            "max_depth" : range(2,4),
            "max_features" : ['auto' , 'log2']
    }
        
    grid = GridSearchCV(estimator=clf, param_grid = param_grid, cv = 5 , verbose = 0)
    print('[Info] Model Grid search training started')
    grid.fit(x_train , y_train)
    
    
    criterion = grid.best_params_['criterion']
    n_estimators = grid.best_params_['n_estimators']
    max_depth = grid.best_params_['max_depth']
    max_features = grid.best_params_['max_features']
    
    # train the random forest
    clf = RandomForestClassifier(n_estimators = n_estimators , 
                                 criterion = criterion , 
                                 max_depth=max_depth , 
                                 max_features=max_features)
    print('[Info] Model training started')
    
    clf.fit(x_train , y_train)
    
    return clf

In [9]:
# Getting best paramaters for XG Boost
def get_best_param_for_xgboost(x_train, y_train):
    
    xgb = XGBClassifier(objective='binary:logistic')
    
    param_grid_xgboost = {
        'learning_rate' :[0.5,0.1,0.01,0.001],
        'max_depth':[3,5,10,20],
        'n_estimators':[10,50,100,200]
    }
    
    grid = GridSearchCV(estimator=xgb, param_grid = param_grid_xgboost, cv = 5 , verbose = 0)
    grid.fit(x_train , y_train)
    
    learning_rate = grid.best_params_['learning_rate']
    n_estimators = grid.best_params_['n_estimators']
    max_depth = grid.best_params_['max_depth']
    
    xgb = XGBClassifier(learning_rate = learning_rate , n_estimators=n_estimators , max_depth= max_depth)
    xgb.fit(x_train, y_train)
    
    return xgb

In [10]:
def get_best_model(x_train , x_test, y_train , y_test):
    
    # random forest
    random_forest = get_best_param_for_random_forest(x_train,  y_train)
    random_forest_prediction = random_forest.predict(x_test)
    random_forest_score = accuracy_score(y_test ,random_forest_prediction )
    
    #xgboost
    xg_boost = get_best_param_for_xgboost(x_train,  y_train)
    xg_boost_prediction = xg_boost.predict(x_test)
    xg_boost_score = accuracy_score(y_test ,xg_boost_prediction )
    
    if xg_boost_score > random_forest_score:
        return 'xgboost' , xg_boost
    else:
        return 'random_forest' , random_forest

In [11]:
mkdir models

A subdirectory or file models already exists.


In [13]:
### passing all the clusters and looking for the best ML algorithm to fit on individual clusters

for i in list_of_clusters:
    cluster_data = data[data['Clusters']==i] #filter the data for one cluster
    
    # prepare a data
    
    cluster_features = cluster_data.drop(['Labels' , 'Clusters'], axis = 1)
    cluster_label = cluster_data['Labels']
    
    # split the data
    x_train , x_test, y_train , y_test = train_test_split(cluster_features, cluster_label , test_size = 1/3, random_state=101)
    model_name , model = get_best_model(x_train , x_test, y_train , y_test)
    print('[INFO] model Trained')
    
    
    #Saving the ML model which is having high accuracy
    joblib.dump(model, f'models/{model_name}_{i}.pkl')


[Info] Model Grid search training started
[Info] Model training started
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































[INFO] model Trained
[Info] Model Grid search training started




[Info] Model training started




































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































[INFO] model Trained
[Info] Model Grid search training started
[Info] Model training started













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































[INFO] model Trained
