In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [3]:
train_df = pd.read_csv('221112_AI4DR_Train_Hill_parameters.csv')
test_df = pd.read_csv('221112_AI4DR_Test_Hill_parameters.csv')

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65000 entries, 0 to 64999
Data columns (total 23 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    65000 non-null  int64  
 1   row ID        65000 non-null  object 
 2   CurveType     65000 non-null  object 
 3   Classe        65000 non-null  object 
 4   Classe_Index  65000 non-null  int64  
 5   CATOP         65000 non-null  int64  
 6   CANB          65000 non-null  int64  
 7   CASIG         65000 non-null  int64  
 8   CANT          65000 non-null  int64  
 9   CAHS          65000 non-null  int64  
 10  CNA           65000 non-null  int64  
 11  P             65000 non-null  int64  
 12  NT            65000 non-null  int64  
 13  LS            65000 non-null  int64  
 14  BA            65000 non-null  int64  
 15  BTOX          65000 non-null  int64  
 16  W             65000 non-null  int64  
 17  LU            65000 non-null  int64  
 18  r2            64885 non-nu

In [5]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6500 entries, 0 to 6499
Data columns (total 23 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    6500 non-null   int64  
 1   row ID        6500 non-null   object 
 2   CurveType     6500 non-null   object 
 3   Classe        6500 non-null   object 
 4   Classe_Index  6500 non-null   int64  
 5   CATOP         6500 non-null   int64  
 6   CANB          6500 non-null   int64  
 7   CASIG         6500 non-null   int64  
 8   CANT          6500 non-null   int64  
 9   CAHS          6500 non-null   int64  
 10  CNA           6500 non-null   int64  
 11  P             6500 non-null   int64  
 12  NT            6500 non-null   int64  
 13  LS            6500 non-null   int64  
 14  BA            6500 non-null   int64  
 15  BTOX          6500 non-null   int64  
 16  W             6500 non-null   int64  
 17  LU            6500 non-null   int64  
 18  r2            6490 non-null 

In [6]:
train_df = train_df.dropna()
test_df = test_df.dropna()

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64885 entries, 0 to 64999
Data columns (total 23 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    64885 non-null  int64  
 1   row ID        64885 non-null  object 
 2   CurveType     64885 non-null  object 
 3   Classe        64885 non-null  object 
 4   Classe_Index  64885 non-null  int64  
 5   CATOP         64885 non-null  int64  
 6   CANB          64885 non-null  int64  
 7   CASIG         64885 non-null  int64  
 8   CANT          64885 non-null  int64  
 9   CAHS          64885 non-null  int64  
 10  CNA           64885 non-null  int64  
 11  P             64885 non-null  int64  
 12  NT            64885 non-null  int64  
 13  LS            64885 non-null  int64  
 14  BA            64885 non-null  int64  
 15  BTOX          64885 non-null  int64  
 16  W             64885 non-null  int64  
 17  LU            64885 non-null  int64  
 18  r2            64885 non-nu

In [8]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6490 entries, 0 to 6499
Data columns (total 23 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    6490 non-null   int64  
 1   row ID        6490 non-null   object 
 2   CurveType     6490 non-null   object 
 3   Classe        6490 non-null   object 
 4   Classe_Index  6490 non-null   int64  
 5   CATOP         6490 non-null   int64  
 6   CANB          6490 non-null   int64  
 7   CASIG         6490 non-null   int64  
 8   CANT          6490 non-null   int64  
 9   CAHS          6490 non-null   int64  
 10  CNA           6490 non-null   int64  
 11  P             6490 non-null   int64  
 12  NT            6490 non-null   int64  
 13  LS            6490 non-null   int64  
 14  BA            6490 non-null   int64  
 15  BTOX          6490 non-null   int64  
 16  W             6490 non-null   int64  
 17  LU            6490 non-null   int64  
 18  r2            6490 non-null 

In [9]:
train_df.columns

Index(['Unnamed: 0', 'row ID', 'CurveType', 'Classe', 'Classe_Index', 'CATOP',
       'CANB', 'CASIG', 'CANT', 'CAHS', 'CNA', 'P', 'NT', 'LS', 'BA', 'BTOX',
       'W', 'LU', 'r2', 'top', 'bottom', 'ec50', 'nh'],
      dtype='object')

In [10]:
param_grid = { 
    'n_estimators': [50, 100 ],
    'max_depth' : [20, 50, 100 ],
    'min_samples_leaf': [10, 20, 50],
    'min_samples_split': [10,20,50],
    'max_leaf_nodes' : [50, 100 ]
}

In [11]:
X_train = train_df[['r2', 'top', 'bottom', 'ec50', 'nh']].values
X_test = test_df[['r2', 'top', 'bottom', 'ec50', 'nh']].values

In [12]:
y_train = train_df[['CATOP', 'CANB', 'CASIG', 'CANT', 'CAHS', 'CNA', 'P', 'NT', 'LS', 'BA', 'BTOX', 'W', 'LU']].values
y_test = test_df[['CATOP', 'CANB', 'CASIG', 'CANT', 'CAHS', 'CNA', 'P', 'NT', 'LS', 'BA', 'BTOX', 'W', 'LU']].values

In [13]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [14]:
CV_classifier = GridSearchCV(estimator=RandomForestClassifier(n_jobs=8, bootstrap=True, random_state=42), param_grid=param_grid, cv= 5)
CV_classifier.fit(X_train, y_train)
y_pred = CV_classifier.predict(X_test)

In [15]:
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92       500
           1       0.87      0.84      0.85       500
           2       0.88      0.82      0.85       500
           3       0.97      0.95      0.96       500
           4       0.91      0.80      0.85       500
           5       0.95      0.89      0.92       499
           6       0.95      0.95      0.95       500
           7       0.88      0.86      0.87       500
           8       0.94      0.94      0.94       495
           9       0.97      0.88      0.92       500
          10       0.88      0.66      0.76       500
          11       0.96      0.84      0.90       500
          12       0.99      0.98      0.98       496

   micro avg       0.93      0.87      0.90      6490
   macro avg       0.93      0.87      0.90      6490
weighted avg       0.93      0.87      0.90      6490
 samples avg       0.87      0.87      0.87      6490

0.871648690292758


  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
CV_classifier.best_params_

{'max_depth': 20,
 'max_leaf_nodes': 100,
 'min_samples_leaf': 20,
 'min_samples_split': 10,
 'n_estimators': 100}

In [18]:
?y_pred

[0;31mType:[0m        ndarray
[0;31mString form:[0m
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]]
[0;31mLength:[0m      6490
[0;31mFile:[0m        /groups/idd/conda_envs/env_AI4DR_modeling/lib/python3.9/site-packages/numpy/__init__.py
[0;31mDocstring:[0m  
ndarray(shape, dtype=float, buffer=None, offset=0,
        strides=None, order=None)

An array object represents a multidimensional, homogeneous array
of fixed-size items.  An associated data-type object describes the
format of each element in the array (its byte-order, how many bytes it
occupies in memory, whether it is an integer, a floating point number,
or something else, etc.)

Arrays should be constructed using `array`, `zeros` or `empty` (refer
to the See Also section below).  The parameters given here refer to
a low-level method (`ndarray(...)`) for instantiating an array.

For more information, refer to the `numpy` module and examine the
meth

In [13]:
classifier = RandomForestClassifier(n_estimators=20, random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [15]:
#print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.97      0.96       500
           1       0.93      0.89      0.91       500
           2       0.88      0.82      0.85       500
           3       0.97      0.98      0.98       500
           4       0.91      0.86      0.88       500
           5       0.96      0.93      0.95       499
           6       0.97      0.96      0.97       500
           7       0.93      0.92      0.92       500
           8       0.95      0.95      0.95       495
           9       0.98      0.94      0.96       500
          10       0.92      0.85      0.88       500
          11       0.96      0.91      0.93       500
          12       0.99      0.98      0.99       496

   micro avg       0.95      0.92      0.93      6490
   macro avg       0.95      0.92      0.93      6490
weighted avg       0.95      0.92      0.93      6490
 samples avg       0.92      0.92      0.92      6490

0.9204930662557781


  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
train_df[['CurveType', 'Classe', 'Classe_Index']].groupby('CurveType').first().sort_values('Classe_Index')

Unnamed: 0_level_0,Classe,Classe_Index
CurveType,Unnamed: 1_level_1,Unnamed: 2_level_1
CATOP,CATOP,0
CANB,CANB,1
CASIG,CASIG,2
CANT,CANT,3
CAHS,CAHS,4
CNA,CNA,5
P,P,6
NT,NT,7
LS,LS,8
BA,BA,9
