In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, RocCurveDisplay, accuracy_score

In [None]:
df_train = pd.read_csv('/content/drive/MyDrive/Machine Learning/Explore Multi-Label Classification with an Enzyme Substrate Dataset/train.csv')
df_train.head()

Unnamed: 0,id,BertzCT,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,Chi3v,Chi4n,EState_VSA1,...,SlogP_VSA3,VSA_EState9,fr_COO,fr_COO2,EC1,EC2,EC3,EC4,EC5,EC6
0,0,323.390782,9.879918,5.875576,5.875576,4.304757,4.304757,2.754513,1.749203,0.0,...,4.794537,35.527357,0,0,1,1,0,0,0,0
1,1,273.723798,7.259037,4.441467,5.834958,3.285046,4.485235,2.201375,1.289775,45.135471,...,13.825658,44.70731,0,0,0,1,1,0,0,0
2,2,521.643822,10.911303,8.527859,11.050864,6.665291,9.519706,5.824822,1.770579,15.645394,...,17.964475,45.66012,0,0,1,1,0,0,1,0
3,3,567.431166,12.453343,7.089119,12.833709,6.478023,10.978151,7.914542,3.067181,95.639554,...,31.961948,87.509997,0,0,1,1,0,0,0,0
4,4,112.770735,4.414719,2.866236,2.866236,1.875634,1.875634,1.03645,0.727664,17.980451,...,9.589074,33.333333,2,2,1,0,1,1,1,0


In [None]:
def des_ana(dataframe):
    print('----------------------Shape----------------------------')
    print(f'{dataframe.shape}\n\n')
    print('----------------------Columns----------------------------')
    print(f'{dataframe.columns}\n\n')
    print('----------------------Info----------------------------')
    print(f'{dataframe.info()}\n\n')
    print('----------------------Descritive----------------------------')
    print(f'{dataframe.describe()}\n\n')
    print('----------------------Null Values----------------------------')
    print(f'{dataframe.isnull().sum()}\n\n')
    print('----------------------Unique Values----------------------------')
    print(f'{dataframe.nunique()}\n\n')

des_ana(df_train)

----------------------Shape----------------------------
(14838, 38)


----------------------Columns----------------------------
Index(['id', 'BertzCT', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3v',
       'Chi4n', 'EState_VSA1', 'EState_VSA2', 'ExactMolWt', 'FpDensityMorgan1',
       'FpDensityMorgan2', 'FpDensityMorgan3', 'HallKierAlpha',
       'HeavyAtomMolWt', 'Kappa3', 'MaxAbsEStateIndex', 'MinEStateIndex',
       'NumHeteroatoms', 'PEOE_VSA10', 'PEOE_VSA14', 'PEOE_VSA6', 'PEOE_VSA7',
       'PEOE_VSA8', 'SMR_VSA10', 'SMR_VSA5', 'SlogP_VSA3', 'VSA_EState9',
       'fr_COO', 'fr_COO2', 'EC1', 'EC2', 'EC3', 'EC4', 'EC5', 'EC6'],
      dtype='object')


----------------------Info----------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14838 entries, 0 to 14837
Data columns (total 38 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 14838 non-null  int64  
 1   BertzCT         

In [None]:
df_train.drop(['id','EC3','EC4','EC5','EC6'], axis=1, inplace=True)

In [None]:
df_train['fr_COO'].value_counts()

0    9325
1    4332
2    1107
3      64
8       4
5       3
4       2
6       1
Name: fr_COO, dtype: int64

In [None]:
df_train['fr_COO2'].value_counts()

0    9311
1    4345
2    1108
3      64
8       4
5       3
4       2
6       1
Name: fr_COO2, dtype: int64

In [None]:
df_train['EC1'].value_counts()

1    9908
0    4930
Name: EC1, dtype: int64

In [None]:
df_train['EC2'].value_counts()

1    11855
0     2983
Name: EC2, dtype: int64

In [None]:
X = df_train.iloc[:,:-2].values
y = df_train.iloc[:,-2:].values

In [None]:
y1 = y[:,0]
y2 = y[:,1]

In [None]:
def scaler(X):
    standar_scale = StandardScaler()
    minmax_scale = MinMaxScaler()

    X[:,:-2] = standar_scale.fit_transform(X[:,:-2])
    X[:,-2:] = minmax_scale.fit_transform(X[:,-2:])
    return X

In [None]:
X = scaler(X)

In [None]:
X[:10,-2:]

array([[0.   , 0.   ],
       [0.   , 0.   ],
       [0.   , 0.   ],
       [0.   , 0.   ],
       [0.25 , 0.25 ],
       [0.125, 0.125],
       [0.   , 0.   ],
       [0.   , 0.   ],
       [0.   , 0.   ],
       [0.   , 0.   ]])

In [None]:
oversampler = RandomOverSampler()
X1, y1 = oversampler.fit_resample(X,y1)
X2, y2 = oversampler.fit_resample(X,y2)

# Hyperparameter

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [None]:
seed=42
hyper_models = {
    'Logistic Regression':{
        'model':LogisticRegression(),
        'params':{
            'max_iter':[100,200,300,500],
            "C": [0.001, 0.01, 0.1, 1, 10, 100],
            "solver": ["lbfgs", "liblinear", "newton-cg"]
        }
    },
    'Random Forest':{
        'model':RandomForestClassifier(random_state=seed),
        'params':{
            'criterion':['gini', 'entropy', 'log_loss'],
            'max_depth':[5,8,10,15],
            'n_estimators':[100,300,500]
        }
    },
    'XGBoots':{
        'model': XGBClassifier(random_state=seed),
        'params':{
            'n_estimators':[100,300,500],
            'max_depth':[5,8,10,15],
            'learning_rate':[0.1,0.15,0.2]
        }
    },
    'LGB':{
        'model': LGBMClassifier(random_state=seed),
        'params':{
            'n_estimators':[100,300,500],
            'max_depth':[5,8,10,15],
            'learning_rate':[0.1,0.15,0.2]
        }
    }
}

scores = []

for model_name, mp in hyper_models.items():
    clf = GridSearchCV(mp['model'], mp['params'], return_train_score=False)
    clf.fit(X1, y1)
    scores.append({
        'model':model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

results_train = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
results_train

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
[LightGBM] [Info] Number of positive: 7926, number of negative: 7926
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6422
[LightGBM] [Info] Number of data points in the train set: 15852, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 7926, number of negative: 7927
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6433
[LightGBM] [Info] Number of data points in the train set: 15853, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499968 -> initscore=-0.000126
[LightGBM] [Info] Start training from score -0.000126
[LightGBM] [Info] Number of positive: 7926, number of negative: 7927
You can set `force_row_wise=true` to remove the overhead.
And if memory is 

Unnamed: 0,model,best_score,best_params
0,Logistic Regression,0.638776,"{'C': 100, 'max_iter': 100, 'solver': 'lbfgs'}"
1,Random Forest,0.811315,"{'criterion': 'gini', 'max_depth': 15, 'n_esti..."
2,XGBoots,0.84109,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est..."
3,LGB,0.826707,"{'learning_rate': 0.2, 'max_depth': 15, 'n_est..."


In [19]:
for model_name, mp in hyper_models.items():
    clf = GridSearchCV(mp['model'], mp['params'], return_train_score=False)
    clf.fit(X2, y2)
    scores.append({
        'model':model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

results_train1 = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
results_train1

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
[LightGBM] [Info] Number of positive: 9484, number of negative: 9484
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6442
[LightGBM] [Info] Number of data points in the train set: 18968, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 9484, number of negative: 9484
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6423
[LightGBM] [Info] Number of data points in the train set: 18968, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 9484, number of negative: 9484
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGB

Unnamed: 0,model,best_score,best_params
0,Logistic Regression,0.638776,"{'C': 100, 'max_iter': 100, 'solver': 'lbfgs'}"
1,Random Forest,0.811315,"{'criterion': 'gini', 'max_depth': 15, 'n_esti..."
2,XGBoots,0.84109,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est..."
3,LGB,0.826707,"{'learning_rate': 0.2, 'max_depth': 15, 'n_est..."
4,Logistic Regression,0.568536,"{'C': 100, 'max_iter': 300, 'solver': 'lbfgs'}"
5,Random Forest,0.903374,"{'criterion': 'gini', 'max_depth': 15, 'n_esti..."
6,XGBoots,0.956769,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est..."
7,LGB,0.931,"{'learning_rate': 0.2, 'max_depth': 15, 'n_est..."


In [20]:
df_test = pd.read_csv('/content/drive/MyDrive/Machine Learning/Explore Multi-Label Classification with an Enzyme Substrate Dataset/test.csv')
df_test.head()

Unnamed: 0,id,BertzCT,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,Chi3v,Chi4n,EState_VSA1,...,PEOE_VSA14,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,SMR_VSA10,SMR_VSA5,SlogP_VSA3,VSA_EState9,fr_COO,fr_COO2
0,14838,344.632371,7.283603,4.473966,5.834958,3.412257,4.65153,2.096558,1.116433,49.458581,...,13.512441,0.0,0.0,0.0,26.809272,24.5398,4.794537,47.304082,1,1
1,14839,1432.410201,10.663869,7.079026,8.065215,5.297097,5.297097,3.924155,2.569694,0.0,...,0.0,34.947374,98.323987,9.606882,0.0,53.378235,0.0,43.166667,0,0
2,14840,83.352608,3.931852,1.774215,1.774215,1.073446,1.073446,0.46783,0.170838,5.969305,...,5.969305,0.0,0.0,6.420822,11.75255,13.344559,9.589074,24.666667,1,1
3,14841,150.255712,5.91279,3.548812,3.548812,2.595128,2.595128,1.642813,0.694113,0.0,...,59.935299,0.0,0.0,0.0,17.744066,32.290168,4.794537,26.778866,0,0
4,14842,1817.276351,24.91094,15.540529,20.047314,12.535886,17.730988,11.979618,4.431173,84.554972,...,23.468091,25.609359,0.0,37.099,69.141353,38.70413,50.697492,102.583333,0,0


In [49]:
df_test.shape

(9893, 32)

In [21]:
df_test['fr_COO'].value_counts()

0    6246
1    2837
2     737
3      66
4       4
8       2
7       1
Name: fr_COO, dtype: int64

In [22]:
df_test['fr_COO2'].value_counts()

0    6234
1    2846
2     740
3      66
4       4
8       2
7       1
Name: fr_COO2, dtype: int64

In [23]:
X_test = df_test.drop('id', axis=1).values

In [24]:
X_test = scaler(X_test)

In [66]:
# Predicting for EC1
#xgb_model = XGBClassifier(n_estimators=300, learning_rate=0.1, max_depth=15)
xgb_model = XGBClassifier(n_estimators=500, learning_rate=0.1, max_depth=15)
xgb_model.fit(X1, y1)
y1_pred = xgb_model.predict_proba(X_test)

In [67]:
y1_pred.shape

(9893, 2)

In [68]:
y1_pred = y1_pred[:,1]

In [69]:
# Predicting for EC2
xgb_model.fit(X2, y2)
y2_pred = xgb_model.predict_proba(X_test)

In [70]:
y2_pred.shape

(9893, 2)

In [71]:
y2_pred = y2_pred[:,1]

In [72]:
results = np.hstack((np.reshape(y1_pred,(-1,1)),np.reshape(y2_pred,(-1,1))))

In [73]:
results.shape

(9893, 2)

In [74]:
results = pd.DataFrame(results, columns=['EC1','EC2'])

In [75]:
results.head()

Unnamed: 0,EC1,EC2
0,0.039647,0.973617
1,0.98481,0.985456
2,0.977435,0.82421
3,0.892554,0.933621
4,0.294243,0.993218


In [76]:
submission = pd.concat([df_test['id'],results], axis=1).to_csv('submission2.csv', index=False)