# ID2214/FID3214 Assignment 4 Group no. [1]
### Project members: 
[Patrik Zhong, pzhong@kth.se]
[Edvin Walleborn, edvinw@kth.se]
[Alexander Carlsson, alecarls@kth.se]


### Declaration
By submitting this solution, it is hereby declared that all individuals listed above have contributed to the solution, either with code that appear in the final solution below, or with code that has been evaluated and compared to the final solution, but for some reason has been excluded. It is also declared that all project members fully understand all parts of the final solution and can explain it upon request.

It is furthermore declared that the code below is a contribution by the project members only, and specifically that no part of the solution has been copied from any other source (except for lecture slides at the course ID2214/FID3214) and no part of the solution has been provided by someone not listed as project member above.

It is furthermore declared that it has been understood that no other library/package than the Python 3 standard library, NumPy, pandas, time and sklearn.tree, may be used in the solution for this assignment.

### Required to run/Declaration of used libraries
This notebook uses RDKit. Install it with `pip install rdkit` before running the cells below.


## Load NumPy, pandas, RDKit and the scikit-learn components used below

In [1]:
import numpy as np
import pandas as pd
import sklearn
import rdkit.Chem.rdMolDescriptors as d
import rdkit.Chem.Lipinski as l
import rdkit
from rdkit import Chem
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV


In [2]:
# =========================================== COLUMN FILTER ===================================== #
# =========================================== COLUMN FILTER ===================================== #

# Debug aid: find_bin() appends every distinct bin index it returns to this
# list; it is only meant to be printed when inspecting the binning.
globalTestList = []
def create_column_filter(df):
    """Remove uninformative columns and report which columns survive.

    A column is kept when it is named 'CLASS' or 'ID', or when it holds more
    than one distinct non-missing value. Columns that are entirely missing or
    constant are removed.

    Args:
        df: Input DataFrame; it is left unmodified.

    Returns:
        A tuple ``(dfnew, column_filter)``: the filtered copy of ``df`` and
        the list of kept column names in their original order.
    """
    protected = ('CLASS', 'ID')

    # An all-missing column has zero distinct values, so one test covers both
    # the "all NaN" and the "constant" case.
    column_filter = [
        name
        for name in df.columns
        if name in protected or df[name].dropna().nunique() > 1
    ]

    rejected = [name for name in df.columns if name not in column_filter]
    dfnew = df.drop(columns=rejected)

    return dfnew, column_filter

def apply_column_filter(df, column_filter):
    """Return a copy of ``df`` restricted to the columns in ``column_filter``.

    Names in ``column_filter`` that do not occur in ``df`` are ignored; the
    result follows the order of ``column_filter``.
    """
    present = [name for name in column_filter if name in df.columns]
    return df[present].copy()

# =========================================== NORMALIZATION ===================================== #
# =========================================== NORMALIZATION ===================================== #

def create_normalization(df, normalizationtype):
    """Normalize the numeric feature columns of a copy of ``df``.

    Columns named 'CLASS' or 'ID', and columns whose dtype is neither float
    nor int64, are left untouched.

    Args:
        df: DataFrame to normalize; it is not modified.
        normalizationtype: 'zscore' (subtract the mean, divide by the standard
            deviation) or 'minmax' (rescale to the range [0, 1]). Any other
            value prints a message and returns an unchanged copy together
            with an empty mapping.

    Returns:
        A tuple ``(dfnew, valueDict)``. ``valueDict`` maps each normalized
        column to ``("zscore", mean, std)`` or ``("minmax", min, max)``.
    """
    dfnew = df.copy()
    valueDict = {}

    if normalizationtype not in ('zscore', 'minmax'):
        print("Normalization type not recognized")
        return dfnew, valueDict

    for col in dfnew.columns:
        dtype = dfnew[col].dtypes
        is_numeric = (pd.api.types.is_float_dtype(dtype)
                      or pd.api.types.is_int64_dtype(dtype))
        if col == 'CLASS' or col == 'ID' or not is_numeric:
            continue

        if normalizationtype == 'zscore':
            first = dfnew[col].mean()
            second = dfnew[col].std()
            offset, scale = first, second
        else:
            first = dfnew[col].min()
            second = dfnew[col].max()
            offset, scale = first, second - first

        # A constant column has zero spread; dividing by it would turn the
        # whole column into NaN. Use a unit scale so such columns become 0.
        if pd.isna(scale) or scale == 0:
            scale = 1.0

        dfnew[col] = (dfnew[col] - offset) / scale
        valueDict[col] = (normalizationtype, first, second)

    return dfnew, valueDict

def apply_normalization(df, normalization):
    """Apply a normalization learned by ``create_normalization`` to ``df``.

    Columns named 'CLASS' or 'ID', and columns whose dtype is neither float
    nor int64, are left untouched.

    Args:
        df: DataFrame to normalize; it is not modified.
        normalization: Mapping from column name to ``("minmax", min, max)``
            or ``("zscore", mean, std)``.

    Returns:
        A normalized copy of ``df``. Min-max results are clipped to [0, 1].

    Raises:
        KeyError: If a numeric feature column has no entry in
            ``normalization``.
    """
    dfnew = df.copy()
    for col in dfnew.columns:
        dtype = dfnew[col].dtypes
        is_numeric = (pd.api.types.is_float_dtype(dtype)
                      or pd.api.types.is_int64_dtype(dtype))
        if col == 'CLASS' or col == 'ID' or not is_numeric:
            continue

        normalizationType, first, second = normalization[col]

        if normalizationType == "minmax":
            span = second - first
            # Zero range (constant training column): avoid dividing by zero.
            if pd.isna(span) or span == 0:
                span = 1.0
            # Values outside the training range are clipped into [0, 1].
            dfnew[col] = ((dfnew[col] - first) / span).clip(lower=0.0, upper=1.0)

        elif normalizationType == "zscore":
            spread = second
            # Zero or undefined standard deviation: only centre the column.
            if pd.isna(spread) or spread == 0:
                spread = 1.0
            dfnew[col] = (dfnew[col] - first) / spread

    return dfnew

# =========================================== IMPUTATION ===================================== #
# =========================================== IMPUTATION ===================================== #

def create_imputation(df):
    """Fill missing values in a copy of ``df`` and record the fill values.

    Numeric columns (float or int64) are filled with the column mean, or with
    zero when the mean is undefined. Object, string and categorical columns
    are filled with the most frequent value; when a column has no non-missing
    value at all, object/string columns fall back to the empty string and
    categorical columns to their first category. Columns named 'CLASS' or
    'ID' are skipped. Columns of any other dtype are reported with a printed
    message and left as they are.

    Args:
        df: DataFrame to impute; it is not modified.

    Returns:
        A tuple ``(dfnew, mappingDict)`` with the imputed copy and a mapping
        from column name to the value used for filling.

    Raises:
        IndexError: If a categorical column has no observed value and no
            categories to fall back on.
    """
    dfnew = df.copy()
    mappingDict = {}

    for col in dfnew.columns:
        if col == 'CLASS' or col == 'ID':
            continue

        column = dfnew[col]
        dtype = column.dtype
        is_int64 = pd.api.types.is_int64_dtype(dtype)

        if is_int64 or pd.api.types.is_float_dtype(dtype):
            fill = column.mean()
            if pd.isna(fill):
                fill = 0 if is_int64 else 0.0
        elif isinstance(dtype, pd.CategoricalDtype):
            # mode() returns an empty Series when every value is missing.
            modes = column.mode(dropna=True)
            fill = modes.iloc[0] if len(modes) else column.cat.categories[0]
        elif (pd.api.types.is_object_dtype(dtype)
              or pd.api.types.is_string_dtype(dtype)):
            modes = column.mode(dropna=True)
            fill = modes.iloc[0] if len(modes) else ""
        else:
            print("no match on dtype in if-statement in (create_imputation)")
            continue

        # Assign the result back instead of filling in place: an in-place
        # fillna on a selected column may act on a temporary copy.
        dfnew[col] = column.fillna(fill)
        mappingDict[col] = fill

    return dfnew, mappingDict


def apply_imputation(df, imputation):
    """Fill missing values in a copy of ``df`` using recorded fill values.

    Args:
        df: DataFrame to impute; it is not modified.
        imputation: Mapping from column name to fill value, as returned by
            ``create_imputation``.

    Returns:
        A copy of ``df`` in which missing values of every column listed in
        ``imputation`` are replaced. Columns named 'CLASS' or 'ID' and
        columns without an entry in ``imputation`` are left unchanged.
    """
    dfnew = df.copy()
    for col in dfnew.columns:
        if col == 'CLASS' or col == 'ID':
            continue
        # create_imputation records nothing for dtypes it does not handle,
        # so such columns must be tolerated here as well.
        if col not in imputation:
            continue
        # Assign back rather than filling in place on a column selection.
        dfnew[col] = dfnew[col].fillna(imputation[col])
    return dfnew

# =========================================== DISCRETIZATION/BINNING ===================================== #
# =========================================== DISCRETIZATION/BINNING ===================================== #

def find_bin(x, binsVar):
    """Return the index of the half-open bin ``(lower, upper]`` holding ``x``.

    Args:
        x: Value to place.
        binsVar: Sequence of bin edges; bin ``i`` spans
            ``(binsVar[i], binsVar[i + 1]]``.

    Returns:
        The bin index, or ``None`` when ``x`` falls in no bin (for example
        NaN, a value at or below the first edge, or a value above the last
        edge).

    Side effects:
        Each distinct index returned is appended to the module-level
        ``globalTestList``.
    """
    # Only len(binsVar) - 1 bins exist; stopping there avoids indexing one
    # past the final edge when x lies above it.
    for i in range(len(binsVar) - 1):
        if binsVar[i] < x <= binsVar[i + 1]:
            if i not in globalTestList:
                globalTestList.append(i)
            return i
    return None

def create_bins(df, nobins, bintype):
    """Discretize the numeric feature columns of a copy of ``df``.

    Columns named 'CLASS' or 'ID', and columns whose dtype is neither float
    nor int64, are not discretized.

    Args:
        df: DataFrame to discretize; it is not modified.
        nobins: Number of bins requested per column.
        bintype: "equal-width" (``pd.cut``) or "equal-size" (``pd.qcut``;
            duplicate edges are dropped, so fewer bins may result).

    Returns:
        A tuple ``(dfnew, binning)``: the discretized copy, whose binned
        columns hold bin indices with dtype "category", and a mapping from
        column name to the array of bin edges. The outermost edges are
        replaced by -inf and +inf.

    Raises:
        ValueError: If ``bintype`` is not recognized while a column is being
            discretized.
    """
    dfnew = df.copy()
    binning = {}

    for col in dfnew.columns:
        dtype = dfnew[col].dtypes
        is_numeric = (pd.api.types.is_float_dtype(dtype)
                      or pd.api.types.is_int64_dtype(dtype))
        if col == 'CLASS' or col == 'ID' or not is_numeric:
            continue

        if bintype == "equal-width":
            _, binsVar = pd.cut(dfnew[col], bins=nobins, labels=False, retbins=True)
        elif bintype == "equal-size":
            _, binsVar = pd.qcut(dfnew[col], q=nobins, duplicates="drop",
                                 labels=False, retbins=True)
        else:
            # Previously this printed a message and then failed on None[0].
            raise ValueError("Not recognizable bintype to create_bins")

        # Open the outer edges so unseen values still fall into a bin.
        binsVar[0] = -np.inf
        binsVar[-1] = np.inf
        binning[col] = binsVar

        dfnew[col] = dfnew[col].apply(lambda x: find_bin(x, binsVar))
        dfnew[col] = dfnew[col].astype("category")

    # Return the discretized copy; returning ``df`` discarded all the work.
    return dfnew, binning

def apply_bins(df, binning):
    """Discretize a copy of ``df`` with previously computed bin edges.

    Columns named 'CLASS' or 'ID', and columns whose dtype is neither float
    nor int64, are left as they are. Every other column is replaced by its
    bin indices (as found by ``find_bin``) and converted to dtype "category".

    Args:
        df: DataFrame to discretize; it is not modified.
        binning: Mapping from column name to its bin edges.

    Returns:
        The discretized copy of ``df``.
    """
    dfnew = df.copy()

    for col in dfnew.columns:
        if col == 'CLASS' or col == 'ID':
            continue

        dtype = dfnew[col].dtypes
        if not (pd.api.types.is_float_dtype(dtype)
                or pd.api.types.is_int64_dtype(dtype)):
            continue

        # pd.cut hands back the edges it used; the indices it computes are
        # not used, find_bin assigns them below.
        _, edges = pd.cut(dfnew[col], bins=binning[col], labels=False, retbins=True)

        bin_indices = dfnew[col].apply(lambda value: find_bin(value, edges))
        dfnew[col] = bin_indices
        dfnew[col] = dfnew[col].astype("category")

    return dfnew

In [3]:
from platform import python_version

# Report the interpreter and library versions used for this run, one per line.
print(
    "\n".join(
        [
            f"Python version: {python_version()}",
            f"NumPy version: {np.__version__}",
            f"Pandas version: {pd.__version__}",
            f"sklearn version: {sklearn.__version__}",
            f"rdkit version: {rdkit.__version__}",
        ]
    )
)


Python version: 3.9.13
NumPy version: 1.21.5
Pandas version: 1.4.4
sklearn version: 1.0.2
rdkit version: 2022.09.1


## 1. First box of Assignment 4

In [5]:
# Load the training-set fingerprints and drop the index column stored in the CSV.
loaded_fprints = pd.read_csv("train_fprints.csv")
loaded_fprints.drop(columns=["Unnamed: 0"], inplace=True)

# Prefix every column in a single rename; the column originally named "0"
# is then renamed to ACTIVE.
loaded_fprints.rename(columns=lambda col: 'fingerprint_col_' + col, inplace=True)
loaded_fprints.rename(columns={'fingerprint_col_0': 'ACTIVE'}, inplace=True)

# Load the test-set fingerprints the same way.
loaded_fprints_test = pd.read_csv("test_fprints.csv")
loaded_fprints_test.drop(columns=["Unnamed: 0"], inplace=True)

# Prefix the TEST frame's columns. The old loop renamed loaded_fprints again,
# which left the test columns without the prefix.
# NOTE(review): confirm how the columns of test_fprints.csv are numbered
# before relying on them matching the training columns.
loaded_fprints_test.rename(columns=lambda col: 'fingerprint_col_' + col, inplace=True)

display(loaded_fprints)



Unnamed: 0,ACTIVE,fingerprint_col_1,fingerprint_col_2,fingerprint_col_3,fingerprint_col_4,fingerprint_col_5,fingerprint_col_6,fingerprint_col_7,fingerprint_col_8,fingerprint_col_9,...,fingerprint_col_114,fingerprint_col_115,fingerprint_col_116,fingerprint_col_117,fingerprint_col_118,fingerprint_col_119,fingerprint_col_120,fingerprint_col_121,fingerprint_col_122,fingerprint_col_123
0,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,1,1,0
1,0,0,1,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,1,1,0
2,0,1,0,1,1,1,1,1,0,0,...,1,0,0,0,0,0,0,1,1,1
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
4,0,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156253,0,0,1,0,0,1,0,1,1,1,...,1,1,1,0,0,0,1,0,1,0
156254,1,0,1,0,1,1,0,0,0,1,...,0,0,0,0,0,1,0,0,1,1
156255,0,0,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,1,0
156256,1,0,1,0,0,1,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0


### Valideringsset uppdelning

In [6]:
from sklearn.impute import SimpleImputer

# def dataprep(df):
#     df1 = df.copy()
#     df1.pop('SMILES')
#     dframe = pd.DataFrame(SimpleImputer().fit_transform(df1), columns = df1.columns)

#     normalized_df = (df1-df1.min())/(df1.max()-df1.min())
#     # normalized_df = normalized_df.pop('SMILES')
    
#     return normalized_df

In [7]:
# code cell
def split(df, test_size=0.2, random_state=None, stratify_col=None):
    """Split ``df`` into a training part and a held-out part.

    Args:
        df: DataFrame to split.
        test_size: Share of rows placed in the held-out part (default 0.2).
        random_state: Optional seed passed to ``train_test_split`` so the
            split can be reproduced; ``None`` gives a different split on
            each call.
        stratify_col: Optional name of a column in ``df`` whose class
            proportions should be preserved in both parts.

    Returns:
        A tuple ``(training_df, test_df)``.
    """
    stratify = df[stratify_col] if stratify_col is not None else None
    training_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )
    return training_df, test_df

# Create the files and split

In [8]:
# Molecular-feature tables: the training file first, then the test file.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("train_w_features.csv", "test_w_features.csv")
)

# Hold out part of the training data as a validation set.
train_df_new, validation_df = split(train_df)

# To inspect the class balance of a split, compare
# frame["ACTIVE"].value_counts() with len(frame) for train_df_new and
# validation_df.

# Same hold-out split for the fingerprint representation.
fp_train, fp_validation = split(loaded_fprints)




### RF HYPERPARAMETER

In [None]:
##MOLECULAR
from sklearn.model_selection import GridSearchCV

def hyperparametersRF(dftrain):
    """Grid-search random-forest hyperparameters on the molecular features.

    The 'INDEX', 'SMILES' and 'ACTIVE' columns are removed from the features,
    the remaining columns are min-max normalized, and a 5-fold
    cross-validated grid search scored by ROC AUC is run over
    ``n_estimators`` and ``max_features``. The best combination is printed.

    Args:
        dftrain: Training DataFrame containing the columns 'INDEX', 'SMILES'
            and 'ACTIVE' (the target).
    """
    Y_train = dftrain['ACTIVE']
    X_train = dftrain.drop(columns=['INDEX', 'SMILES', 'ACTIVE'])

    # oob_score is documented as a boolean; pass True rather than the integer 1.
    rfc = RandomForestClassifier(n_jobs=-1, max_features='sqrt', n_estimators=50, oob_score=True)
    param_grid = {'n_estimators': [200, 300, 1000, 1200], 'max_features': ['log2', 'sqrt']}

    norm_train, dummy = create_normalization(X_train, normalizationtype="minmax")

    CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, scoring='roc_auc', cv=5)
    CV_rfc.fit(norm_train, Y_train)
    print(CV_rfc.best_params_)

# An earlier run reported {'max_features': 'auto', 'n_estimators': 1000};
# 'auto' is not part of the current grid, so that result predates this grid.
hyperparametersRF(train_df_new)


exception calling callback for <Future at 0x1cd5bc4b550 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "D:\Anaconda\lib\site-packages\joblib\externals\loky\_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "D:\Anaconda\lib\site-packages\joblib\parallel.py", line 359, in __call__
    self.parallel.dispatch_next()
  File "D:\Anaconda\lib\site-packages\joblib\parallel.py", line 794, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "D:\Anaconda\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "D:\Anaconda\lib\site-packages\joblib\parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "D:\Anaconda\lib\site-packages\joblib\_parallel_backends.py", line 531, in apply_async
    future = self._workers.submit(SafeFunction(func))
  File "D:\Anaconda\lib\site-packages\joblib\externals\loky\reusable_executor.

In [None]:
##FINGERPRINTS
from sklearn.model_selection import GridSearchCV

def hyperparametersRF(dftrain):
    """Grid-search random-forest hyperparameters on the fingerprint features.

    Every column except 'ACTIVE' is used as a feature. The features are
    min-max normalized and a 5-fold cross-validated grid search scored by
    ROC AUC is run over ``n_estimators`` and ``max_features``. The best
    combination is printed.

    Args:
        dftrain: Training DataFrame with the target in column 'ACTIVE'.
    """
    Y_train = dftrain['ACTIVE']
    X_train = dftrain.drop(columns=['ACTIVE'])

    # oob_score is documented as a boolean; pass True rather than the integer 1.
    rfc = RandomForestClassifier(n_jobs=-1, max_features='sqrt', n_estimators=50, oob_score=True)
    param_grid = {'n_estimators': [200, 300, 1000, 1200], 'max_features': ['log2', 'sqrt']}

    norm_train, dummy = create_normalization(X_train, normalizationtype="minmax")

    CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, scoring='roc_auc', cv=5)
    CV_rfc.fit(norm_train, Y_train)
    print(CV_rfc.best_params_)


# An earlier run reported {'max_features': 'auto', 'n_estimators': 1000};
# 'auto' is not part of the current grid, so that result predates this grid.
hyperparametersRF(fp_train)

# KNN HYPERPARAMETER

In [None]:
#MOLECULAR FEATURES
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

def hyperparametersKNN(dftrain):
    """Grid-search k-nearest-neighbour hyperparameters on the molecular features.

    The 'INDEX', 'SMILES' and 'ACTIVE' columns are removed from the features,
    the remaining columns are min-max normalized, and a 5-fold
    cross-validated grid search scored by ROC AUC is run. The best
    combination is printed.

    Args:
        dftrain: Training DataFrame containing the columns 'INDEX', 'SMILES'
            and 'ACTIVE' (the target).

    Returns:
        The placeholder list ``[1, 2, 3]``.
    """
    Y_train = dftrain['ACTIVE']
    X_train = dftrain.drop(columns=['INDEX', 'SMILES', 'ACTIVE'])
    knc = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30,
        p=2, metric='minkowski', metric_params=None, n_jobs=None)

    norm_train, dummy = create_normalization(X_train, normalizationtype="minmax")

    # 'uniform' (not 'uniforms') is the valid name of the unweighted scheme.
    param_grid = {
        'n_neighbors': [1, 2, 5, 10, 20, 40, 80, 160],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'leaf_size': [10, 20, 30, 40, 50],
    }

    CV_knc = GridSearchCV(estimator=knc, param_grid=param_grid, scoring='roc_auc', cv=5)
    CV_knc.fit(norm_train, Y_train)
    print(CV_knc.best_params_)
    return [1,2,3]

# The call prints the best parameter combination found by the grid search.

hyperparametersKNN(train_df_new)

In [None]:
#FINGERPRINTS
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

def hyperparametersKNN(dftrain):
    """Grid-search k-nearest-neighbour hyperparameters on the fingerprints.

    Every column except 'ACTIVE' is used as a feature. The features are
    min-max normalized and a 5-fold cross-validated grid search scored by
    ROC AUC is run. The best combination is printed.

    Args:
        dftrain: Training DataFrame with the target in column 'ACTIVE'.

    Returns:
        The placeholder list ``[1, 2, 3]``.
    """
    Y_train = dftrain['ACTIVE']
    X_train = dftrain.drop(columns=['ACTIVE'])
    knc = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30,
        p=2, metric='minkowski', metric_params=None, n_jobs=None)

    norm_train, dummy = create_normalization(X_train, normalizationtype="minmax")

    # 'uniform' (not 'uniforms') is the valid name of the unweighted scheme.
    param_grid = {
        'n_neighbors': [1, 2, 5, 10, 20, 40, 80, 160],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'leaf_size': [10, 20, 30, 40, 50],
    }

    CV_knc = GridSearchCV(estimator=knc, param_grid=param_grid, scoring='roc_auc', cv=5)
    CV_knc.fit(norm_train, Y_train)
    print(CV_knc.best_params_)
    return [1,2,3]

# The call prints the best parameter combination found by the grid search.

hyperparametersKNN(fp_train)

### Algorithms and Models



In [None]:
##MOLECULAR FEATURES
def randomforest(dftrain, dftest):
    """Train a random forest on the molecular features and print its ROC AUC.

    The 'INDEX', 'SMILES' and 'ACTIVE' columns are removed from the features.
    The min-max normalization is learned on the training features and then
    applied to the evaluation features, so both use the same scale.

    Args:
        dftrain: Training DataFrame with columns 'INDEX', 'SMILES', 'ACTIVE'.
        dftest: Evaluation DataFrame with the same columns.
    """
    Y_train = dftrain['ACTIVE']
    X_train = dftrain.drop(columns=['INDEX', 'SMILES', 'ACTIVE'])
    Y_test = dftest['ACTIVE']
    X_test = dftest.drop(columns=['INDEX', 'SMILES', 'ACTIVE'])

    # Learn min/max on the training data only and reuse it for evaluation;
    # normalizing the evaluation set by its own min/max changes the scale.
    norm_train, normalization = create_normalization(X_train, normalizationtype="minmax")
    norm_test = apply_normalization(X_test, normalization)

    # "sqrt" is what "auto" meant for classifiers; "auto" is deprecated.
    rf_model = RandomForestClassifier(n_estimators=50, max_features="sqrt", random_state=44)
    rf_model.fit(norm_train, Y_train)

    # ROC AUC needs a ranking score, so use the predicted probability of the
    # second class in rf_model.classes_ (assumed to be the positive class)
    # rather than hard 0/1 predictions.
    positive_scores = rf_model.predict_proba(norm_test)[:, 1]
    print(metrics.roc_auc_score(Y_test, positive_scores))

randomforest(train_df_new, validation_df)


In [None]:
##FINGERPRINTS
def randomforest(dftrain, dftest):
    """Train a random forest on imputed, scaled features and print its ROC AUC.

    Every column except 'ACTIVE' is used as a feature. Missing values are
    imputed with ``SimpleImputer`` and the features are min-max scaled. Both
    the imputer and the scaling are fitted on the training data and reused
    for the evaluation data.

    Args:
        dftrain: Training DataFrame with the target in column 'ACTIVE'; all
            other columns must be numeric.
        dftest: Evaluation DataFrame with the same columns.
    """
    Y_train = dftrain['ACTIVE']
    X_train = dftrain.drop(columns=['ACTIVE'])
    Y_test = dftest['ACTIVE']
    X_test = dftest.drop(columns=['ACTIVE'])

    # Fit the imputer on the training data and reuse it for the evaluation data.
    imputer = SimpleImputer()
    imputed_X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
    imputed_X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

    # Min-max scaling from training statistics. A constant column has zero
    # range; use 1 instead so it is not turned into NaN.
    column_min = imputed_X_train.min()
    column_range = (imputed_X_train.max() - column_min).replace(0, 1)
    normalized_X_train = (imputed_X_train - column_min) / column_range
    normalized_X_test = ((imputed_X_test - column_min) / column_range).clip(lower=0.0, upper=1.0)

    # "sqrt" is what "auto" meant for classifiers; "auto" is deprecated.
    rf_model = RandomForestClassifier(n_estimators=50, max_features="sqrt", random_state=44)
    rf_model.fit(normalized_X_train, Y_train)

    # ROC AUC needs a ranking score, so use the predicted probability of the
    # second class in rf_model.classes_ (assumed to be the positive class).
    positive_scores = rf_model.predict_proba(normalized_X_test)[:, 1]
    print(metrics.roc_auc_score(Y_test, positive_scores))

# This variant keeps every non-target column, so it must get the fingerprint
# frames; the molecular frames still contain the SMILES strings.
randomforest(fp_train, fp_validation)

In [None]:
##FINGERPRINTS
def randomforest(dftrain, dftest):
    """Train a random forest on the fingerprint features and print its ROC AUC.

    Every column except 'ACTIVE' is used as a feature. The min-max
    normalization is learned on the training features and then applied to
    the evaluation features, so both use the same scale.

    Args:
        dftrain: Training DataFrame with the target in column 'ACTIVE'.
        dftest: Evaluation DataFrame with the same columns.
    """
    Y_train = dftrain['ACTIVE']
    X_train = dftrain.drop(columns=['ACTIVE'])
    Y_test = dftest['ACTIVE']
    X_test = dftest.drop(columns=['ACTIVE'])

    # Learn min/max on the training data only and reuse it for evaluation.
    norm_train, normalization = create_normalization(X_train, normalizationtype="minmax")
    norm_test = apply_normalization(X_test, normalization)

    # "sqrt" is what "auto" meant for classifiers; "auto" is deprecated.
    rf_model = RandomForestClassifier(n_estimators=50, max_features="sqrt", random_state=44)
    rf_model.fit(norm_train, Y_train)

    # ROC AUC needs a ranking score, so use the predicted probability of the
    # second class in rf_model.classes_ (assumed to be the positive class).
    positive_scores = rf_model.predict_proba(norm_test)[:, 1]
    print(metrics.roc_auc_score(Y_test, positive_scores))

randomforest(fp_train, fp_validation)

In [None]:
def hyperparameterBayes(dftrain):
    """Grid-search Complement Naive Bayes hyperparameters on the molecular features.

    The 'INDEX', 'SMILES' and 'ACTIVE' columns are removed from the features,
    the remaining columns are min-max normalized, and a 5-fold
    cross-validated grid search scored by ROC AUC is run over ``alpha`` and
    ``norm``. The best combination is printed.

    Args:
        dftrain: Training DataFrame containing the columns 'INDEX', 'SMILES'
            and 'ACTIVE' (the target).

    Returns:
        The placeholder list ``[1, 2, 3]``.
    """
    # Imported here because this cell comes before the one that imports it.
    from sklearn.naive_bayes import ComplementNB

    Y_train = dftrain['ACTIVE']
    X_train = dftrain.drop(columns=['INDEX', 'SMILES', 'ACTIVE'])

    clf = ComplementNB()
    # Search ComplementNB's own parameters; the previous grid listed
    # random-forest parameters and the search used an undefined `rfc`.
    param_grid = {'alpha': [0.01, 0.1, 0.5, 1.0, 2.0], 'norm': [False, True]}

    # Scaling to [0, 1] keeps the features non-negative for the classifier.
    norm_train, dummy = create_normalization(X_train, normalizationtype="minmax")

    CV_cnb = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='roc_auc', cv=5)
    CV_cnb.fit(norm_train, Y_train)
    print(CV_cnb.best_params_)
    return [1,2,3]

# Run the Naive Bayes search defined above (the cell used to call the
# random-forest search instead).
hyperparameterBayes(train_df_new)


In [None]:

from sklearn.naive_bayes import ComplementNB
def cnb(dftrain, dftest):
    """Train Complement Naive Bayes on discretized features and print its ROC AUC.

    Every column except 'ACTIVE' is used as a feature. The features are
    min-max normalized and discretized into 10 equal-width bins. Both the
    normalization and the bin edges are learned on the training data and
    then applied to the evaluation data.

    Args:
        dftrain: Training DataFrame with the target in column 'ACTIVE'.
        dftest: Evaluation DataFrame with the same columns.
    """
    Y_train = dftrain['ACTIVE']
    X_train = dftrain.drop(columns=['ACTIVE'])
    Y_test = dftest['ACTIVE']
    X_test = dftest.drop(columns=['ACTIVE'])

    # Learn the scaling on the training data and reuse it for evaluation.
    norm_train, normalization = create_normalization(X_train, normalizationtype="minmax")
    norm_test = apply_normalization(X_test, normalization)

    # Learn the bin edges on the training data, then discretize both sets
    # with those same edges so a bin index means the same thing in each.
    _, binning = create_bins(norm_train, nobins=10, bintype="equal-width")
    disc_train = apply_bins(norm_train, binning)
    disc_validation = apply_bins(norm_test, binning)

    clf = ComplementNB()
    clf.fit(disc_train, Y_train)

    # ROC AUC needs a ranking score, so use the predicted probability of the
    # second class in clf.classes_ (assumed to be the positive class).
    positive_scores = clf.predict_proba(disc_validation)[:, 1]
    print(metrics.roc_auc_score(Y_test, positive_scores))

cnb(fp_train, fp_validation)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Normalizer

def knn(dftrain, dftest):
    """Train a 2-nearest-neighbour model on the molecular features and print its ROC AUC.

    The 'INDEX', 'SMILES' and 'ACTIVE' columns are removed from the features.
    The min-max normalization is learned on the training features and then
    applied to the evaluation features, so distances are measured on the
    same scale in both sets.

    Args:
        dftrain: Training DataFrame with columns 'INDEX', 'SMILES', 'ACTIVE'.
        dftest: Evaluation DataFrame with the same columns.
    """
    Y_train = dftrain['ACTIVE']
    X_train = dftrain.drop(columns=['INDEX', 'SMILES', 'ACTIVE'])
    Y_test = dftest['ACTIVE']
    X_test = dftest.drop(columns=['INDEX', 'SMILES', 'ACTIVE'])

    # Nearest-neighbour distances depend on the feature scale, so the
    # evaluation data must be scaled with the training min/max.
    n_X_train, normalization = create_normalization(X_train, 'minmax')
    n_X_test = apply_normalization(X_test, normalization)

    neigh = KNeighborsClassifier(n_neighbors=2)
    neigh.fit(n_X_train, Y_train)

    # ROC AUC needs a ranking score, so use the predicted probability of the
    # second class in neigh.classes_ (assumed to be the positive class).
    positive_scores = neigh.predict_proba(n_X_test)[:, 1]
    print(metrics.roc_auc_score(Y_test, positive_scores))
knn(train_df_new, validation_df)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Normalizer

def knn(dftrain, dftest):
    """Train a 2-nearest-neighbour model on the fingerprints and print its ROC AUC.

    Every column except 'ACTIVE' is used as a feature. The min-max
    normalization is learned on the training features and then applied to
    the evaluation features, so distances are measured on the same scale in
    both sets.

    Args:
        dftrain: Training DataFrame with the target in column 'ACTIVE'.
        dftest: Evaluation DataFrame with the same columns.
    """
    Y_train = dftrain['ACTIVE']
    X_train = dftrain.drop(columns=['ACTIVE'])
    Y_test = dftest['ACTIVE']
    X_test = dftest.drop(columns=['ACTIVE'])

    # Nearest-neighbour distances depend on the feature scale, so the
    # evaluation data must be scaled with the training min/max.
    n_X_train, normalization = create_normalization(X_train, 'minmax')
    n_X_test = apply_normalization(X_test, normalization)

    neigh = KNeighborsClassifier(n_neighbors=2)
    neigh.fit(n_X_train, Y_train)

    # ROC AUC needs a ranking score, so use the predicted probability of the
    # second class in neigh.classes_ (assumed to be the positive class).
    positive_scores = neigh.predict_proba(n_X_test)[:, 1]
    print(metrics.roc_auc_score(Y_test, positive_scores))
knn(fp_train, fp_validation)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=d5b21b1a-ec26-41e0-8ae3-fe1bc5b86ef4' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>