In [77]:
import pandas as pd  
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [78]:
def PreProcessing():
    path = sys.argv[1] 
    data = pd.read_csv(path)
    data1 = data.replace('?',np.nan)
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp_mean.fit(data1.loc[:, data1.columns != 'class'])
    imp_mean.transform(data1.loc[:, data1.columns != 'class'])
    data1.loc[:, data1.columns != 'class'] = imp_mean.transform(data1.loc[:, data1.columns != 'class'])
    normalisation = MinMaxScaler()
    normalisation.fit(data1.loc[:, data1.columns != 'class'])
    normalisation.transform(data1.loc[:, data1.columns != 'class'])
    data1.loc[:, data1.columns != 'class'] = normalisation.transform(data1.loc[:, data1.columns != 'class'])
    data1 = data1.replace('class1', 0)
    data1 = data1.replace('class2', 1)
    data1 = data1.round(4)
    x = data1.to_string(header=False,index=False,index_names=False).split('\n')
    vals = [','.join(ele.split()) for ele in x]
    print(*vals, sep = "\n")



In [79]:
def logregClassifier(X, y):
    logreg = LogisticRegression()
    cvKFold=StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    scores = cross_val_score(logreg, X, y, cv=cvKFold)
    print("{:.4f}".format(scores.mean()))

In [83]:
def nbClassifier(X,y):
    nb = GaussianNB()
    cvKFold=StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    scores = cross_val_score(nb, X, y, cv=cvKFold)
    print("{:.4f}".format(scores.mean()))

In [84]:
def kNNClassifier(X, y, K):
    knn = KNeighborsClassifier(n_neighbors=K)
    cvKFold=StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    scores = cross_val_score(knn, X, y, cv=cvKFold)
    print("{:.4f}".format(scores.mean()))

In [85]:
def gbClassifier(X, y, n_estimators, learning_rate):
    gb_clf = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=0)
    cvKFold=StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    scores = cross_val_score(gb_clf, X, y, cv=cvKFold)
    print("{:.4f}".format(scores.mean()))

In [86]:
def adaDTClassifier(X, y, n_estimators, learning_rate, max_depth):
    ada_clf = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=max_depth,random_state=0,criterion='entropy'), n_estimators=n_estimators, learning_rate=learning_rate, random_state=0)
    cvKFold=StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    scores = cross_val_score(ada_clf, X, y, cv=cvKFold)
    print("{:.4f}".format(scores.mean()))

In [87]:
def bagDTClassifier(X, y, n_estimators, max_samples, max_depth):
    bag_clf = BaggingClassifier(
        DecisionTreeClassifier(max_depth=max_depth,random_state=0,criterion='entropy'), n_estimators=n_estimators,
        max_samples=max_samples, bootstrap=True, random_state=0)
    cvKFold=StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    scores = cross_val_score(bag_clf, X, y, cv=cvKFold)
    print("{:.4f}".format(scores.mean()))

In [88]:
def dtClassifier(X, y):
    tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
    cvKFold=StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    scores = cross_val_score(tree, X, y, cv=cvKFold)
    print("{:.4f}".format(scores.mean()))   

In [89]:
def bestLinClassifier(X,y):
    cvKFold=StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
                  'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
    grid_search = GridSearchCV(SVC(kernel="linear"), param_grid, cv=cvKFold,return_train_score=True)
    grid_search.fit(X_train, y_train)
    for i in grid_search.best_params_:
        print(grid_search.best_params_[i])
    print("{:.4f}".format(grid_search.best_score_))
    print("{:.4f}".format(grid_search.score(X_test, y_test)))

In [90]:
def bestRFClassifier(X,y):
    cvKFold=StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
    param_grid = {'n_estimators': [10,30],
                  'max_leaf_nodes': [4,16]} 
    grid_search = GridSearchCV(RandomForestClassifier(criterion='entropy',max_features='sqrt',random_state=0), param_grid, cv=cvKFold,return_train_score=True)
    grid_search.fit(X_train, y_train) 
    print(grid_search.best_params_['n_estimators'])
    print(grid_search.best_params_['max_leaf_nodes'])
    print("{:.4f}".format(grid_search.best_score_))
    print("{:.4f}".format(grid_search.score(X_test, y_test)))

In [91]:
algorithm = sys.argv[2]
def main():
    if algorithm == "P":
        PreProcessing()
    else:
        path = sys.argv[1] 
        data = pd.read_csv(path)
        X = data.loc[:, data.columns != 'class']
        y = data['class']
        if algorithm == "LR":
            logregClassifier(X,y)
            
        if algorithm == "NB":
            nbClassifier(X,y)
            
        if algorithm == "DT":
            dtClassifier(X,y) 
            
        if algorithm == "SVM":
            bestLinClassifier(X,y)
            
        if algorithm == "RF":
            bestRFClassifier(X,y)
            
        if algorithm == "NN":
            para = sys.argv[3]
            data1 = pd.read_csv(para, sep=',')
            K = data1['K'][0]
            kNNClassifier(X, y, K)
            
        if algorithm == "BAG":
            para = sys.argv[3]
            data1 = pd.read_csv(para, sep=',')  
            n_estimators = data1['n_estimators'][0]
            max_samples = data1['max_samples'][0]
            max_depth = data1['max_depth'][0]
            bagDTClassifier(X, y, n_estimators, max_samples, max_depth)
    
        if algorithm == "ADA":
            para = sys.argv[3]
            data1 = pd.read_csv(para, sep=',')  
            n_estimators = data1['n_estimators'][0]
            learning_rate = data1['learning_rate'][0]
            max_depth = data1['max_depth'][0]
            adaDTClassifier(X, y, n_estimators, learning_rate, max_depth)
            
        if algorithm == "GB":
            para = sys.argv[3]
            data1 = pd.read_csv(para, sep=',')  
            n_estimators = data1['n_estimators'][0]
            learning_rate = data1['learning_rate'][0]
            gbClassifier(X, y, n_estimators, learning_rate)


In [92]:
main()

FileNotFoundError: [Errno 2] No such file or directory: '-f'