## Imports

In [None]:
import warnings
# Silence ALL warnings for the rest of the session, not only FutureWarning.
# NOTE(review): this also hides ConvergenceWarning / FitFailedWarning raised by the
# grid searches below, so failed or non-converged fits go unnoticed.
warnings.filterwarnings('ignore')

In [None]:
#import modules
import sklearn as sk
import seaborn as sns
import numpy as np
import keras_tuner as kt
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import r2_score, accuracy_score
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from tensorflow import keras
from sklearn.decomposition import PCA

# REGRESSION

## Data handling

In [None]:
# Load the regression dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("winequality-red.csv")

In [None]:
# Rescale every ROW to unit L2 norm (sklearn.preprocessing.normalize defaults to
# norm='l2', axis=1) and rebuild a DataFrame with the original column names.
# NOTE(review): the target column `quality` is still part of `data` here, so it is
# rescaled together with the features of its own row — confirm this is intended;
# a per-column scaler applied to the features only is the more usual choice.
d = preprocessing.normalize(data)
df = pd.DataFrame(d, columns=data.columns)
df.head()

In [None]:
# Separate the dependent variable (target) from the independent variables (features)
# Feature matrix (every column except the target) and target vector, as NumPy arrays.
data_input = df.drop(['quality'], axis=1).values
data_output = df['quality'].values

# Hold out 30% of the rows for testing. The regression cells below read
# x_train / x_test / y_train / y_test, but no cell ever created them, so a fresh
# "Restart & Run All" stopped with NameError. test_size and random_state match
# the split used in the MLP section of this notebook.
x_train, x_test, y_train, y_test = train_test_split(
    data_input, data_output, test_size=0.3, random_state=1
)

## Linear Regression

In [None]:
# Fit an ordinary least-squares model on the training split.
ml=LinearRegression()
ml.fit(x_train,y_train)

In [None]:
# Predict the target for the rows in x_test.
y_pred=ml.predict(x_test)

In [None]:
# Grid-search LinearRegression options with 10-fold CV, scored by negative MSE
# (values closer to 0 are better).
gs_LinearRegression = GridSearchCV(LinearRegression(),{
    'fit_intercept': [False,True], # scikit-learn default is True
    'n_jobs': [0,1,10,50], # NOTE(review): n_jobs only controls parallelism, so it should not change the score — confirm it belongs in the grid
}, cv=10, return_train_score=False, scoring = 'neg_mean_squared_error')
gs_LinearRegression.fit(x_train,y_train)

In [None]:
# Tabulate the mean cross-validated score for every parameter combination.
dfgs = pd.DataFrame(gs_LinearRegression.cv_results_)
dfgs[['param_fit_intercept','param_n_jobs', 'mean_test_score']]
# Author's finding: fit_intercept=True gives the better score (the score is negative MSE, not accuracy).

## Support Vector Machines

In [None]:
# Evaluate SVR hyper-parameters (10-fold CV, scored by negative MSE).
# `Xs_train` was never assigned anywhere in this notebook; build it here as a
# standardised copy of the regression training features (presumably what the
# "Xs" prefix stands for — confirm with the author).
Xs_train = StandardScaler().fit_transform(x_train)
gs_SVM = GridSearchCV(SVR(),{
    'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'], #default rbf
    'degree' : [2,3], #default = 3, only applies to kernel: poly
}, cv=10, return_train_score=False, scoring = 'neg_mean_squared_error')
gs_SVM.fit(Xs_train,y_train)

## Decision Trees

In [None]:
# Evaluate DecisionTreeRegressor split criterion and splitter strategy
# (10-fold CV, scored by negative MSE).
gs_DTR = GridSearchCV(DecisionTreeRegressor(),{
    'criterion' : ['squared_error','friedman_mse','absolute_error','poisson'],
    'splitter' : ['best','random'],
}, cv=10, return_train_score=False, scoring = 'neg_mean_squared_error')
# Fit on the regression features `x_train` used by the other regression cells.
# The original `X_train` (capital X) is the classification split, which is only
# created later in the notebook: on a fresh kernel it is undefined, and with a
# stale kernel it silently pairs the wrong features with this target.
gs_DTR.fit(x_train,y_train)

## Multi-Layer Perceptron Neural Network

In [None]:
# MLP regressor with ReLU activations and a generous iteration cap; the bare
# `mlp` on the last line displays the estimator's configuration.
mlp = MLPRegressor(max_iter=10000, activation='relu')
mlp

In [None]:
# Create the split before its first use: the original only assigned
# x_train2 / x_test2 / y_train2 / y_test2 further down, in the grid-search cell,
# so this cell raised NameError on a fresh kernel. Same arguments as that later
# split, so both produce identical partitions.
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    data_input, data_output, test_size=0.3, random_state=1
)
mlp.fit(x_train2, y_train2)

In [None]:
# Predict the target for the rows in x_test2.
y_pred2 = mlp.predict(x_test2)

In [None]:
# Grid-search MLPRegressor over activation function and solver
# (5-fold CV, scored by negative MSE).
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    data_input, data_output, test_size=0.3, random_state=1
)
mlp_param_grid = {
    'activation': ['identity', 'logistic', 'tanh', 'relu'],  # default = relu
    'solver': ['lbfgs', 'sgd', 'adam'],  # default = adam
}
gs_MLP = GridSearchCV(
    MLPRegressor(),
    mlp_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=False,
)
gs_MLP.fit(x_train2, y_train2)

In [None]:
# Second pass: fix the solver to lbfgs and vary its function-evaluation budget
# (5-fold CV, scored by negative MSE).
lbfgs_param_grid = {
    'solver': ['lbfgs'],
    'max_fun': [15000, 20000, 10000, 25000],  # default is 15000; only applies to lbfgs
}
gs_MLP2 = GridSearchCV(
    MLPRegressor(),
    lbfgs_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=False,
)
gs_MLP2.fit(x_train2, y_train2)

# CLASSIFICATION

## Logistic Regression

In [None]:
def fit_and_report(model, X_fit, X_eval, y_fit, y_eval):
    """Fit `model` and print its classification report and accuracy.

    Args:
        model: estimator exposing fit / predict / score.
        X_fit, y_fit: data the model is fitted on.
        X_eval, y_eval: data the report and the accuracy are computed on.

    Returns:
        (predictions, accuracy) for the evaluation data.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    print(classification_report(y_eval, predictions))
    accuracy = model.score(X_eval, y_eval)
    print(accuracy)
    return predictions, accuracy


# Split X and y into training and test sets; the test set is 30% of the data.
# NOTE(review): `dfcopy` is not created in the visible part of this notebook —
# confirm the cell that loads and encodes the classification data runs before this one.
X = dfcopy.drop(columns = ['Sum'])
y = dfcopy['Sum']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y
)

# Baseline: newton-cg solver on all features.
logreg = LogisticRegression(solver = 'newton-cg')
y_pred, classifier_score = fit_and_report(logreg, X_train, X_test, y_train, y_test)

# Same estimator on a reduced feature set.
df_reduced = dfcopy.drop(columns = ['Capital-gain', 'Capital-loss', 'Education', 'Native-country', 'Race'])
X1 = df_reduced.drop(columns = ['Sum'])
y1 = df_reduced['Sum']
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.3, random_state=1, stratify=y1
)

# The original printed this report against `y_test` although the predictions
# come from X1_test; report against the matching `y1_test` instead.
y_pred, classifier_score = fit_and_report(logreg, X1_train, X1_test, y1_train, y1_test)

# Try a different solver on all features ...
logreg2 = LogisticRegression(solver = "liblinear")
y_pred, classifier_score = fit_and_report(logreg2, X_train, X_test, y_train, y_test)

# ... and on the reduced feature set.
y_pred, classifier_score = fit_and_report(logreg2, X1_train, X1_test, y1_train, y1_test)

# Model with just the 6 best features (k=6); the selector sits inside the
# pipeline so it is re-fitted on every cross-validation fold.
clf_fs_cv = Pipeline([
 ('feature_selector', SelectKBest(f_classif, k=6)),
 ('logreg', LogisticRegression(solver='liblinear'))
])
scores = cross_val_score(clf_fs_cv, X, y, cv=5) # 5 folds.
print(scores)
# std / sqrt(n_folds) is the standard error of the mean, so label it as such
# (the original message called it the standard deviation).
avg = (100 * np.mean(scores), 100 * np.std(scores)/np.sqrt(scores.shape[0]))
print("Average score and standard error: (%.2f +- %.3f)%%" %avg)


In [None]:
# Compare regularisation penalties (5-fold CV, accuracy).
# The default 'lbfgs' solver supports only 'l2' and no penalty, so the 'l1' and
# 'elasticnet' candidates could not be fitted and came back without a usable
# score (unnoticed, because warnings are suppressed). 'saga' supports every
# penalty; 'elasticnet' additionally needs an explicit l1_ratio.
# NOTE(review): recent scikit-learn releases spell the no-penalty option as
# None instead of 'none' — adjust to the installed version.
gs_LogisticReg = GridSearchCV(LogisticRegression(solver='saga'), [
    {'penalty': ['none', 'l2', 'l1']},  # default = l2
    {'penalty': ['elasticnet'], 'l1_ratio': [0.5]},
], cv=5, return_train_score=False, scoring = 'accuracy')
gs_LogisticReg.fit(X1_train,y1_train)

In [None]:
# Mean cross-validated accuracy per penalty.
dfgs = pd.DataFrame(gs_LogisticReg.cv_results_)
dfgs[['param_penalty', 'mean_test_score']]

In [None]:
# Compare solvers with and without an intercept term (5-fold CV, accuracy).
# Reuses the name gs_LogisticReg, replacing the previous search object.
gs_LogisticReg = GridSearchCV(LogisticRegression(),{
    'solver' : ['newton-cg','lbfgs','liblinear','sag','saga'],
    'fit_intercept' : [True,False], # default = True
}, cv=5, return_train_score=False, scoring = 'accuracy')
gs_LogisticReg.fit(X1_train,y1_train)

In [None]:
# Mean cross-validated accuracy per (fit_intercept, solver) combination.
dfgs = pd.DataFrame(gs_LogisticReg.cv_results_)
dfgs[['param_fit_intercept','param_solver', 'mean_test_score']]

In [None]:
# With the solver fixed to newton-cg, compare the multi_class strategies
# (5-fold CV, accuracy). Reuses the name gs_LogisticReg again.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm the installed version still accepts it.
gs_LogisticReg = GridSearchCV(LogisticRegression(),{
    'solver' : ['newton-cg'],
    # fit_intercept is left at its default (True)
    'multi_class' : ['auto','ovr','multinomial']
}, cv=5, return_train_score=False, scoring = 'accuracy')
gs_LogisticReg.fit(X1_train,y1_train)

In [None]:
# Mean cross-validated accuracy per multi_class strategy.
dfgs = pd.DataFrame(gs_LogisticReg.cv_results_)
dfgs[['param_multi_class', 'mean_test_score']]

## Support Vector Machines

In [None]:
# First try an SVM on all features. No kernel is passed, so SVC uses its
# default ('rbf'); `degree` is only used by the 'poly' kernel.
svclassifier = SVC(C=1.0,degree=3, gamma='auto', probability=True)
svclassifier.fit(X_train, y_train)

In [None]:
# Predict labels for the full-feature test split.
y_pred = svclassifier.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the full-feature test split.
print(classification_report(y_test, y_pred))

In [None]:
# Mean accuracy on the full-feature test split.
classifier_score = svclassifier.score(X_test, y_test)
print(f"Classifier Score is {classifier_score}")

In [None]:
# Same SVM settings (rbf kernel stated explicitly) trained on the reduced
# feature set; this rebinds `svclassifier`, so the full-feature model is discarded.
svclassifier = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', probability=True)
svclassifier.fit(X1_train, y1_train)

In [None]:
# Predict labels for the reduced-feature test split.
y1_pred = svclassifier.predict(X1_test)

In [None]:
# Per-class precision / recall / F1 on the reduced-feature test split.
print(classification_report(y1_test, y1_pred))

In [None]:
# Author's observation: compared with the full-feature model, precision for
# class 0 (<=50k) has increased, while precision for class 1 (>50k) has decreased.
classifier_score1 = svclassifier.score(X1_test, y1_test)
print(f"Classifier Score is {classifier_score1}")

In [None]:
# 5-fold cross-validated accuracy of a default SVC on the reduced-feature
# training split; the result dict (timings and per-fold scores) is printed as-is.
svc_cv_results = cross_validate(SVC(), X1_train, y1_train, scoring='accuracy', cv=5)
print(svc_cv_results)


## Decision Trees (Random Forest)

In [None]:
# Baseline tree ensemble: a random forest with depth capped at 10 and a fixed
# seed. `fit` returns the estimator itself, so `dtt` and `clf` are the same object.
clf = RandomForestClassifier(random_state=1, max_depth=10)
dtt = clf.fit(X_train, y_train)
predictions = dtt.predict(X_test)
# Accuracy on the held-out test split.
print(accuracy_score(y_test, predictions))



In [None]:
# Grid-search the random forest's split criterion and per-split feature budget
# (5-fold CV, accuracy).
rfc_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
}
gs_RFC = GridSearchCV(
    RandomForestClassifier(),
    rfc_param_grid,
    scoring='accuracy',
    cv=5,
    return_train_score=False,
)
gs_RFC.fit(X_train, y_train)

In [None]:
# Mean cross-validated accuracy per (criterion, max_features) combination.
dfgs = pd.DataFrame(gs_RFC.cv_results_)
dfgs[['param_criterion','param_max_features','mean_test_score']]

## Multi-Layer Perceptron Neural Network

In [None]:
# MLP classifier with a single hidden layer of 12 units, very light L2
# regularisation (alpha) and a fixed seed, fitted on the full-feature training split.
mlpc = MLPClassifier(max_iter=2500, alpha=0.00001, hidden_layer_sizes=12, random_state=3)
mlpc.fit(X_train, y_train)

In [None]:
# Grid-search the MLP classifier's activation function (5-fold CV, accuracy).
mlpc_param_grid = {
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
}
gs_MLPC = GridSearchCV(
    MLPClassifier(),
    mlpc_param_grid,
    scoring='accuracy',
    cv=5,
    return_train_score=False,
)
gs_MLPC.fit(X_train, y_train)

In [None]:
# Mean cross-validated accuracy per activation function.
dfgs = pd.DataFrame(gs_MLPC.cv_results_)
dfgs[['param_activation','mean_test_score']]

In [None]:
# Number of leading principal components fed to the network.
N_PCA_FEATURES = 5

features = list(X)
pca = PCA(n_components=10)
Xs_pca = pca.fit_transform(X)
Xs_pca = Xs_pca[:, 0:N_PCA_FEATURES]  # retain the first 5 PCs

# The network below takes N_PCA_FEATURES inputs, but the original cell trained
# and evaluated it on the raw X_train / X_test, so the input shapes did not
# match and the PCA features were never used. Project the existing splits
# instead, which keeps the rows aligned with y_train / y_test.
# NOTE(review): the PCA is fitted on all rows of X, test rows included — fit it
# on X_train only if that leakage matters for the reported score.
Xs_pca_train = pca.transform(X_train)[:, 0:N_PCA_FEATURES]
Xs_pca_test = pca.transform(X_test)[:, 0:N_PCA_FEATURES]
y_train_nn = np.asarray(y_train)
y_test_nn = np.asarray(y_test)

# Width of the softmax layer. The sparse categorical loss needs integer labels
# in [0, N_CLASSES), so the largest label + 1 is sufficient; the original
# hard-coded 10 output units regardless of the data.
N_CLASSES = int(np.max(y)) + 1


def build_model(hp):
    """Build and compile one candidate network for Keras Tuner.

    Args:
        hp: keras_tuner HyperParameters object used to sample the search space.

    Returns:
        A compiled keras.Sequential classifier taking N_PCA_FEATURES inputs and
        emitting N_CLASSES softmax probabilities.

    Search space:
        num_layers    2..6; the loop starts at 1, so num_layers - 1 tuned blocks are added
        units_<i>     32..512 in steps of 32, per tuned Dense layer
        dropout_<i>   0..0.3 in steps of 0.1, after each tuned Dense layer
        learning_rate one of 1e-2, 1e-3, 1e-4 for Adam
    """
    model = keras.Sequential()
    model.add(keras.layers.Dense(5, input_dim=N_PCA_FEATURES, activation="relu"))

    for i in range(1, hp.Int("num_layers", 2, 6)):
        model.add(
            keras.layers.Dense(
                units=hp.Int("units_" + str(i), min_value=32, max_value=512, step=32),
                activation="relu")
            )

        # Tune dropout layer with values from 0 - 0.3 with stepsize of 0.1.
        model.add(keras.layers.Dropout(hp.Float("dropout_" + str(i), 0, 0.3, step=0.1)))

    # Output layer: one unit per class.
    model.add(keras.layers.Dense(units=N_CLASSES, activation="softmax"))

    # Tune learning rate for Adam optimizer with values from 0.01, 0.001, or 0.0001
    hp_learning_rate = hp.Choice("learning_rate", values=[1e-2, 1e-3, 1e-4])

    # Define optimizer, loss, and metrics. The summary is no longer printed
    # here: build_model runs once per trial and flooded the cell output.
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  loss=keras.losses.SparseCategoricalCrossentropy(),
                  metrics=["accuracy"])
    return model


# Stop a run once validation loss has not improved for 5 consecutive epochs.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# NOTE(review): results are cached under my_dir/intro_to_kt; delete that folder
# (or pass overwrite=True — confirm against the installed keras_tuner) to force
# a fresh search after changing the model or the data.
tuner = kt.Hyperband(build_model,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory='my_dir',
                     project_name='intro_to_kt')

tuner.search(Xs_pca_train, y_train_nn, epochs=5, validation_split=0.2,
             callbacks=[stop_early], verbose=2)

# Rebuild the best configuration, show its architecture once, and train it longer.
best_hps=tuner.get_best_hyperparameters()[0]
h_model = tuner.hypermodel.build(best_hps)
h_model.summary()
h_model.fit(Xs_pca_train, y_train_nn, epochs=25, validation_split=0.2,
            callbacks=[stop_early], verbose=2)

# Loss and accuracy on the held-out test rows.
h_model.evaluate(Xs_pca_test, y_test_nn, return_dict=True)