In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
origindata = pd.read_csv("wine.data", header = None)
origindata.columns = ['Class','Alcohol','Malic acid','Ash','Alcalinity of ash','Magnesium','Total phenols','Flavanoids',
                            'Nonflavanoid phenols','Proanthocyanins','Color intensity','Hue','OD280/OD315 of diluted wines','Proline']

y = origindata['Class']
X = origindata.drop(columns=['Class'])

In [3]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.2, random_state=1234)

# build the model

In [4]:
# multinomialNB model
from sklearn.naive_bayes import MultinomialNB
mNB = MultinomialNB()
mNB.fit(X_train, y_train)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
y_train_mNBpre = mNB.predict(X_train)
y_test_mNBpre = mNB.predict(X_test)
print('training error of MultinomialNB model: ',mean_squared_error(y_train,y_train_mNBpre))
print('test error of MultinomialNB model: ',mean_squared_error(y_test,y_test_mNBpre))

training error of MultinomialNB model:  0.11267605633802817
test error of MultinomialNB model:  0.25


In [5]:
# GaussianNB model
from sklearn.naive_bayes import GaussianNB
GNB = GaussianNB()
GNB.fit(X_train, y_train)
GaussianNB(priors=None, var_smoothing=1e-09)
y_train_GNBpre = GNB.predict(X_train)
y_test_GNBpre = GNB.predict(X_test)
print('training error of GaussianNB model: ',mean_squared_error(y_train,y_train_GNBpre))
print('test error of GaussianNB model: ',mean_squared_error(y_test,y_test_GNBpre))

training error of GaussianNB model:  0.0
test error of GaussianNB model:  0.1111111111111111


In [6]:
# GaussianNB(log) model
GNBlog = GaussianNB()
GNBlog.fit(np.log(X_train), y_train)
GaussianNB(priors=None, var_smoothing=1e-09)
y_train_GNBlogpre = GNBlog.predict(np.log(X_train))
y_test_GNBlogpre = GNBlog.predict(np.log(X_test))
print('training error of GaussianNB(log) model: ',mean_squared_error(y_train,y_train_GNBlogpre))
print('test error of GaussianNB(log) model: ',mean_squared_error(y_test,y_test_GNBlogpre))

training error of GaussianNB(log) model:  0.007042253521126761
test error of GaussianNB(log) model:  0.027777777777777776


In [7]:
#2.b.4 feature importance
#method 1
from sklearn import linear_model
regress = linear_model.Lasso(alpha=0.1)
regress.fit(X, y)
fea_import1 = pd.DataFrame(np.abs(regress.coef_))
m1 = fea_import1[0].idxmax()
print('the most useful feature: ',X.columns.values.tolist()[m1])

the most useful feature:  Flavanoids


In [8]:
#method 2
col = ['Alcohol','Malic acid','Ash','Alcalinity of ash','Magnesium','Total phenols','Flavanoids',
                            'Nonflavanoid phenols','Proanthocyanins','Color intensity','Hue','OD280/OD315 of diluted wines','Proline']

f_m = list()
for i in col:
    inp = pd.DataFrame(X)
    inp = inp.drop(columns=[i])
    mNB.fit(inp, y)
    f_m.append(mean_squared_error(y, mNB.predict(inp)) / mean_squared_error(y_test,y_test_mNBpre))
f_m_ind = f_m.index(max(f_m))
print('The most important feature in MultinomialNB model is: ',col[f_m_ind])

The most important feature in MultinomialNB model is:  Flavanoids


In [9]:
f_g = list()
for i in col:
    inp = pd.DataFrame(X)
    inp = inp.drop(columns=[i])
    GNB.fit(inp, y)
    f_g.append(mean_squared_error(y, GNB.predict(inp)) / mean_squared_error(y_test,y_test_GNBpre))
f_g_ind = f_g.index(max(f_g))
print('The most important feature in GaussianNB model is: ',col[f_g_ind])

The most important feature in GaussianNB model is:  Proline


In [10]:
f_glog = list()
for i in col:
    inp = pd.DataFrame(X)
    inp = inp.drop(columns=[i])
    GNBlog.fit(inp, y)
    f_glog.append(mean_squared_error(y, GNBlog.predict(inp)) / mean_squared_error(y_test,y_test_GNBlogpre))
f_glog_ind = f_glog.index(max(f_glog))
print('The most important feature in GaussianNB(log) model is: ',col[f_glog_ind])

The most important feature in GaussianNB(log) model is:  Proline


# neural network model

In [11]:
# 2.c neural network and tune hyper-parameter

alpha_test = np.logspace(1e-2,10,100)
train_errors1 = list()
test_errors1 = list()

# solver = 'lbfgs'
from sklearn.neural_network import MLPClassifier
for a in alpha_test:
    neu_net1 = MLPClassifier(solver='lbfgs', alpha=a, hidden_layer_sizes=(15,), random_state=1)
    neu_net1.fit(X_train, y_train)
    MLPClassifier(activation='relu', batch_size='auto',
              beta_1=0.9, beta_2=0.9, early_stopping=False,
              epsilon=1e-08, hidden_layer_sizes=(15,),
              learning_rate='constant', learning_rate_init=0.001,
              max_iter=200, momentum=0.9, n_iter_no_change=10,
              nesterovs_momentum=True, power_t=0.5,  random_state=1,
              shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)
    y_train_net = neu_net1.predict(X_train)
    y_test_net = neu_net1.predict(X_test)
    train_errors1.append(mean_squared_error(y_train,y_train_net))
    test_errors1.append(mean_squared_error(y_test,y_test_net))
min_te1 = min(test_errors1)
min_te1_ind = test_errors1.index(min(test_errors1))
alpha1 = alpha_test[min_te1_ind]

In [12]:
print(alpha1, min_te1)

17708.501728257103 0.5277777777777778


In [None]:
#solver = 'adam'
train_errors2 = list()
test_errors2 = list()
for a in alpha_test:
    neu_net2 = MLPClassifier(solver='adam', alpha=a, hidden_layer_sizes=(15,), random_state=1)
    neu_net2.fit(X_train, y_train)
    MLPClassifier(activation='relu', batch_size='auto',
              beta_1=0.9, beta_2=0.9, early_stopping=False,
              epsilon=1e-08, hidden_layer_sizes=(15,),
              learning_rate='constant', learning_rate_init=0.001,
              max_iter=200, momentum=0.9, n_iter_no_change=10,
              nesterovs_momentum=True, power_t=0.5,  random_state=1,
              shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)
    y_train_net = neu_net2.predict(X_train)
    y_test_net = neu_net2.predict(X_test)
    train_errors2.append(mean_squared_error(y_train,y_train_net))
    test_errors2.append(mean_squared_error(y_test,y_test_net))
min_te2 = min(test_errors2)
min_te2_ind = test_errors2.index(min(test_errors2))
alpha2 = alpha_test[min_te2_ind]

In [14]:
print(alpha2, min_te2)

228129.69412584268 1.25


In [None]:
#solver = 'sgd'
train_errors3 = list()
test_errors3 = list()
for a in alpha_test:
    neu_net3 = MLPClassifier(solver='sgd', alpha=a, hidden_layer_sizes=(15,), random_state=1)
    neu_net3.fit(X_train, y_train)
    MLPClassifier(activation='relu', batch_size='auto',
              beta_1=0.9, beta_2=0.9, early_stopping=False,
              epsilon=1e-08, hidden_layer_sizes=(15,),
              learning_rate='constant', learning_rate_init=0.001,
              max_iter=200, momentum=0.9, n_iter_no_change=10,
              nesterovs_momentum=True, power_t=0.5,  random_state=1,
              shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)
    y_train_net = neu_net3.predict(X_train)
    y_test_net = neu_net3.predict(X_test)
    train_errors3.append(mean_squared_error(y_train,y_train_net))
    test_errors3.append(mean_squared_error(y_test,y_test_net))
min_te3 = min(test_errors3)
min_te3_ind = test_errors3.index(min(test_errors3))
alpha3 = alpha_test[min_te3_ind]

In [16]:
print(alpha3, min_te3)

1.023292992280754 1.4722222222222223


In [17]:
# errors with the best alpha and solver
neu_net = MLPClassifier(solver='lbfgs', alpha=alpha1, hidden_layer_sizes=(15,), random_state=1)
neu_net.fit(X_train, y_train)
MLPClassifier(activation='relu', batch_size='auto',
              beta_1=0.9, beta_2=0.9, early_stopping=False,
              epsilon=1e-08, hidden_layer_sizes=(15,),
              learning_rate='constant', learning_rate_init=0.001,
              max_iter=200, momentum=0.9, n_iter_no_change=10,
              nesterovs_momentum=True, power_t=0.5,  random_state=1,
              shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)
y_train_net = neu_net.predict(X_train)
y_test_net = neu_net.predict(X_test)
print('training error of neural network model: ',mean_squared_error(y_train,y_train_net))
print('test error of neural network model: ',mean_squared_error(y_test,y_test_net))

training error of neural network model:  0.6197183098591549
test error of neural network model:  0.5277777777777778


In [18]:
# feature importance
f_n = list()
for i in col:
    inp = pd.DataFrame(X)
    inp = inp.drop(columns=[i])
    neu_net.fit(inp, y)
    f_n.append(mean_squared_error(y, neu_net.predict(inp)) / mean_squared_error(y_test,y_test_net))
f_n_ind = f_n.index(max(f_n))
print('The most important feature in neural network model is: ',col[f_n_ind])

The most important feature in neural network model is:  OD280/OD315 of diluted wines
