In [1]:
import numpy as np
import tensorflow as tf
from sklearn import linear_model, datasets
from sklearn.preprocessing import normalize
import urllib.request as url
import pandas

In [2]:
# https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.names

data_source = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
names = ['class', 'alcohol', 'malic acid','ash','alcalinity of ash','magnesium','total phenols', 'flavanoids','nonflavanoid phenols', 'proanthocyanins', 'color intensity', 'hue', 'diluted wines', 'proline']

df = pandas.read_csv(data_source, names=names)
df[:5]

Unnamed: 0,class,alcohol,malic acid,ash,alcalinity of ash,magnesium,total phenols,flavanoids,nonflavanoid phenols,proanthocyanins,color intensity,hue,diluted wines,proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [3]:
# class만 따로 y_data에 할당

y_data = df["class"].values.reshape(-1,1)

In [4]:
# y_data를 제외한 모든 변수를 normalize

df_norm = (df.ix[:,1:] - df.ix[:,1:].mean()) / (df.ix[:,1:].max() - df.ix[:,1:].min())
df_norm[:5]

Unnamed: 0,alcohol,malic acid,ash,alcalinity of ash,magnesium,total phenols,flavanoids,nonflavanoid phenols,proanthocyanins,color intensity,hue,diluted wines,proline
0,0.323522,-0.123784,0.033948,-0.20077,0.296287,0.174099,0.217454,-0.154441,0.220537,0.049651,0.067114,0.479236,0.226895
1,0.052469,-0.10995,-0.121132,-0.427574,0.002809,0.122375,0.154163,-0.192177,-0.098075,-0.057857,0.075244,0.28876,0.216196
2,0.041943,0.004674,0.16229,-0.046131,0.013679,0.174099,0.255428,-0.116706,0.384574,0.053064,0.058984,0.204511,0.312487
3,0.360364,-0.076353,0.071381,-0.138915,0.144113,0.536168,0.308171,-0.229913,0.185836,0.233951,-0.079227,0.307075,0.522901
4,0.062995,0.050129,0.269242,0.07758,0.198461,0.174099,0.139395,0.053106,0.072272,-0.062977,0.067114,0.116599,-0.008483


In [5]:
# 정규화가 된 데이터를 x_data에 할당

x_data = df_norm[:].values
x_data

array([[ 0.32352158, -0.12378425,  0.03394821, ...,  0.06711428,
         0.47923612,  0.22689497],
       [ 0.05246895, -0.10995026, -0.12113201, ...,  0.07524436,
         0.28875993,  0.21619596],
       [ 0.04194264,  0.00467425,  0.16229045, ...,  0.0589842 ,
         0.20451084,  0.31248698],
       ..., 
       [ 0.07089001,  0.38412089, -0.05696088, ..., -0.29873938,
        -0.38523274,  0.06284361],
       [ 0.04457422,  0.05012879,  0.00186264, ..., -0.2906093 ,
        -0.36325472,  0.06640994],
       [ 0.2972058 ,  0.34854776,  0.19972361, ..., -0.28247922,
        -0.37058073, -0.13330475]])

In [6]:
# 데이터를 랜덤하게 표본으로 추출함

training_idx = np.random.randint(y_data.shape[0], size=int(y_data.shape[0] * 0.8))
test_idx = np.random.randint(y_data.shape[0], size=int(y_data.shape[0] * 0.2))

x_training, x_test = x_data[training_idx,:], x_data[test_idx,:]
y_training, y_test = y_data[training_idx,:], y_data[test_idx,:]

In [7]:
X = tf.placeholder(tf.float32)
Y = tf.placeholder(tf.float32)

W = tf.Variable(tf.random_uniform([len(x_data[0]), 1], -1.0, -1.0))

In [8]:
h = tf.matmul(X, W)
hypothesis = tf.div(1., 1. + tf.exp(-h))
cost = -1 * tf.reduce_mean(Y * tf.log(hypothesis) + (1-Y) * tf.log(1-hypothesis))

a = tf.Variable(0.1)
optimizer = tf.train.GradientDescentOptimizer(a)
train = optimizer.minimize(cost)

init = tf.initialize_all_variables()

In [9]:
sess = tf.Session()
sess.run(init)


# cost_history = []

for i in range(1000):
    sess.run(train, feed_dict={X: x_training, Y:y_training})
    if i % 50 == 0:
        print (i, sess.run(cost, feed_dict={X: x_training, Y:y_training}))

0 0.419245
50 0.101851
100 -0.126435
150 -0.306444
200 -0.458633
250 -0.593634
300 -0.717373
350 -0.833375
400 -0.943847
450 -1.05024
500 -1.15355
550 -1.25446
600 -1.35348
650 -1.45099
700 -1.54724
750 -1.64248
800 -1.73683
850 -1.83062
900 -1.92349
950 -2.01601


In [10]:
from sklearn import linear_model, datasets

logreg = linear_model.LogisticRegression(fit_intercept=False)
logreg.fit(x_training, y_training.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
sum(logreg.predict(x_test).reshape(-1,1) == y_test)  / y_test.shape[0] 

array([ 0.91428571])

In [12]:
prediction_result = []
for _ in range(100):
    training_idx = np.random.randint(y_data.shape[0], size=int(y_data.shape[0] * 0.8))
    test_idx = np.random.randint(y_data.shape[0], size=int(y_data.shape[0] * 0.2))

    x_training, x_test = x_data[training_idx,:], x_data[test_idx,:]
    y_training, y_test = y_data[training_idx,:], y_data[test_idx,:]
    
    logreg.fit(x_training, y_training.ravel())
    prediction_result.append(sum(logreg.predict(x_test) == y_test.ravel())  / y_test.shape[0])
    
np.mean(prediction_result)

0.96857142857142842

In [13]:
max, min = np.mean(prediction_result) + 1.96 * (np.std(prediction_result) /10), \
            np.mean(prediction_result) - 1.96 * (np.std(prediction_result) /10)
min, max 

(0.96267146246983093, 0.9744713946730259)