In [1]:
import pandas as pd
import numpy as np

In [2]:
data_url = "http://www.ats.ucla.edu/stat/data/hsbdemo.dta"
df = pd.read_stata(data_url)
df[:5] 

Unnamed: 0,id,female,ses,schtyp,prog,read,write,math,science,socst,honors,awards,cid
0,45.0,female,low,public,vocation,34.0,35.0,41.0,29.0,26.0,not enrolled,0.0,1
1,108.0,male,middle,public,general,34.0,33.0,41.0,36.0,36.0,not enrolled,0.0,1
2,15.0,male,high,public,vocation,39.0,39.0,44.0,26.0,42.0,not enrolled,0.0,1
3,67.0,male,low,public,vocation,37.0,37.0,42.0,33.0,32.0,not enrolled,0.0,1
4,153.0,male,middle,public,vocation,39.0,31.0,40.0,39.0,51.0,not enrolled,0.0,1


In [3]:
df["awards"].unique()

array([ 0.,  1.,  2.,  3.,  5.,  4.,  7.])

In [4]:
df.pop("id")
df.pop("cid")
df[:5]

Unnamed: 0,female,ses,schtyp,prog,read,write,math,science,socst,honors,awards
0,female,low,public,vocation,34.0,35.0,41.0,29.0,26.0,not enrolled,0.0
1,male,middle,public,general,34.0,33.0,41.0,36.0,36.0,not enrolled,0.0
2,male,high,public,vocation,39.0,39.0,44.0,26.0,42.0,not enrolled,0.0
3,male,low,public,vocation,37.0,37.0,42.0,33.0,32.0,not enrolled,0.0
4,male,middle,public,vocation,39.0,31.0,40.0,39.0,51.0,not enrolled,0.0


In [5]:
df.dtypes

female     category
ses        category
schtyp     category
prog       category
read        float32
write       float32
math        float32
science     float32
socst       float32
honors     category
awards      float32
dtype: object

In [6]:
y_data = np.reshape(df.pop("prog").values, (-1.1))
y_data.shape

(200,)

In [7]:
x_data = pd.get_dummies(df).values
x_data[:5]

array([[ 34.,  35.,  41.,  29.,  26.,   0.,   0.,   1.,   1.,   0.,   0.,
          1.,   0.,   1.,   0.],
       [ 34.,  33.,  41.,  36.,  36.,   0.,   1.,   0.,   0.,   1.,   0.,
          1.,   0.,   1.,   0.],
       [ 39.,  39.,  44.,  26.,  42.,   0.,   1.,   0.,   0.,   0.,   1.,
          1.,   0.,   1.,   0.],
       [ 37.,  37.,  42.,  33.,  32.,   0.,   1.,   0.,   1.,   0.,   0.,
          1.,   0.,   1.,   0.],
       [ 39.,  31.,  40.,  39.,  51.,   0.,   1.,   0.,   0.,   1.,   0.,
          1.,   0.,   1.,   0.]])

In [8]:
from sklearn import preprocessing # Min-Max Standardzation

min_max_scaler = preprocessing.MinMaxScaler()
x_data[:, :6] = min_max_scaler.fit_transform(x_data[:, :6])

x_data[:5,:6]

array([[ 0.125     ,  0.11111111,  0.19047619,  0.0625    ,  0.        ,
         0.        ],
       [ 0.125     ,  0.05555556,  0.19047619,  0.20833333,  0.22222222,
         0.        ],
       [ 0.22916667,  0.22222222,  0.26190476,  0.        ,  0.35555556,
         0.        ],
       [ 0.1875    ,  0.16666667,  0.21428571,  0.14583333,  0.13333333,
         0.        ],
       [ 0.22916667,  0.        ,  0.16666667,  0.27083333,  0.55555556,
         0.        ]])

In [9]:
import numpy as np 

training_idx = np.random.randint(y_data.shape[0], size=int(y_data.shape[0] * 0.8))
test_idx = np.random.randint(y_data.shape[0], size=int(y_data.shape[0] * 0.2))

x_training, x_test = x_data[training_idx,:], x_data[test_idx,:]
y_training, y_test = y_data[training_idx], y_data[test_idx]

x_training.shape, x_test.shape

((160, 15), (40, 15))

In [10]:
from sklearn import linear_model, datasets

logreg = linear_model.LogisticRegression(multi_class='multinomial', fit_intercept=True, solver="lbfgs")
logreg.fit(x_training, y_training)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [11]:
sum(logreg.predict(x_test) == y_test)  / y_test.shape[0] 

0.65000000000000002

In [12]:
data_url = "http://www.ats.ucla.edu/stat/data/hsbdemo.dta"
df = pd.read_stata(data_url)
df.pop("id")
df.pop("cid")
df[:5]

Unnamed: 0,female,ses,schtyp,prog,read,write,math,science,socst,honors,awards
0,female,low,public,vocation,34.0,35.0,41.0,29.0,26.0,not enrolled,0.0
1,male,middle,public,general,34.0,33.0,41.0,36.0,36.0,not enrolled,0.0
2,male,high,public,vocation,39.0,39.0,44.0,26.0,42.0,not enrolled,0.0
3,male,low,public,vocation,37.0,37.0,42.0,33.0,32.0,not enrolled,0.0
4,male,middle,public,vocation,39.0,31.0,40.0,39.0,51.0,not enrolled,0.0


In [13]:
y_data = pd.get_dummies(df.ix[:,"prog"]).as_matrix()
y_data[:5,:]

array([[ 0.,  0.,  1.],
       [ 1.,  0.,  0.],
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.]])

In [14]:
x_data_df = df.drop(["prog"],axis=1)
x_data_df.insert(0, "bias", 1)
x_data_df.ix[:5,:]

Unnamed: 0,bias,female,ses,schtyp,read,write,math,science,socst,honors,awards
0,1,female,low,public,34.0,35.0,41.0,29.0,26.0,not enrolled,0.0
1,1,male,middle,public,34.0,33.0,41.0,36.0,36.0,not enrolled,0.0
2,1,male,high,public,39.0,39.0,44.0,26.0,42.0,not enrolled,0.0
3,1,male,low,public,37.0,37.0,42.0,33.0,32.0,not enrolled,0.0
4,1,male,middle,public,39.0,31.0,40.0,39.0,51.0,not enrolled,0.0
5,1,female,high,public,42.0,36.0,42.0,31.0,39.0,not enrolled,0.0


In [15]:
x_data = pd.get_dummies(x_data_df).as_matrix()
x_data[:5]

array([[  1.,  34.,  35.,  41.,  29.,  26.,   0.,   0.,   1.,   1.,   0.,
          0.,   1.,   0.,   1.,   0.],
       [  1.,  34.,  33.,  41.,  36.,  36.,   0.,   1.,   0.,   0.,   1.,
          0.,   1.,   0.,   1.,   0.],
       [  1.,  39.,  39.,  44.,  26.,  42.,   0.,   1.,   0.,   0.,   0.,
          1.,   1.,   0.,   1.,   0.],
       [  1.,  37.,  37.,  42.,  33.,  32.,   0.,   1.,   0.,   1.,   0.,
          0.,   1.,   0.,   1.,   0.],
       [  1.,  39.,  31.,  40.,  39.,  51.,   0.,   1.,   0.,   0.,   1.,
          0.,   1.,   0.,   1.,   0.]])

In [16]:
from sklearn import preprocessing # Min-Max Standardzation

min_max_scaler = preprocessing.MinMaxScaler()
x_data[:, 1:7] = min_max_scaler.fit_transform(x_data[:, 1:7])

x_data[:5,:7]

array([[ 1.        ,  0.125     ,  0.11111111,  0.19047619,  0.0625    ,
         0.        ,  0.        ],
       [ 1.        ,  0.125     ,  0.05555556,  0.19047619,  0.20833333,
         0.22222222,  0.        ],
       [ 1.        ,  0.22916667,  0.22222222,  0.26190476,  0.        ,
         0.35555556,  0.        ],
       [ 1.        ,  0.1875    ,  0.16666667,  0.21428571,  0.14583333,
         0.13333333,  0.        ],
       [ 1.        ,  0.22916667,  0.        ,  0.16666667,  0.27083333,
         0.55555556,  0.        ]])

In [17]:
training_idx = np.random.randint(y_data.shape[0], size=int(y_data.shape[0] * 0.8))
test_idx = np.random.randint(y_data.shape[0], size=int(y_data.shape[0] * 0.2))

x_training, x_test = x_data[training_idx,:], x_data[test_idx,:]
y_training, y_test = y_data[training_idx,:], y_data[test_idx,:]

x_training.shape, x_test.shape

((160, 16), (40, 16))

In [18]:
import tensorflow as tf

X = tf.placeholder("float", [None, len(x_data[0])])
Y = tf.placeholder("float", [None, len(y_data[0])])

W = tf.Variable(tf.zeros([len(x_data[0]), len(y_data[0])]))

hypothesis = tf.nn.softmax(tf.matmul(X,W))
learning_rate = 0.001

cost = tf.reduce_mean(-1 * tf.reduce_sum(Y * tf.log(hypothesis), reduction_indices=1))
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

init = tf.initialize_all_variables()

sess = tf.Session()
sess.run(init)
    
for i in range(400001):
    sess.run(optimizer, feed_dict={X: x_training, Y:y_training})
    if i % 40000 == 0:
        print ("%d 's iteration" % i)
        print (sess.run(cost, feed_dict={X: x_training, Y:y_training}))
finale_theta = sess.run(W)


0 's iteration
1.09828
40000 's iteration
0.820443
80000 's iteration
0.793121
120000 's iteration
0.780265
160000 's iteration
0.772745
200000 's iteration
0.767853
240000 's iteration
0.76446
280000 's iteration
0.761999
320000 's iteration
0.760157
360000 's iteration
0.758743
400000 's iteration
0.757631


In [19]:
hypotehsis_value = sess.run(hypothesis, feed_dict={X:x_test})
result= [np.argmax(predict) == np.argmax(original_value)  for predict, original_value in zip(hypotehsis_value, y_test)]
sum(result) / len(result)

0.82499999999999996