## For  20NG TF as an autoencoder with a desired hidden layer size (try K=20, 100, 200). Verify the obtained reencoding of data (the new feature representation) in several ways: 

In [1]:
import numpy as np
from sklearn import datasets
import mnist 
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder
import random
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import tree
import pandas as pd
from sklearn.model_selection import train_test_split

# Loading the input data

In [2]:
#training data 

path = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"

data = pd.read_csv(path, header = None)

#check out the summary 
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
count,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,...,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0
mean,0.104553,0.213015,0.280656,0.065425,0.312223,0.095901,0.114208,0.105295,0.090067,0.239413,...,0.038575,0.13903,0.016976,0.269071,0.075811,0.044238,5.191515,52.172789,283.289285,0.394045
std,0.305358,1.290575,0.504143,1.395151,0.672513,0.273824,0.391441,0.401071,0.278616,0.644755,...,0.243471,0.270355,0.109394,0.815672,0.245882,0.429342,31.729449,194.89131,606.347851,0.488698
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.588,6.0,35.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.065,0.0,0.0,0.0,0.0,2.276,15.0,95.0,0.0
75%,0.0,0.0,0.42,0.0,0.38,0.0,0.0,0.0,0.0,0.16,...,0.0,0.188,0.0,0.315,0.052,0.0,3.706,43.0,266.0,1.0
max,4.54,14.28,5.1,42.81,10.0,5.88,7.27,11.11,5.26,18.18,...,4.385,9.752,4.081,32.478,6.003,19.829,1102.5,9989.0,15841.0,1.0


In [3]:
#splitting 20% data into test and 80 % into train 
X_train, X_test, Y_train, Y_test = train_test_split(data.iloc[:,0:57], data.iloc[:,57], test_size=0.2, random_state=42)

In [4]:
X_train.shape, Y_train.shape, X_test.shape,Y_test.shape

((3680, 57), (3680,), (921, 57), (921,))

## Bulding Model  

In [5]:
#Building the Decision Tree Model
clf = tree.DecisionTreeClassifier()
clf.fit(X_train,Y_train)

#Accuracy on test data
print("Train Accuracy using Decision Trees:",clf.score(X_train,Y_train))

#Accuracy on test data
print("Test Accuracy using Decision Trees:",clf.score(X_test,Y_test))

Train Accuracy using Decision Trees: 0.9994565217391305
Test Accuracy using Decision Trees: 0.9120521172638436


## Input & Output Placeholders

## K = 20

In [6]:
inputs = 57
hidden_layer1_units = 20
output_layer_units = inputs

In [7]:
X = tf.placeholder(tf.float32, shape = [None, inputs]) #flattened shape

### Softmax Activation
$\Pr{(Y=j | X)}  = \frac{e^{(X.W_j^T + B_j)}}{\sum\limits_{i = 1}^n {e^{(X.W_i^T + B_i)}}} $

In [8]:
tf.set_random_seed(42)
W = {
    'encoder_h1': tf.Variable(tf.random_normal([inputs, hidden_layer1_units])),
    'decoder_h1':tf.Variable(tf.random_normal([hidden_layer1_units, output_layer_units]))
}

B = {
    'encoder_h1':tf.Variable(tf.random_normal([hidden_layer1_units])),
    'decoder_h1':tf.Variable(tf.random_normal([output_layer_units]))
}

In [9]:
encoder = tf.matmul(X,W['encoder_h1']) +  B['encoder_h1']
encoder = tf.sigmoid(encoder) # using softmax activation
decoder = tf.matmul(encoder,W['decoder_h1'])  + B['decoder_h1']
decoder = tf.nn.softmax(decoder) # using softmax activation

## Cost function: Cross Entropy
## Optimizer : Adam 

In [10]:
epochs = 10
batch_size = 100
learning_rate = 0.01

In [11]:
Y = X
loss = tf.reduce_mean(tf.pow(decoder - Y, 2))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

## Evaluating

In [12]:
def get_batches(batch_size,x):
        indexes = list(range(x.shape[0]))
        random.shuffle(indexes)
        ind = indexes[:batch_size]
        return(x[ind])

In [13]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(epochs):
        train_c = 0                                                       #cost
        total_batch = int(X_train.shape[0]/batch_size)
        for i in range(total_batch):
            batch_x = get_batches(batch_size, np.array(X_train))
            _,train_c = sess.run([optimizer, loss], feed_dict = {X: batch_x})
            train_c += train_c/batch_size    
        if epoch % 5 == 0:
            print("Epoch:",epoch+1,"\t Average Train Eror:",train_c)  
            test_c = sess.run(loss,feed_dict = {X:np.array(X_test)})
            print("\n Test Error", test_c)    
    encoder_train, decoder_train = sess.run([encoder,decoder],feed_dict = {X:np.array(X_train)})
    encoder_test, decoder_test = sess.run([encoder,decoder],feed_dict = {X:np.array(X_test)})       

Epoch: 1 	 Average Train Eror: 6208.04390625

 Test Error 11645.852
Epoch: 6 	 Average Train Eror: 5480.756123046875

 Test Error 11645.351


In [14]:
 encoder_train.shape, decoder_train.shape, encoder_test.shape, decoder_test.shape

((3680, 20), (3680, 57), (921, 20), (921, 57))

## Building Model with reduced dimensions k = 20

In [15]:
#Building the Decision Tree Model
clf = tree.DecisionTreeClassifier()
clf.fit(encoder_train,Y_train)

#Accuracy on test data
print("Train Accuracy using Decision Trees:",clf.score(encoder_train,Y_train))

#Accuracy on test data
print("Test Accuracy using Decision Trees:",clf.score(encoder_test,Y_test))

Train Accuracy using Decision Trees: 0.8195652173913044
Test Accuracy using Decision Trees: 0.7014115092290988


## k = 30

In [16]:
inputs = 57
hidden_layer1_units = 30
output_layer_units = inputs
epochs = 10
batch_size = 100
learning_rate = 0.01

X = tf.placeholder(tf.float32, shape = [None, inputs]) #flattened shape

tf.set_random_seed(42)
W = {
    'encoder_h1': tf.Variable(tf.random_normal([inputs, hidden_layer1_units])),
    'decoder_h1':tf.Variable(tf.random_normal([hidden_layer1_units, output_layer_units]))
}

B = {
    'encoder_h1':tf.Variable(tf.random_normal([hidden_layer1_units])),
    'decoder_h1':tf.Variable(tf.random_normal([output_layer_units]))
}


encoder = tf.matmul(X,W['encoder_h1']) +  B['encoder_h1']
encoder = tf.sigmoid(encoder) # using softmax activation
decoder = tf.matmul(encoder,W['decoder_h1'])  + B['decoder_h1']
decoder = tf.nn.softmax(decoder) # using softmax activation

Y = X
loss = tf.reduce_mean(tf.pow(decoder - Y, 2))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

def get_batches(batch_size,x):
        indexes = list(range(x.shape[0]))
        random.shuffle(indexes)
        ind = indexes[:batch_size]
        return(x[ind])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(epochs):
        train_c = 0                                                       #cost
        total_batch = int(X_train.shape[0]/batch_size)
        for i in range(total_batch):
            batch_x = get_batches(batch_size, np.array(X_train))
            _,train_c = sess.run([optimizer, loss], feed_dict = {X: batch_x})
            train_c += train_c/batch_size    
        if epoch % 5 == 0:
            print("Epoch:",epoch+1,"\t Average Train Eror:",train_c)  
            test_c = sess.run(loss,feed_dict = {X:np.array(X_test)})
            print("\n Test Error", test_c)    
    encoder_train_30, decoder_train_30 = sess.run([encoder,decoder],feed_dict = {X:np.array(X_train)})
    encoder_test_30, decoder_test_30 = sess.run([encoder,decoder],feed_dict = {X:np.array(X_test)})    

Epoch: 1 	 Average Train Eror: 2875.4818359375

 Test Error 11654.148
Epoch: 6 	 Average Train Eror: 4919.331743164063

 Test Error 11645.346


In [17]:
encoder_train_30.shape, decoder_train_30.shape,encoder_test_30.shape, decoder_test_30.shape

((3680, 30), (3680, 57), (921, 30), (921, 57))

## Building Model with reduced dimensions k = 30

In [18]:
#Building the Decision Tree Model
clf = tree.DecisionTreeClassifier()
clf.fit(encoder_train_30,Y_train)

#Accuracy on test data
print("Train Accuracy using Decision Trees:",clf.score(encoder_train_30,Y_train))

#Accuracy on test data
print("Test Accuracy using Decision Trees:",clf.score(encoder_test_30,Y_test))

Train Accuracy using Decision Trees: 0.9695652173913043
Test Accuracy using Decision Trees: 0.7654723127035831


## K = 40 

In [19]:
inputs = 57
hidden_layer1_units = 40
output_layer_units = inputs
epochs = 10
batch_size = 100
learning_rate = 0.01

X = tf.placeholder(tf.float32, shape = [None, inputs]) #flattened shape

tf.set_random_seed(42)
W = {
    'encoder_h1': tf.Variable(tf.random_normal([inputs, hidden_layer1_units])),
    'decoder_h1':tf.Variable(tf.random_normal([hidden_layer1_units, output_layer_units]))
}

B = {
    'encoder_h1':tf.Variable(tf.random_normal([hidden_layer1_units])),
    'decoder_h1':tf.Variable(tf.random_normal([output_layer_units]))
}


encoder = tf.matmul(X,W['encoder_h1']) +  B['encoder_h1']
encoder = tf.sigmoid(encoder) # using softmax activation
decoder = tf.matmul(encoder,W['decoder_h1'])  + B['decoder_h1']
decoder = tf.nn.softmax(decoder) # using softmax activation

Y = X
loss = tf.reduce_mean(tf.pow(decoder - Y, 2))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

def get_batches(batch_size,x):
        indexes = list(range(x.shape[0]))
        random.shuffle(indexes)
        ind = indexes[:batch_size]
        return(x[ind])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(epochs):
        train_c = 0                                                       #cost
        total_batch = int(X_train.shape[0]/batch_size)
        for i in range(total_batch):
            batch_x = get_batches(batch_size, np.array(X_train))
            _,train_c = sess.run([optimizer, loss], feed_dict = {X: batch_x})
            train_c += train_c/batch_size    
        if epoch % 5 == 0:
            print("Epoch:",epoch+1,"\t Average Train Eror:",train_c)  
            test_c = sess.run(loss,feed_dict = {X:np.array(X_test)})
            print("\n Test Error", test_c)    
    encoder_train_40, decoder_train_40 = sess.run([encoder,decoder],feed_dict = {X:np.array(X_train)})
    encoder_test_40, decoder_test_40 = sess.run([encoder,decoder],feed_dict = {X:np.array(X_test)})    

Epoch: 1 	 Average Train Eror: 19716.76826171875

 Test Error 11654.139
Epoch: 6 	 Average Train Eror: 32150.761875

 Test Error 11645.333


In [20]:
encoder_train_40.shape, decoder_train_40.shape

((3680, 40), (3680, 57))

## Building Model with reduced dimensions k = 40

In [21]:
#Building the Decision Tree Model
clf = tree.DecisionTreeClassifier()
clf.fit(encoder_train_40,Y_train)

#Accuracy on test data
print("Train Accuracy using Decision Trees:",clf.score(encoder_train_40,Y_train))

#Accuracy on test data
print("Test Accuracy using Decision Trees:",clf.score(encoder_test_40,Y_test))

Train Accuracy using Decision Trees: 0.9741847826086957
Test Accuracy using Decision Trees: 0.7785016286644951
