# Cluster Analysis, ANN, & Text Mining: Part 3 - ANN
### Author: Michael Berbach, Isaiah Samaniego

In [115]:
import pandas as pd
from keras import Sequential
from keras.layers import Dense
import numpy as np
from sklearn import preprocessing as prep
from sklearn.preprocessing import OneHotEncoder
import tensorflow as ff
from sklearn.metrics import accuracy_score, precision_score, classification_report, f1_score



We import and parse the Admissions dataset, and then we display it.

In [116]:
# loading data set from assignment 2
df = pd.read_csv('data\Admission_Predict_Ver1.1_small_data_set_for_Linear_Regression.csv', sep=r'\s*,\s*')
df = df.drop(columns=['Serial No.'])
print(df)


     GRE Score  TOEFL Score  University Rating  SOP  LOR  CGPA  Research  \
0          337          118                  4  4.5  4.5  9.65         1   
1          324          107                  4  4.0  4.5  8.87         1   
2          316          104                  3  3.0  3.5  8.00         1   
3          322          110                  3  3.5  2.5  8.67         1   
4          314          103                  2  2.0  3.0  8.21         0   
..         ...          ...                ...  ...  ...   ...       ...   
495        332          108                  5  4.5  4.0  9.02         1   
496        337          117                  5  5.0  5.0  9.87         1   
497        330          120                  5  4.5  5.0  9.56         1   
498        312          103                  4  4.0  5.0  8.43         0   
499        327          113                  4  4.5  4.5  9.04         0   

     Chance of Admit  
0               0.92  
1               0.76  
2               0.

  df = pd.read_csv('data\Admission_Predict_Ver1.1_small_data_set_for_Linear_Regression.csv', sep=r'\s*,\s*')


## Preprocessing ANN: Normalize Predictors and Binarize Targets
Using the min-max approach, we rescale to the range of [0,1]. This is done by subtracting the minimum value and dividing by the range.

In [117]:
df['GRE Score'] = ((df['GRE Score'] - df['GRE Score'].min()) / (df['GRE Score'].max() - df['GRE Score'].min())).astype(np.float32)
df['TOEFL Score'] = ((df['TOEFL Score'] - df['TOEFL Score'].min()) / (df['TOEFL Score'].max() - df['TOEFL Score'].min())).astype(np.float32)
df['University Rating'] = ((df['University Rating'] - df['University Rating'].min()) / (df['University Rating'].max() - df['University Rating'].min())).astype(np.float32)
df['SOP'] = ((df['SOP'] - df['SOP'].min()) / (df['SOP'].max() - df['SOP'].min())).astype(np.float32)
df['CGPA'] = ((df['CGPA'] - df['CGPA'].min()) / (df['CGPA'].max() - df['CGPA'].min())).astype(np.float32)
df['Research'] = ((df['Research'] - df['Research'].min()) / (df['Research'].max() - df['SOP'].min())).astype(int)
df['LOR'] = ((df['LOR'] - df['LOR'].min()) / (df['LOR'].max() - df['LOR'].min())).astype(np.float32)
df['Chance of Admit'] = (df['Chance of Admit'] > df['Chance of Admit'].median()).astype('int')
print(df)


     GRE Score  TOEFL Score  University Rating    SOP    LOR      CGPA  \
0         0.94     0.928571               0.75  0.875  0.875  0.913462   
1         0.68     0.535714               0.75  0.750  0.875  0.663462   
2         0.52     0.428571               0.50  0.500  0.625  0.384615   
3         0.64     0.642857               0.50  0.625  0.375  0.599359   
4         0.48     0.392857               0.25  0.250  0.500  0.451923   
..         ...          ...                ...    ...    ...       ...   
495       0.84     0.571429               1.00  0.875  0.750  0.711538   
496       0.94     0.892857               1.00  1.000  1.000  0.983974   
497       0.80     1.000000               1.00  0.875  1.000  0.884615   
498       0.44     0.392857               0.75  0.750  1.000  0.522436   
499       0.74     0.750000               0.75  0.875  0.875  0.717949   

     Research  Chance of Admit  
0           1                1  
1           1                1  
2           

We will now create a test set that will take 100 rows from our dataset.

In [118]:
index = 400
test = df[index:]
train = df[:index]
test.head(20)

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
400,0.28,0.285714,0.25,0.625,0.5,0.455128,0,0
401,0.5,0.464286,0.25,0.5,0.5,0.49359,0,0
402,0.68,0.607143,0.5,0.625,0.5,0.685897,1,1
403,0.8,0.857143,0.75,0.75,0.625,0.778846,1,1
404,0.42,0.321429,0.5,0.25,0.375,0.269231,1,0
405,0.24,0.25,0.5,0.375,0.5,0.208333,0,0
406,0.64,0.392857,0.75,0.5,0.375,0.391026,1,0
407,0.16,0.285714,0.5,0.375,0.75,0.36859,1,0
408,0.14,0.321429,0.5,0.25,0.75,0.278846,1,0
409,0.2,0.214286,0.0,0.25,0.375,0.391026,0,0


In [119]:
import collections
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, collections.abc.Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)
    
X, Y = to_xy(train,'Chance of Admit')
testX, testY = to_xy(test, 'Chance of Admit')

In [120]:
print(X.shape)
print(Y.shape)
Y

(400, 7)
(400, 2)


array([[0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.

In [121]:
model = ff.keras.Sequential()
model.add(Dense(12, input_dim = X.shape[1], activation='relu'))
model.add(Dense(6, activation='relu'))
model.add(Dense(2, activation='softmax'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [122]:
model.compile(loss='categorical_crossentropy',optimizer='adam')
model.fit(X,Y,verbose=2, epochs=100)

Epoch 1/100
13/13 - 1s - 51ms/step - loss: 0.7166
Epoch 2/100
13/13 - 0s - 2ms/step - loss: 0.6957
Epoch 3/100
13/13 - 0s - 2ms/step - loss: 0.6803
Epoch 4/100
13/13 - 0s - 2ms/step - loss: 0.6706
Epoch 5/100
13/13 - 0s - 2ms/step - loss: 0.6626
Epoch 6/100
13/13 - 0s - 2ms/step - loss: 0.6527
Epoch 7/100
13/13 - 0s - 2ms/step - loss: 0.6411
Epoch 8/100
13/13 - 0s - 2ms/step - loss: 0.6279
Epoch 9/100
13/13 - 0s - 2ms/step - loss: 0.6133
Epoch 10/100
13/13 - 0s - 2ms/step - loss: 0.5987
Epoch 11/100
13/13 - 0s - 2ms/step - loss: 0.5829
Epoch 12/100
13/13 - 0s - 2ms/step - loss: 0.5659
Epoch 13/100
13/13 - 0s - 2ms/step - loss: 0.5480
Epoch 14/100
13/13 - 0s - 2ms/step - loss: 0.5315
Epoch 15/100
13/13 - 0s - 2ms/step - loss: 0.5146
Epoch 16/100
13/13 - 0s - 2ms/step - loss: 0.4997
Epoch 17/100
13/13 - 0s - 2ms/step - loss: 0.4850
Epoch 18/100
13/13 - 0s - 2ms/step - loss: 0.4715
Epoch 19/100
13/13 - 0s - 2ms/step - loss: 0.4595
Epoch 20/100
13/13 - 0s - 2ms/step - loss: 0.4480
Epoch 21

<keras.src.callbacks.history.History at 0x2003f151950>

In [123]:
pred = model.predict(testX)
print(pred[0])

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[0.9360205  0.06397951]


In [124]:
pred = np.argmax(pred, axis=1)

In [125]:
true = np.argmax(testY, axis=1)

In [126]:
print("Predicted: ", [pred])
print("True: ", [true])

Predicted:  [array([0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], dtype=int64)]
True:  [array([0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], dtype=int64)]


We generate accuracy from the ANN and classification report, we can see that our predicted values are above average.

In [127]:
print('Accuracy on test data is %.2f' % (accuracy_score(true, pred)))

Accuracy on test data is 0.90


In [128]:
print(classification_report(true,pred))

              precision    recall  f1-score   support

           0       0.91      0.91      0.91        56
           1       0.89      0.89      0.89        44

    accuracy                           0.90       100
   macro avg       0.90      0.90      0.90       100
weighted avg       0.90      0.90      0.90       100

