In [146]:
#we have accommodations to extend this assignment

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import time

In [147]:
#min-max normalisation of the feature columns of a 2d dataset
def norm(dataset):
    """Scale every column except the last one to the range [0, 1], in place.

    The last column is left untouched (the callers in this notebook keep the
    class label there).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose leading columns are numeric. It is modified in place.

    Returns
    -------
    None
    """
    #walk the columns by position so the function does not depend on the
    #column labels happening to be the integers 0..n-1
    for position in range(dataset.shape[1] - 1):
        label = dataset.columns[position]
        column = dataset.iloc[:, position]

        #min and max of the current column define the scaling range
        min_val = column.min()
        max_val = column.max()
        span = max_val - min_val

        if span == 0:
            #a constant column carries no information; map it to 0 instead of
            #dividing 0 by 0 and filling the column with NaN
            scaled = column * 0.0
        else:
            scaled = (column - min_val) / span

        #assign by label so pandas may change the column dtype (int -> float)
        dataset[label] = scaled

In [148]:
#hard-threshold prediction from a weight vector (w) and an input vector (x)
def predict(w, x):
    """Return 1 when the dot product of `w` and `x` is strictly positive, else 0."""
    #weighted sum of the inputs
    activation = np.dot(w, x)
    #step activation: a sum of exactly 0 is classified as 0
    return 1 if activation > 0 else 0

# get accuracy, confusion matrix, rates and optimized weight vector

def train(dataset, split, error_threshold, alpha, limit=5000):
    """Train a perceptron on a balanced sample of `dataset`, then evaluate and plot it.

    The first two columns of `dataset` are used as input features and the third
    column (position 2) as the 0/1 class label. A constant bias input of 1 is
    appended to every pattern, so the weight vector is ``[w0, w1, bias]`` and the
    decision boundary is the line ``w0*x + w1*y + bias = 0``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data the model is trained and tested on. Its index must be unique,
        because the test set is built by dropping the training index.
    split : float
        Fraction of the smaller class that is sampled from *each* class for
        training. All remaining rows form the test set.
    error_threshold : float
        Training stops once the number of misclassified training patterns in
        an epoch is no longer greater than this value.
    alpha : float
        Learning rate of the perceptron update rule.
    limit : int, optional
        Maximum number of training epochs (default 5000).

    Returns
    -------
    numpy.ndarray
        The trained weight vector ``[w0, w1, bias]``.

    Side effects: prints the training error, the test accuracy and a confusion
    matrix, and shows a scatter plot with the decision boundary.
    """

    def inputs_and_targets(frame):
        #feature columns followed by a constant 1 that acts as the bias input;
        #the label column is returned separately and is never fed to the neuron
        inputs = np.ones((len(frame), 3))
        inputs[:, :2] = frame.iloc[:, :2].values
        targets = frame.iloc[:, 2].values
        return inputs, targets

    def percent(count, total):
        #share of `count` in `total` as a percentage; 0 when the class is empty
        return count / total * 100 if total else 0.0

    #splitting into training and testing data (same number of rows per class)
    labels = dataset.iloc[:, 2]
    group0 = dataset[labels == 0]
    group1 = dataset[labels == 1]
    sample_size = int(min(len(group0), len(group1)) * split)
    sampled_subset0 = group0.sample(n = sample_size, random_state = 42)
    sampled_subset1 = group1.sample(n = sample_size, random_state = 42)
    training = pd.concat([sampled_subset0, sampled_subset1])
    testing = dataset.drop(training.index)

    #convert once to arrays instead of calling .iloc[row] on every pass
    train_x, train_y = inputs_and_targets(training)
    test_x, test_y = inputs_and_targets(testing)

    #initialization: one random weight per input (two features + bias)
    patterns = train_x.shape[0]
    w = np.array([random.uniform(-0.5, 0.5) for _ in range(train_x.shape[1])])

    #training
    i = 0
    error = patterns
    #repeat until the epoch limit is reached or the model is accurate enough
    while i < limit and error > error_threshold:

        #one epoch of online perceptron updates
        for x, desired in zip(train_x, train_y):
            w = w + alpha * (desired - predict(w, x)) * x

        #total error of this epoch: sum[(out - desired)^2], which for 0/1
        #outputs is the number of patterns the updated weights misclassify.
        #Every pattern is re-predicted here with the current weights.
        error = 0
        for x, desired in zip(train_x, train_y):
            if predict(w, x) != desired:
                error += 1

        i += 1

    print(f"Total error in training is: {error}")

    #testing
    true_positive = false_positive = true_negative = false_negative = 0
    for x, true in zip(test_x, test_y):
        predicted = predict(w, x)

        if predicted == 1 and true == 1:
            true_positive += 1
        elif predicted == 1 and true == 0:
            false_positive += 1
        elif predicted == 0 and true == 1:
            false_negative += 1
        else:
            true_negative += 1

    tested = test_x.shape[0]
    accuracy = (true_positive + true_negative) / tested if tested else 0.0

    #rates are relative to the number of rows of the *true* class, so each
    #column of the printed table adds up to 100%
    actual_yes = true_positive + false_negative
    actual_no = true_negative + false_positive
    true_positive_rate = percent(true_positive, actual_yes)
    false_negative_rate = percent(false_negative, actual_yes)
    true_negative_rate = percent(true_negative, actual_no)
    false_positive_rate = percent(false_positive, actual_no)

    print(f"accuracy: {accuracy}")
    print("\t\tTrue yes\t\tTrue no")
    print(f"Predicted yes\t{true_positive} ({true_positive_rate}%)\t\t{false_positive} ({false_positive_rate}%)")
    print(f"Predicted no\t{false_negative} ({false_negative_rate}%)\t\t{true_negative} ({true_negative_rate}%)")

    #plotting: training points as dots, testing points as crosses
    plt.figure()
    plt.scatter(training.iloc[:, 0], training.iloc[:, 1], c=training.iloc[:, 2], cmap='viridis', label='Training Data')
    plt.scatter(testing.iloc[:, 0], testing.iloc[:, 1], c=testing.iloc[:, 2], cmap='viridis', marker='x', label='Testing Data')

    #decision boundary w0*x + w1*y + bias = 0 over the normalised range [0, 1]
    x = np.linspace(0, 1, 400)
    if w[1] != 0:
        y = -(w[0]/w[1]) * x - (w[2]/w[1])
        plt.plot(x, y, '-r', label='Decision Boundary')
    elif w[0] != 0:
        #w1 == 0 means the boundary is a vertical line
        plt.axvline(-w[2] / w[0], color='r', label='Decision Boundary')

    plt.legend(loc='best')
    plt.show()

    return w

In [149]:
#read the datasets (the files have no header row, so columns are labelled 0, 1, 2, ...)
a = pd.read_csv("groupA.csv", header = None)
b = pd.read_csv("groupB.csv", header = None)
c = pd.read_csv("groupC.csv", header = None)



#convert to numeric
#each frame is converted over its *own* columns, one whole column at a time;
#values that cannot be parsed become NaN (errors = 'coerce')
a = a.apply(pd.to_numeric, errors = 'coerce')
b = b.apply(pd.to_numeric, errors = 'coerce')
c = c.apply(pd.to_numeric, errors = 'coerce')



#normalize the data (norm works in place and leaves the last column untouched)
norm(a)
norm(b)
norm(c)

In [150]:
# #hard activation
# print("HARD ACTIVATION")
# print("---------------------------------------")
# print("\nGroupA")
# print(train(a, 3/4, 0.00001, 0.1))
# print(train(a, 1/4, 0.00001, 0.1))

# print("\nGroupB")
# print(train(b, 3/4, 40, 0.1))
# print(train(b, 1/4, 40, 0.1))

# print("\nGroupC")
# print(train(c, 3/4, 700, 0.1)) #fix the train method
# print(train(c, 1/4, 700, 0.1))

In [151]:
#soft activation
# NOTE(review): the heading printed below says "soft activation", but predict()
# in this notebook applies a hard 0/1 threshold - confirm which one is intended.
print("SOFT ACTIVATION")
print("---------------------------------------")

#(heading, dataset, error threshold) for every group; each group is trained
#twice, first on a 75% split and then on a 25% split, with learning rate 0.1
experiments = (
    ("\nGroupA", a, 0.00001),
    ("\nGroupB", b, 40),
    ("\nGroupC", c, 700),
)
for heading, group_data, threshold in experiments:
    print(heading)
    for fraction in (3/4, 1/4):
        print(train(group_data, fraction, threshold, 0.1))



SOFT ACTIVATION
---------------------------------------

GroupA
