**Feature Engineering Task 1:**

In [17]:
import pandas as pd
import numpy as np

# Load the data from a CSV file
data = pd.read_csv("data.csv")
data.to_csv("editedData.csv")

# Iterate through each column
for column in data.columns:
    # Check if the column has any missing or zero values
    if data[column].isna().sum() + (data[column] == 0).sum() > 0:
        # Check if the column is categorical
        if data[column].dtype == 'object':
            # Impute missing and zero values with the most frequent value
            mode_value = data[column].mode()[0]
            data[column].fillna(mode_value, inplace=True)
            data[column] = data[column].replace(0, mode_value)
           # print("Column '{}' has been updated. Missing or zero values before: {}. Missing or zero values after: {}. Imputed with value: {}".format(column, data[column].isna().sum() + (data[column] == 0).sum(), data[column].isna().sum() + (data[column] == 0).sum(), mode_value))
        # Otherwise, assume it's a continuous numerical value
        else:
            # Impute missing and zero values with the mean value
            mean_value = data[column].replace(0, pd.np.nan).mean()
            data[column].fillna(mean_value, inplace=True)
            data[column] = data[column].replace(0, mean_value)
          #  print("Column '{}' has been updated. Missing or zero values before: {}. Missing or zero values after: {}. Imputed with value: {}".format(column, data[column].isna().sum() + (data[column] == 0).sum(), data[column].isna().sum() + (data[column] == 0).sum(), mean_value))

# Save the updated data to the original CSV file

data["diagnosis"] = (data["diagnosis"] =="M").astype(int)
data.to_csv("editedData.csv", index=False)

# Print the updated data
print(data["diagnosis"])


0      1
1      1
2      1
3      1
4      1
      ..
564    1
565    1
566    1
567    1
568    0
Name: diagnosis, Length: 569, dtype: int64


  mean_value = data[column].replace(0, pd.np.nan).mean()


**Feature Engineering Task 2:**

In [18]:
import pandas as pd
import numpy as np

# load the dataset as a Pandas DataFrame
df = pd.read_csv('editedData.csv')
# extract the numeric columns except the first two columns and convert to a Numpy array
numeric_cols = df.iloc[:, 2:].select_dtypes(include=[np.number]).columns
dataset = df[numeric_cols].values

# calculate the mean and standard deviation of each feature
mu = np.mean(dataset, axis=0)
sigma = np.std(dataset, axis=0)

# apply feature normalization
normalized_dataset = (dataset - mu) / sigma

# update the original DataFrame with the normalized values
df.loc[:, numeric_cols] = normalized_dataset

# save the normalized dataset back to the normData.csv file
df.to_csv('normData.csv', index=False)



**Part A - Perceptron Learning Algorithm:**


***Learning Task 1: ***

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
data = pd.read_csv('editedData.csv')
data = data.drop('id', axis=1)  # Drop the id column
train_data = data.sample(frac=0.67, random_state=1)  # Randomly select 67% of the data for training
test_data = data.drop(train_data.index)  # Use the remaining data for testing
class Perceptron:
    def __init__(self, input_size, lr=0.01, epochs=50):
        self.weights = np.zeros(input_size)
        self.lr = lr
        self.epochs = epochs

    def predict(self, x):
        z = np.dot(x, self.weights)
        return np.where(z > 0, 1, 0)

    def train(self, X, y):
        for epoch in range(self.epochs):
            for i in range(X.shape[0]):
                y_pred = self.predict(X[i])
                error = y[i] - y_pred
                self.weights += self.lr * error * X[i]
def evaluate(model, test_data):
    X_test = test_data.iloc[:, 1:].values
    y_test = test_data.iloc[:, 0].values
    y_pred = model.predict(X_test)
    accuracy = np.mean(y_pred == y_test)
    return accuracy

pm1 = Perceptron(input_size=train_data.shape[1]-1)
pm1.train(train_data.iloc[:, 1:].values, train_data.iloc[:, 0].values)
pm1_acc = evaluate(pm1, test_data)
print(f"PM1 accuracy: {pm1_acc}")

pm2 = Perceptron(input_size=train_data.shape[1]-1)
train_data = train_data.sample(frac=1)  # Shuffle the training data
pm2.train(train_data.iloc[:, 1:].values, train_data.iloc[:, 0].values)
pm2_acc = evaluate(pm2, test_data)
print(f"PM2 accuracy: {pm2_acc}")

PM1 accuracy: 0.9202127659574468
PM2 accuracy: 0.8936170212765957


Here we have define the perceptron algorithm
Then, we have define a function to evaluate the performance of the model on the test set:
Now, we can use the perceptron algorithm to train two models (PM1 and PM2) by changing the order of training examples:
We can observe that PM1 and PM2 have different accuracies. This is because the order of training examples affects the final weights of the model.





**Task 2: Building Perceptron Model PM3 on Normalized Data**

In [20]:
import pandas as pd
import numpy as np


class Perceptron:
    def __init__(self, input_size, lr=0.01, epochs=50):
        self.weights = np.zeros(input_size)
        self.lr = lr
        self.epochs = epochs

    def predict(self, x):
        z = np.dot(x, self.weights)
        return np.where(z > 0, 1, 0)

    def train(self, X, y):
        for epoch in range(self.epochs):
            for i in range(X.shape[0]):
                y_pred = self.predict(X[i])
                error = y[i] - y_pred
                self.weights += self.lr * error * X[i]
def evaluate(model, test_data):
    X_test = test_data.iloc[:, 1:].values
    y_test = test_data.iloc[:, 0].values
    y_pred = model.predict(X_test)
    accuracy = np.mean(y_pred == y_test)
    return accuracy

# Load data
data = pd.read_csv('normData.csv')
data = data.drop('id', axis=1)  # Drop the id column
train_data1 = data.sample(frac=0.67, random_state=1)
test_data1 = data.drop(train_data.index)

# Train PM3 on normalized data
pm3 = Perceptron(input_size=train_data1.shape[1]-1)
pm3.train(train_data1.iloc[:, 1:].values, train_data1.iloc[:, 0].values)
pm3_acc = evaluate(pm3,test_data1)
print(f"PM3 accuracy on normalized data: {pm3_acc}")


PM3 accuracy on normalized data: 0.973404255319149


Here, we used the normalised training data to train the PM3 model and the normalised testing data to assess its performance. Due to the normalised data, we can see that PM3 has a different accuracy than PM1 and PM2. As a result, we now know that normalised models provide more accuracy than unnormalized ones.



**Task 3: Building Perceptron Model PM4 on Randomly Permutated Features**

To build PM4, we will randomly permute the order of features in the dataset:

Here, we used the sample approach to shuffle the columns (features) of the training data, and we then trained the PM4 model using the shuffled training data. Only the features with weights greater than zero in the PM4 model were used to evaluate its performance using the testing data. This is due to the fact that the weights of the other features were set to zero at initialization and stayed that way throughout the training procedure. Due to the randomly permuted characteristics, we can see that PM4 has a different accuracy than PM1, PM2, and PM3.

In [21]:

import pandas as pd
import numpy as np

class Perceptron:
    def __init__(self, input_size, lr=0.01, epochs=50):
        self.weights = np.zeros(input_size)
        self.lr = lr
        self.epochs =   epochs

    def predict(self, x):
        z = np.dot(x, self.weights)
        return np.where(z > 0, 1, 0)

    def train(self, X, y):
        for epoch in range(self.epochs):
            for i in range(X.shape[0]):
                y_pred = self.predict(X[i])
                error = y[i] - y_pred
                self.weights += self.lr * error * X[i]
def evaluate(model, test_data):
    X_test = test_data.iloc[:, 1:].values
    y_test = test_data.iloc[:, 0].values
    y_pred = model.predict(X_test)
    accuracy = np.mean(y_pred == y_test)
    return accuracy

# Load data
data = pd.read_csv('editedData.csv')

# Drop id column
data = data.drop('id', axis=1)

train_data = data.sample(frac=0.67, random_state=1)
test_data = data.drop(train_data.index)


# Randomly permute feature order
np.random.seed(1)
perm = np.random.permutation(train_data.shape[1]-1) + 1
train_data_permuted = train_data.iloc[:, np.concatenate(([0], perm))]

# Train PM4 on permuted data
pm4 = Perceptron(input_size=train_data_permuted.shape[1]-1)
pm4.train(train_data_permuted.iloc[:, 1:].values, train_data_permuted.iloc[:, 0].values)


merged_test_data = pd.concat([test_data.iloc[:, 0], test_data.iloc[:, perm]], axis=1)
num_columns1 = data.shape[1]
num_columns2 = train_data.shape[1]
num_columns3 = test_data.shape[1]

# print("Number of columns in data :", num_columns1)
# print("Number of columns in train data :", num_columns2)
# print("Number of columns in test_data :", num_columns3)

num_columns = merged_test_data.shape[1]

#print("Number of columns in merged data :", num_columns)

merged_test_data.to_csv('test.csv', index=False)

pm4_acc = evaluate(pm4, merged_test_data)
print("PM4 Accuracy:", pm4_acc)



PM4 Accuracy: 0.9202127659574468


**Accuracy of 10 random samples for PM1**

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

class Perceptron:
    def __init__(self, input_size, lr=0.01, epochs=50):
        self.weights = np.zeros(input_size)
        self.lr = lr
        self.epochs = epochs

    def predict(self, x):
        z = np.dot(x, self.weights)
        return np.where(z > 0, 1, 0)

    def train(self, X, y):
        for epoch in range(self.epochs):
            for i in range(X.shape[0]):
                y_pred = self.predict(X[i])
                error = y[i] - y_pred
                self.weights += self.lr * error * X[i]
def evaluate(model, test_data):
    X_test = test_data.iloc[:, 1:].values
    y_test = test_data.iloc[:, 0].values
    y_pred = model.predict(X_test)
    accuracy = np.mean(y_pred == y_test)
    return accuracy



data = pd.read_csv('editedData.csv')
data = data.drop('id', axis=1)  # Drop the id column



accuracies = []
for i in range(10):
    train_data = data.sample(frac=0.67, random_state=np.random.randint(1,100))  # Randomly select 67% of the data for training
    test_data = data.drop(train_data.index)  # Use the remaining data for testing

    pm1 = Perceptron(input_size=train_data.shape[1]-1)
    pm1.train(train_data.iloc[:, 1:].values, train_data.iloc[:, 0].values)
    pm1_acc = evaluate(pm1, test_data)
    print(f"PM1 accuracy: {pm1_acc}")
    accuracies.append(pm1_acc)

print(f"Mean accuracy: {np.mean(accuracies):.8f}")
print(f"Standard deviation: {np.std(accuracies):.8f}")


PM1 accuracy: 0.8882978723404256
PM1 accuracy: 0.9414893617021277
PM1 accuracy: 0.9414893617021277
PM1 accuracy: 0.8617021276595744
PM1 accuracy: 0.925531914893617
PM1 accuracy: 0.9095744680851063
PM1 accuracy: 0.898936170212766
PM1 accuracy: 0.8829787234042553
PM1 accuracy: 0.8829787234042553
PM1 accuracy: 0.898936170212766
Mean accuracy: 0.90319149
Standard deviation: 0.02503958


**Accuracy of 10 random samples for PM3**

In [23]:
import pandas as pd
import numpy as np


class Perceptron:
    def __init__(self, input_size, lr=0.01, epochs=50):
        self.weights = np.zeros(input_size)
        self.lr = lr
        self.epochs = epochs

    def predict(self, x):
        z = np.dot(x, self.weights)
        return np.where(z > 0, 1, 0)

    def train(self, X, y):
        for epoch in range(self.epochs):
            for i in range(X.shape[0]):
                y_pred = self.predict(X[i])
                error = y[i] - y_pred
                self.weights += self.lr * error * X[i]
def evaluate(model, test_data):
    X_test = test_data.iloc[:, 1:].values
    y_test = test_data.iloc[:, 0].values
    y_pred = model.predict(X_test)
    accuracy = np.mean(y_pred == y_test)
    return accuracy

# Load data
data = pd.read_csv('normData.csv')
data = data.drop('id', axis=1)  # Drop the id column
accuracies = []
for i in range(10):
    train_data1 = data.sample(frac=0.67, random_state=np.random.randint(1,100))
    test_data1 = data.drop(train_data.index)

    # Train PM3 on normalized data
    pm3 = Perceptron(input_size=train_data1.shape[1]-1)
    pm3.train(train_data1.iloc[:, 1:].values, train_data1.iloc[:, 0].values)
    pm3_acc = evaluate(pm3,test_data1)
    print(pm3_acc)
    accuracies.append(pm3_acc)

print(f"Mean accuracy: {np.mean(accuracies):.8f}")
print(f"Standard deviation: {np.std(accuracies):.8f}")


0.9946808510638298
0.9893617021276596
0.9787234042553191
0.9893617021276596
0.9787234042553191
0.9787234042553191
0.9893617021276596
0.9893617021276596
0.9893617021276596
0.9787234042553191
Mean accuracy: 0.98563830
Standard deviation: 0.00585106


**Average accuracy for 10 random samples: PM4**

In [24]:

import pandas as pd
import numpy as np

# Load data
data = pd.read_csv('editedData.csv')

# Drop id column
data = data.drop('id', axis=1)
accuracies = []
for i in range(10):
    # data = data.sample(frac=1)
    train_data = data.sample(frac=0.67, random_state=np.random.randint(1,100))
    test_data = data.drop(train_data.index)
    # split the data into training and testing sets
  

    # Randomly permute feature order
    np.random.seed(1)
    perm = np.random.permutation(train_data.shape[1]-1) + 1
    train_data_permuted = train_data.iloc[:, np.concatenate(([0], perm))]

    # Train PM4 on permuted data
    pm4 = Perceptron(input_size=train_data_permuted.shape[1]-1)
    pm4.train(train_data_permuted.iloc[:, 1:].values, train_data_permuted.iloc[:, 0].values)


    merged_test_data = pd.concat([test_data.iloc[:, 0], test_data.iloc[:, perm]], axis=1)
    #merged_test_data = pd.concat([test_data.iloc[:, 0], test_data.iloc[:, np.concatenate(([0], perm))]], axis=1)


    pm4_acc = evaluate(pm4, merged_test_data)
    print(pm4_acc)
    accuracies.append(pm4_acc)

print(f"Mean accuracy: {np.mean(accuracies):.8f}")
print(f"Standard deviation: {np.std(accuracies):.8f}")



0.9042553191489362
0.8882978723404256
0.8882978723404256
0.8882978723404256
0.8882978723404256
0.8882978723404256
0.8882978723404256
0.8882978723404256
0.8882978723404256
0.8882978723404256
Mean accuracy: 0.88989362
Standard deviation: 0.00478723
