# Training Data

In [1]:
import numpy as np
File_data = np.genfromtxt("1_2.txt", dtype=str,usecols=(1,2,3))
row = (line.replace('(','').replace(')','') for line in File_data if line)

In [2]:
list1 = []
with open('1_2.txt','r') as in_file:
    stripped = (line.strip() for line in in_file)
    row = (line.replace('(','').replace(')','') for line in stripped if line)
    lines = (line.split(",") for line in row if line)
    for i in lines:
        list1.append(i)

In [3]:
a = np.array(list1)
rows = ["{},{},{},{}".format(i, j, k,l) for i, j, k, l in a]
text = "\n".join(rows)

with open('train.csv', 'w') as f:
    f.write(text)

In [4]:
train = []
with open('train.csv', 'r') as file:
    for line in file:
        string = line.strip()
        words = string.split(',')
        train.append(words)
train

[['1.6530190426733', ' 72.871146648479', ' 24', ' W'],
 ['1.6471384909498', ' 72.612785314988', ' 34', ' W'],
 ['1.6472055785348', ' 73.53968351051', ' 33', ' M'],
 ['1.7323008914951', ' 76.067870338779', ' 30', ' M'],
 ['1.6750702657911', ' 81.05582111533', ' 30', ' M'],
 ['1.5780970716644', ' 64.926084680188', ' 30', ' W'],
 ['1.6587629355524', ' 69.38092449041', ' 30', ' M'],
 ['1.6763295980234', ' 77.062295990149', ' 31', ' M'],
 ['1.7187224085504', ' 62.112923317057', ' 37', ' W'],
 ['1.5202218226439', ' 66.151444019603', ' 27', ' W'],
 ['1.5552689261884', ' 66.076386143769', ' 31', ' W'],
 ['1.6969333189258', ' 77.45386244568', ' 34', ' M'],
 ['1.6887980792886', ' 76.489640732464', ' 37', ' M'],
 ['1.5213552893624', ' 63.952944947832', ' 35', ' W']]

In [5]:
train[0]

['1.6530190426733', ' 72.871146648479', ' 24', ' W']

# Testing Data

In [6]:
File_data1 = np.genfromtxt("1_2_Test.txt", dtype=str,usecols=(1,2,3))
row = (line.replace('(','').replace(')','') for line in File_data1 if line)

In [7]:
list2 = []
with open('1_2_Test.txt','r') as in_file:
    stripped = (line.strip() for line in in_file)
    row = (line.replace('(','').replace(')','') for line in stripped if line)
    lines = (line.split(",") for line in row if line)
    for i in lines:
        list2.append(i)

In [8]:
b = np.array(list2)
rows = ["{},{},{}".format(i, j, k) for i, j, k in b]
text = "\n".join(rows)
 
with open('test.csv', 'w') as f:
    f.write(text)

In [9]:
with open('test.csv', 'r') as file:
    test = []
    for line in file:
        string = line.strip()
        words = string.split(',')
        test.append(words)
test

[['1.62065758929', ' 59.376557437583', ' 32'],
 ['1.7793983848363', ' 72.071775670801', ' 36'],
 ['1.7004576585974', ' 66.267508112786', ' 31'],
 ['1.6591086215159', ' 61.751621901787', ' 29']]

# Algorithm

In [10]:

from math import sqrt
import numpy as np
 
# Converted string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())
 
# Converted string column to integer
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup
 
# Find the min and max values for each column
def dataset_minmax(dataset):
    minmax = list()
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        value_min = min(col_values)
        value_max = max(col_values)
        minmax.append([value_min, value_max])
    return minmax
 
# Rescaled dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
    for row in dataset:
        for i in range(len(row)):
            row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])
 
# Introduced K folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = np.random.choice(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split
 
# Calculated accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0
 
# Evaluated an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args,distance_method):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            if distance_method != 3:
                row_copy[-1] = None
            else:
                pass
        predicted = algorithm(train_set, test_set, *args,distance_method)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores
 
# Calculated the Euclidean distance between two vectors
def euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1)-1):
        distance += (row1[i] - row2[i])**2
    return sqrt(distance)

# Calculated the Manhattan distance between two vectors
def manhattan_distance(row1,row2):
    distance = 0
    for i in range(len(row1)-1):
        difference = (row1[i]-row2[i])
        absolute_difference = abs(difference)
        distance += absolute_difference
    return distance

# Calculated the minkowski distance between two vectors
def minkowski_distance(row1, row2, p):
    return sum(abs(e1-e2)**p for e1, e2 in zip(row1,row2))**(1/p)

# Locate the most similar neighbors
def get_neighbors(train_row, test_row, num_neighbors,distance_method):
    distances = list()
    num_neighbors +=1
    for train_row in train:
        if distance_method == 1:
            dist = euclidean_distance(test_row, train_row)
            distances.append((train_row, dist))
        elif distance_method == 2:
            dist = manhattan_distance(test_row, train_row)
            distances.append((train_row, dist))
        elif distance_method == 3:
            dist = minkowski_distance(test_row,train_row,num_neighbors)
            distances.append((train_row, dist))
        else:
            dist = euclidean_distance(test_row, train_row)
            distances.append((train_row, dist))
    distances.sort(key=lambda tup: tup[1])
    neighbors = list()
    for i in range(num_neighbors):
        neighbors.append(distances[i][0])
    return neighbors
 
# Made a prediction with neighbors
def predict_classification(train, test_row, num_neighbors,distance_method):
    num_neighbors +=1
    neighbors = get_neighbors(train, test_row, num_neighbors,distance_method)
    output_values = [row[-1] for row in neighbors]
    prediction = max(set(output_values), key=output_values.count)
    return prediction
 
# kNN Algorithm
def k_nearest_neighbors(train, test, num_neighbors,distance_method):
    predictions = list()
    num_neighbors +=1
    for row in test:
        output = predict_classification(train, row, num_neighbors,distance_method)
        predictions.append(output)
    return(predictions)
# Predict the Label
def label_predict(label):
    if label == 0:
        l = 'W'
    else:
        l='M'
    return l

In [11]:
for i in range(len(test[0])):
    str_column_to_float(test, i)

# For Value of K=1

In [12]:
dataset = train
for i in range(len(dataset[0])-1):
    str_column_to_float(dataset, i)
#convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)
# evaluate algorithm

{' W': 0, ' M': 1}

In [13]:
n_folds = 4
num_neighbors = 1
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors,distance_method=1)
print('Eucledian')
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors,distance_method=2)
print('Manhattan')
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors,distance_method=3)
print('Minkowski')
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Eucledian
Scores: [66.66666666666666, 100.0, 100.0, 66.66666666666666]
Mean Accuracy: 83.333%
Manhattan
Scores: [33.33333333333333, 100.0, 100.0, 100.0]
Mean Accuracy: 83.333%
Minkowski
Scores: [100.0, 100.0, 100.0, 33.33333333333333]
Mean Accuracy: 83.333%


# For Value of K=3

In [14]:
dataset = train
num_neighbors = 3
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors,distance_method=1)
print('Eucledian')
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors,distance_method=2)
print('Manhattan')
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors,distance_method=3)
print('Minkowski')
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Eucledian
Scores: [66.66666666666666, 100.0, 66.66666666666666, 66.66666666666666]
Mean Accuracy: 75.000%
Manhattan
Scores: [66.66666666666666, 66.66666666666666, 100.0, 100.0]
Mean Accuracy: 83.333%
Minkowski
Scores: [100.0, 100.0, 33.33333333333333, 100.0]
Mean Accuracy: 83.333%


# # For Value of K= 7

In [15]:
dataset = train
num_neighbors = 7
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors,distance_method=1)
print('Eucledian')
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors,distance_method=2)
print('Manhattan')
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors,distance_method=3)
print('Minkowski')
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Eucledian
Scores: [100.0, 66.66666666666666, 100.0, 66.66666666666666]
Mean Accuracy: 83.333%
Manhattan
Scores: [100.0, 66.66666666666666, 66.66666666666666, 100.0]
Mean Accuracy: 83.333%
Minkowski
Scores: [66.66666666666666, 100.0, 66.66666666666666, 100.0]
Mean Accuracy: 83.333%


# Program Data

In [16]:
File_data1 = np.genfromtxt("PData.txt", dtype=str,usecols=(1,2,3))
row = (line.replace('(','').replace(')','') for line in File_data1 if line)
list3 = []
with open('PData.txt','r') as in_file:
    stripped = (line.strip() for line in in_file)
    row = (line.replace('(','').replace(')','') for line in stripped if line)
    lines = (line.split(",") for line in row if line)
    for i in lines:
        list3.append(i)
c = np.array(list3)
rows = ["{},{},{},{}".format(i, j, k, l) for i, j, k, l in c]
text = "\n".join(rows)
 
with open('pdata.csv', 'w') as f:
    f.write(text)
with open('pdata.csv', 'r') as file:
    pdata = []
    for line in file:
        string = line.strip()
        words = string.split(',')
        pdata.append(words)

In [17]:
pdata_age=[]
for i in pdata:
    pdata_age.append(i)

# Predict The Label For Train Data

In [18]:
# predict the label
dataset = train
for i in range(len(train)):
    label = predict_classification(train, train[i], 1,distance_method='manhattan')
    #print('Data=%s, Predicted: %s' % (train[i], label_predict(label)))
    print('Expected %s, Got %s.' % (label_predict(dataset[i][-1]), label_predict(label)))

Expected W, Got M.
Expected W, Got M.
Expected M, Got M.
Expected M, Got M.
Expected M, Got M.
Expected W, Got W.
Expected M, Got W.
Expected M, Got M.
Expected W, Got W.
Expected W, Got W.
Expected W, Got W.
Expected M, Got M.
Expected M, Got M.
Expected W, Got W.


# Predict The Label for Test Data

#  When K is 1

In [19]:
eu =[]
for i in range(len(test)):
    label = predict_classification(train, test[i], 1,distance_method=1)
    #print('Data=%s, Predicted: %s' % (train[i], label_predict(label)))
    eu.append(label_predict(label))

In [20]:
man = []
for i in range(len(test)):
    label = predict_classification(train, test[i], 1,distance_method=2)
    #print('Data=%s, Predicted: %s' % (train[i], label_predict(label)))
    man.append(label_predict(label))

In [21]:
min = []
for i in range(len(test)):
    label = predict_classification(train, test[i], 1,distance_method=3)
    #print('Data=%s, Predicted: %s' % (train[i], label_predict(label)))
    min.append(label_predict(label))
    
print(f'For k = 1, Metric = Eucledian --> {eu}')
print(f'For k = 1, Metric = Manhattan --> {man}')
print(f'For k = 1, Metric = Minkowski --> {min}')

For k = 1, Metric = Eucledian --> ['W', 'W', 'W', 'W']
For k = 1, Metric = Manhattan --> ['W', 'W', 'W', 'W']
For k = 1, Metric = Minkowski --> ['W', 'M', 'W', 'W']


# When k =3 

In [22]:
eu =[]
for i in range(len(test)):
    label = predict_classification(train, test[i], 3,distance_method=1)
    #print('Data=%s, Predicted: %s' % (train[i], label_predict(label)))
    eu.append(label_predict(label))
man = []
for i in range(len(test)):
    label = predict_classification(train, test[i], 3,distance_method=2)
    #print('Data=%s, Predicted: %s' % (train[i], label_predict(label)))
    man.append(label_predict(label))
min = []
for i in range(len(test)):
    label = predict_classification(train, test[i], 3,distance_method=3)
    #print('Data=%s, Predicted: %s' % (train[i], label_predict(label)))
    min.append(label_predict(label))
    
print(f'For k = 3, Metric = Eucledian --> {eu}')
print(f'For k = 3, Metric = Manhattan --> {man}')
print(f'For k = 3, Metric = Minkowski --> {min}')

For k = 3, Metric = Eucledian --> ['W', 'M', 'W', 'W']
For k = 3, Metric = Manhattan --> ['W', 'M', 'W', 'W']
For k = 3, Metric = Minkowski --> ['W', 'M', 'W', 'W']


# When k=7

In [23]:
eu =[]
for i in range(len(test)):
    label = predict_classification(train, test[i], 7,distance_method=1)
    #print('Data=%s, Predicted: %s' % (train[i], label_predict(label)))
    eu.append(label_predict(label))
man = []
for i in range(len(test)):
    label = predict_classification(train, test[i], 7,distance_method=2)
    #print('Data=%s, Predicted: %s' % (train[i], label_predict(label)))
    man.append(label_predict(label))
min = []
for i in range(len(test)):
    label = predict_classification(train, test[i], 7,distance_method=3)
    #print('Data=%s, Predicted: %s' % (train[i], label_predict(label)))
    min.append(label_predict(label))
    
print(f'For k = 7, Metric = Eucledian --> {eu}')
print(f'For k = 7, Metric = Manhattan --> {man}')
print(f'For k = 7, Metric = Minkowski --> {min}')

For k = 7, Metric = Eucledian --> ['W', 'M', 'W', 'W']
For k = 7, Metric = Manhattan --> ['W', 'M', 'W', 'W']
For k = 7, Metric = Minkowski --> ['W', 'M', 'W', 'W']


# Execute the Program Data for Score and Accuracy

In [24]:
dataset = pdata
for i in range(len(dataset[0])-1):
    str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)
# evaluate algorithm
n_folds = 5
num_neighbors = 1
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors,distance_method='manhattan')
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Scores: [58.333333333333336, 83.33333333333334, 66.66666666666666, 41.66666666666667, 62.5]
Mean Accuracy: 62.500%


# For value of k is 1,3,5,7,9,11 The Accuracy are

In [25]:
for i in range (1,12,2):
    num_neighbors = i
    scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors,distance_method=1)
    accuracy =(sum(scores)/float(len(scores)))
    print(f'For k ={i}, Accuracy ={accuracy}')
   # print('Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

For k =1, Accuracy =62.5
For k =3, Accuracy =58.33333333333333
For k =5, Accuracy =58.33333333333333
For k =7, Accuracy =60.83333333333333
For k =9, Accuracy =59.16666666666667
For k =11, Accuracy =55.0


# Remove the Age Column

In [26]:
for i in range(len(pdata_age)):
    for j in range(len(pdata_age[i])):
        if j == 2:
            del pdata_age[i][2]

In [27]:
len(pdata_age)

120

In [28]:
dataset = pdata_age

# evaluate algorithm
n_folds = 5
num_neighbors = 3
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors,distance_method=2)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Scores: [58.333333333333336, 50.0, 75.0, 54.166666666666664, 54.166666666666664]
Mean Accuracy: 58.333%


# Predict The Test after the droping the Age olumn

In [29]:
eu = []
for i in range(len(test)):
    label = predict_classification(pdata_age, test[i], 5,distance_method=1)
    #print('Data=%s, Predicted: %s' % (train[i], label_predict(label)))
    eu.append(label_predict(label))
    #print('Expected %s, Got %s.' % (label_predict(train[i][-1]), label_predict(label)))
eu

['W', 'M', 'W', 'W']

# Input the Text File

In [None]:
txt_file = input('Enter the path of the text file')
print(txt_file)

In [None]:
File_data = np.genfromtxt(txt_file, dtype=str,usecols=(1,2,3))
row = (line.replace('(','').replace(')','') for line in File_data if line)

list2 = []
with open(txt_file,'r') as in_file:
    stripped = (line.strip() for line in in_file)
    row = (line.replace('(','').replace(')','') for line in stripped if line)
    lines = (line.split(",") for line in row if line)
    for i in lines:
        list2.append(i)
        
b = np.array(list2)
rows = ["{},{},{},{}".format(i, j, k,l) for i, j, k,l in b]
text = "\n".join(rows)

# Export the csv file user defined

In [None]:
name = input('Enter the csv filename')
filename = '%s.csv' % name

In [None]:
with open(filename, 'w') as f:
    f.write(text)

with open(filename, 'r') as file:
    pdata = []
    for line in file:
        string = line.strip()
        words = string.split(',')
        pdata.append(words)

# User Defined Inputs

In [None]:
num_neighbors = int(input('Enter the value of k = '))
print('Type Distance Method for Algo from [ Eucledian , Manhattan, Minkowski ]')
distance_method = input('Enter the Algo name from above')

scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors,distance_method=distance_method)
print(num_neighbors,distance_method)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))