# Training Data

In [10]:
import numpy as np
# Parse the raw training file. Each non-empty line is stripped, has its
# parentheses removed and is split on commas into string fields (the fields
# themselves are not stripped, so they may keep surrounding whitespace).
# The former np.genfromtxt call and its generator were never consumed, so the
# file is now read only once.
list1 = []
with open('1_2.txt', 'r') as in_file:
    for raw_line in in_file:
        cleaned = raw_line.strip().replace('(', '').replace(')', '')
        if cleaned:
            list1.append(cleaned.split(","))

# Every row must have exactly four fields for the unpacking below to succeed.
a = np.array(list1)
rows = ["{},{},{},{}".format(i, j, k, l) for i, j, k, l in a]
text = "\n".join(rows)

# Export the cleaned rows as CSV ...
with open('train.csv', 'w') as f:
    f.write(text)

# ... and read them back as a list of string lists for the algorithm cells.
train = []
with open('train.csv', 'r') as file:
    for line in file:
        train.append(line.strip().split(','))

# Testing Data

In [11]:
# Parse the raw test file. Each non-empty line is stripped, has its
# parentheses removed and is split on commas into string fields (the fields
# themselves are not stripped). The former np.genfromtxt call and its
# generator were never consumed, so the file is now read only once.
list2 = []
with open('1_2_Test.txt', 'r') as in_file:
    for raw_line in in_file:
        cleaned = raw_line.strip().replace('(', '').replace(')', '')
        if cleaned:
            list2.append(cleaned.split(","))

# Test rows carry exactly three fields (no class label column).
b = np.array(list2)
rows = ["{},{},{}".format(i, j, k) for i, j, k in b]
text = "\n".join(rows)

# Export the cleaned rows as CSV ...
with open('test.csv', 'w') as f:
    f.write(text)

# ... and read them back as a list of string lists.
with open('test.csv', 'r') as file:
    test = []
    for line in file:
        test.append(line.strip().split(','))

# Algorithm

In [12]:
from math import sqrt
from math import exp
from math import pi
 
# Convert string column to float
def str_column_to_float(dataset, column):
    """Replace, in place, the string at index ``column`` of every row by a float.

    Args:
        dataset: sequence of mutable rows whose ``column`` entry is a string.
        column: index of the entry to convert.

    The value is stripped of surrounding whitespace before conversion. Because
    ``.strip()`` is called on the stored value, running this twice on the same
    column fails (floats have no ``strip``).
    """
    for record in dataset:
        raw_value = record[column]
        record[column] = float(raw_value.strip())
 
# Convert string column to integer
def str_column_to_int(dataset, column):
    """Encode, in place, the values at index ``column`` as integer class codes.

    Args:
        dataset: sequence of mutable rows.
        column: index of the (hashable) class value in every row.

    Returns:
        dict mapping each distinct original value to its integer code.

    Codes are assigned in order of first appearance in ``dataset`` (0 for the
    first distinct value, 1 for the next, ...). The previous implementation
    numbered the members of a ``set``, whose iteration order for strings can
    change from one interpreter run to the next, so the same data could get
    different codes on different runs.
    NOTE(review): ``label_predict`` hard-codes the meaning of code 0; confirm
    that the first row of each dataset carries the class it expects.
    """
    lookup = dict()
    for row in dataset:
        # len(lookup) is evaluated before insertion, giving 0, 1, 2, ...
        lookup.setdefault(row[column], len(lookup))
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup
 
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    """Randomly partition ``dataset`` into ``n_folds`` folds of equal size.

    Args:
        dataset: sequence of rows; it is copied, not modified.
        n_folds: number of folds to produce.

    Returns:
        list of ``n_folds`` lists, each holding ``int(len(dataset) / n_folds)``
        rows drawn without replacement via ``np.random.choice``. Rows left
        over after the integer division are not placed in any fold.
    """
    remaining = list(dataset)
    rows_per_fold = int(len(dataset) / n_folds)
    folds = []
    for _ in range(n_folds):
        # Draw a random position in the shrinking pool and move that row out.
        fold = [
            remaining.pop(np.random.choice(len(remaining)))
            for _ in range(rows_per_fold)
        ]
        folds.append(fold)
    return folds
 
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    """Return the percentage of positions where ``predicted`` equals ``actual``.

    Args:
        actual: sequence of true labels.
        predicted: sequence of predicted labels, indexed like ``actual``.

    Returns:
        float in the range 0.0 to 100.0. Raises ZeroDivisionError when
        ``actual`` is empty.
    """
    hits = sum(1 for idx in range(len(actual)) if actual[idx] == predicted[idx])
    return hits / float(len(actual)) * 100.0
 
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Score ``algorithm`` with k-fold cross validation.

    Args:
        dataset: rows whose last element is the class label.
        algorithm: callable ``algorithm(train_set, test_set, *args)`` that
            returns one prediction per test row.
        n_folds: number of folds passed to ``cross_validation_split``.
        *args: extra positional arguments forwarded to ``algorithm``.

    Returns:
        list with one accuracy percentage per fold.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for held_out in folds:
        # Training data: every fold except the held-out one, flattened.
        other_folds = list(folds)
        other_folds.remove(held_out)
        train_set = [row for part in other_folds for row in part]

        # Test data: copies of the held-out rows with the label blanked so the
        # algorithm cannot see the answer.
        test_set = []
        for row in held_out:
            masked = list(row)
            masked[-1] = None
            test_set.append(masked)

        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in held_out]
        scores.append(accuracy_metric(actual, predicted))
    return scores
 
# Split the dataset by class values, returns a dictionary
def separate_by_class(dataset):
    """Group rows by their last element (the class value).

    Args:
        dataset: sequence of rows; the last element of each row is its class.

    Returns:
        dict mapping each class value to the list of rows (same objects, in
        input order) that carry it.
    """
    grouped = {}
    for vector in dataset:
        grouped.setdefault(vector[-1], []).append(vector)
    return grouped
 
# Calculate the mean of a list of numbers
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    Raises ZeroDivisionError for an empty sequence.
    """
    total = sum(numbers)
    return total / float(len(numbers))
 
# Calculate the standard deviation of a list of numbers
def stdev(numbers):
    """Return the sample standard deviation (n - 1 denominator) of ``numbers``.

    Raises ZeroDivisionError when ``numbers`` has exactly one element, because
    the denominator ``len(numbers) - 1`` is then zero.
    """
    centre = mean(numbers)
    squared_deviations = [(value - centre) ** 2 for value in numbers]
    sample_variance = sum(squared_deviations) / float(len(numbers) - 1)
    return sqrt(sample_variance)
 
# Calculate the mean, stdev and count for each column in a dataset
def summarize_dataset(dataset):
    """Return ``(mean, stdev, count)`` for every column except the last.

    Args:
        dataset: rows of numbers; the last column is treated as the class
            label and its statistics are discarded.

    Returns:
        list of ``(mean, stdev, count)`` tuples, one per remaining column.
    """
    column_stats = []
    for column in zip(*dataset):
        column_stats.append((mean(column), stdev(column), len(column)))
    # Statistics are computed for the label column too, then dropped here.
    column_stats.pop()
    return column_stats
 
# Split dataset by class then calculate statistics for each column
def summarize_by_class(dataset):
    """Build the per-class column statistics used as the fitted model.

    Args:
        dataset: rows whose last element is the class value.

    Returns:
        dict mapping each class value to the list produced by
        ``summarize_dataset`` for the rows of that class.
    """
    return {
        class_value: summarize_dataset(rows)
        for class_value, rows in separate_by_class(dataset).items()
    }
 
# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: point at which to evaluate the density.
        mean: mean of the distribution.
        stdev: standard deviation of the distribution; zero raises
            ZeroDivisionError.

    Returns:
        The density value as a float.
    """
    squared_deviation = (x - mean) ** 2
    exponent = exp(-(squared_deviation / (2 * stdev ** 2)))
    normaliser = sqrt(2 * pi) * stdev
    return (1 / normaliser) * exponent
 
# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
    """Compute the unnormalised naive Bayes score of ``row`` for each class.

    Args:
        summaries: dict of class value -> list of ``(mean, stdev, count)``
            tuples, as returned by ``summarize_by_class``.
        row: sequence whose first ``len(class_summaries)`` entries are the
            feature values; any further entries are ignored.

    Returns:
        dict mapping each class value to prior * product of per-feature
        Gaussian densities. The values are not normalised to sum to one.
    """
    # The row count of a class is stored in the summary of its first column.
    class_counts = {label: stats[0][2] for label, stats in summaries.items()}
    total_rows = sum(class_counts.values())

    probabilities = {}
    for class_value, class_summaries in summaries.items():
        score = class_counts[class_value] / float(total_rows)
        for i, (mu, sigma, _) in enumerate(class_summaries):
            score *= calculate_probability(row[i], mu, sigma)
        probabilities[class_value] = score
    return probabilities
 
# Predict the class for a given row
def predict(summaries, row):
    """Return the class value with the highest score for ``row``.

    Args:
        summaries: fitted model as returned by ``summarize_by_class``.
        row: feature values (extra trailing entries are ignored).

    Returns:
        The best-scoring class value; on ties the class encountered first
        wins. ``None`` when ``summaries`` is empty.
    """
    probabilities = calculate_class_probabilities(summaries, row)
    # max() keeps the first maximal key, matching a strict ">" comparison scan.
    return max(probabilities, key=probabilities.get, default=None)
 
# Naive Bayes Algorithm
def naive_bayes(train, test):
    """Fit Gaussian naive Bayes on ``train`` and predict every row of ``test``.

    Args:
        train: rows whose last element is the class value.
        test: rows to classify.

    Returns:
        list of predicted class values, one per test row.
    """
    class_model = summarize_by_class(train)
    return [predict(class_model, row) for row in test]

# Predict the Label
def label_predict(label):
    """Translate an integer class code into its display letter.

    Returns 'W' when ``label`` equals 0 and 'M' for every other value.
    """
    return 'W' if label == 0 else 'M'

# Execution

In [13]:
# Convert the training rows in place: every column but the last becomes a float.
dataset = train
label_column = len(dataset[0]) - 1
for feature_column in range(label_column):
    str_column_to_float(dataset, feature_column)
# Encode the class column as integers. As the last expression of the cell, the
# returned lookup is what the notebook displays.
str_column_to_int(dataset, label_column)


{' W': 0, ' M': 1}

In [14]:
# 7-fold cross validation of naive Bayes on the training rows.
n_folds = 7
scores = evaluate_algorithm(dataset, naive_bayes, n_folds)
print('Scores: %s' % scores)
mean_accuracy = sum(scores) / float(len(scores))
print('Mean Accuracy: %.3f%%' % mean_accuracy)

Scores: [100.0, 50.0, 100.0, 100.0, 50.0, 100.0, 50.0]
Mean Accuracy: 78.571%


In [15]:
# Fit the model: per-class (mean, stdev, count) summaries of every feature
# column of the current `dataset`.
model = summarize_by_class(dataset)

# Prediction

In [16]:
# The test rows have no class column, so every column is converted to float
# (in place; re-running this cell on already converted rows fails).
dataset = test
n_columns = len(dataset[0])
for column in range(n_columns):
    str_column_to_float(dataset, column)

In [17]:
# Predict a class code for every row of `dataset` and map it to its letter.
pred = [label_predict(predict(model, row)) for row in dataset]
pred

['W', 'W', 'W', 'W']

# Program Data

In [18]:
# Parse the raw program-data file. Each non-empty line is stripped, has its
# parentheses removed and is split on commas into string fields (the fields
# themselves are not stripped). The former np.genfromtxt call and its
# generator were never consumed, so the file is now read only once.
list3 = []
with open('PData.txt', 'r') as in_file:
    for raw_line in in_file:
        cleaned = raw_line.strip().replace('(', '').replace(')', '')
        if cleaned:
            list3.append(cleaned.split(","))

# Every row must have exactly four fields for the unpacking below to succeed.
c = np.array(list3)
rows = ["{},{},{},{}".format(i, j, k, l) for i, j, k, l in c]
text = "\n".join(rows)

# Export the cleaned rows as CSV and read them back as a list of string lists.
with open('pdata.csv', 'w') as f:
    f.write(text)
with open('pdata.csv', 'r') as file:
    pdata = []
    for line in file:
        pdata.append(line.strip().split(','))

In [19]:
# Shallow copy: pdata_age is a new outer list, but each row is the very same
# list object as in pdata. In-place edits made through either name (the
# float/int conversion of pdata rows, the column deletion on pdata_age rows)
# are therefore visible through both.
pdata_age = list(pdata)

# Leave one out with kfolds

In [20]:
# Convert the program-data rows in place: features to float, class to int.
dataset = pdata
label_column = len(dataset[0]) - 1
for feature_column in range(label_column):
    str_column_to_float(dataset, feature_column)
# convert class column to integers
str_column_to_int(dataset, label_column)
# Evaluate with 2, 4, 6, 8 and 10 folds. n_folds keeps its final value (10)
# after the loop and is reused by a later cell.
for n_folds in range(2, 12, 2):
    scores = evaluate_algorithm(dataset, naive_bayes, n_folds)
    print('Scores: %s' % scores)
    mean_accuracy = sum(scores) / float(len(scores))
    print('Mean Accuracy: %.3f%%' % mean_accuracy)

Scores: [66.66666666666666, 68.33333333333333]
Mean Accuracy: 67.500%
Scores: [73.33333333333333, 63.33333333333333, 80.0, 63.33333333333333]
Mean Accuracy: 70.000%
Scores: [75.0, 80.0, 65.0, 70.0, 70.0, 65.0]
Mean Accuracy: 70.833%
Scores: [73.33333333333333, 66.66666666666666, 66.66666666666666, 80.0, 73.33333333333333, 86.66666666666667, 66.66666666666666, 66.66666666666666]
Mean Accuracy: 72.500%
Scores: [50.0, 75.0, 58.333333333333336, 75.0, 75.0, 75.0, 75.0, 66.66666666666666, 75.0, 58.333333333333336]
Mean Accuracy: 68.333%


# Remove the Age Column

In [21]:
# Drop the age column (index 2) from every row, in place.
# Rows start with four columns (see the 4-field format in the Program Data
# cell). The width guard makes the cell safe to re-run: without it a second
# run would delete index 2 again, which by then is the class label.
AGE_COLUMN = 2
N_COLUMNS_WITH_AGE = 4
for row in pdata_age:
    if len(row) == N_COLUMNS_WITH_AGE:
        del row[AGE_COLUMN]


In [22]:
# Display the rows after the age column was removed (prints the whole list).
pdata_age

[[1.5963600450124, 75.717194178189, 0],
 [1.6990610819676, 83.477307503684, 1],
 [1.5052092436, 74.642420817737, 0],
 [1.5738635789008, 78.562465284603, 1],
 [1.796178772769, 74.566117057707, 1],
 [1.6274618774347, 82.250591567161, 0],
 [1.6396843250708, 71.37567170848, 0],
 [1.538505823668, 77.418902097029, 0],
 [1.6488692005889, 76.333044488477, 0],
 [1.7233804613095, 85.812112126306, 1],
 [1.7389100516771, 76.424421782215, 0],
 [1.5775696242624, 77.201404139171, 0],
 [1.7359417237856, 77.004988515324, 1],
 [1.5510482441354, 72.950756316157, 0],
 [1.5765653263667, 74.750113664457, 0],
 [1.4916026885377, 65.880438515643, 0],
 [1.6755053770068, 78.901754249459, 1],
 [1.4805881225567, 69.652364469244, 0],
 [1.6343943760912, 73.998278712613, 0],
 [1.6338449829543, 79.216500811112, 0],
 [1.5014451222259, 66.917339299419, 0],
 [1.8575887178701, 79.942454850988, 1],
 [1.6805940669394, 78.213519314007, 0],
 [1.6888905106948, 83.031099742808, 1],
 [1.7055120272359, 84.233282531303, 1],
 [1.56

# After Removing the Age Column

In [23]:
# Cross-validate on the rows without the age column. n_folds is not set in
# this cell; it still holds the value left by the earlier k-fold loop.
dataset = pdata_age
scores = evaluate_algorithm(dataset, naive_bayes, n_folds)
print('Scores: %s' % scores)
mean_accuracy = sum(scores) / float(len(scores))
print('Mean Accuracy: %.3f%%' % mean_accuracy)

# Fit the model on all rows of the reduced dataset.
model = summarize_by_class(dataset)

Scores: [50.0, 91.66666666666666, 75.0, 75.0, 58.333333333333336, 58.333333333333336, 91.66666666666666, 58.333333333333336, 83.33333333333334, 58.333333333333336]
Mean Accuracy: 70.000%


In [24]:
dataset = test

# Predict a class code for every test row with the current `model` and map
# it to its letter. predict() only reads as many leading entries of a row as
# the model has feature summaries.
pred = [label_predict(predict(model, row)) for row in dataset]
pred

['W', 'M', 'W', 'W']

# Value of kfold = 3

In [25]:
# Fit the model on the program data.
model = summarize_by_class(pdata)
# Show the prediction for every row the model was fitted on.
for row in pdata:
    label = predict(model, row)
    l = label_predict(label)
    print('Data=%s, Predicted: %s' % (row, l))
# 3-fold cross validation on the same rows.
scores = evaluate_algorithm(pdata, naive_bayes, 3)
print('Scores: %s' % scores)
mean_accuracy = sum(scores) / float(len(scores))
print('Mean Accuracy: %.3f%%' % mean_accuracy)

Data=[1.5963600450124, 75.717194178189, 0], Predicted: W
Data=[1.6990610819676, 83.477307503684, 1], Predicted: M
Data=[1.5052092436, 74.642420817737, 0], Predicted: W
Data=[1.5738635789008, 78.562465284603, 1], Predicted: W
Data=[1.796178772769, 74.566117057707, 1], Predicted: M
Data=[1.6274618774347, 82.250591567161, 0], Predicted: M
Data=[1.6396843250708, 71.37567170848, 0], Predicted: W
Data=[1.538505823668, 77.418902097029, 0], Predicted: W
Data=[1.6488692005889, 76.333044488477, 0], Predicted: W
Data=[1.7233804613095, 85.812112126306, 1], Predicted: M
Data=[1.7389100516771, 76.424421782215, 0], Predicted: M
Data=[1.5775696242624, 77.201404139171, 0], Predicted: W
Data=[1.7359417237856, 77.004988515324, 1], Predicted: M
Data=[1.5510482441354, 72.950756316157, 0], Predicted: W
Data=[1.5765653263667, 74.750113664457, 0], Predicted: W
Data=[1.4916026885377, 65.880438515643, 0], Predicted: W
Data=[1.6755053770068, 78.901754249459, 1], Predicted: M
Data=[1.4805881225567, 69.65236446924

# Input the Text File

In [None]:
txt_file = input('Enter the path of the text file')
print(txt_file)

# Parse the user-supplied file. Each non-empty line is stripped, has its
# parentheses removed and is split on commas into string fields (the fields
# themselves are not stripped). The former np.genfromtxt call and its
# generator were never consumed, so the file is now read only once.
list2 = []
with open(txt_file, 'r') as in_file:
    for raw_line in in_file:
        cleaned = raw_line.strip().replace('(', '').replace(')', '')
        if cleaned:
            list2.append(cleaned.split(","))

# Every row must have exactly four fields for the unpacking below to succeed.
b = np.array(list2)
rows = ["{},{},{},{}".format(i, j, k, l) for i, j, k, l in b]
text = "\n".join(rows)

# Export the csv file user defined

In [None]:
# Ask for a base name; the '.csv' extension is appended automatically.
name = input('Enter the csv filename')
filename = '{}.csv'.format(name)

In [None]:
# Write the cleaned rows to the user-named CSV file.
with open(filename, 'w') as f:
    f.write(text)

# Read the file back: one list of comma-separated string fields per line.
with open(filename, 'r') as file:
    pdata = [line.strip().split(',') for line in file]

In [None]:
# Convert the reloaded rows in place: every column but the last becomes a float.
dataset = pdata
label_column = len(dataset[0]) - 1
for feature_column in range(label_column):
    str_column_to_float(dataset, feature_column)
# Encode the class column as integers; the returned lookup is the cell output.
str_column_to_int(dataset, label_column)

In [None]:
# predict the label
for i in range(len(pdata)):
    label = predict(model, pdata[i])
    l = label_predict(label)
    print('Data=%s, Predicted: %s' % (pdata[i], l))
scores = evaluate_algorithm(pdata, naive_bayes, 3)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

In [None]:
# Refit the model: per-class (mean, stdev, count) summaries of the user data.
model = summarize_by_class(pdata)

In [None]:
dataset = test

# Predict a class code for every test row with the current `model` and map
# it to its letter.
pred = [label_predict(predict(model, row)) for row in dataset]
pred