# Zad 2

In [1]:
from csv import reader #do wczytania bazy danych
from math import sqrt #
from math import exp #
from math import pi # do obliczenia prawdopodobieństwa
import random #do tasowania (troche na lenia, wiem)

In [2]:
#wczytanie bazy danych
def load_csv(filename):
    dataset = list()
    with open(filename, "r") as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

#zamieana wszystkich danych na float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())


#funkcja zamienia dane do int oraz tworzy dictionary
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values) #unikalne wartosci, dlatego do set
    lookup = dict() #tworzymy dictionary
    for i, value in enumerate(unique):
        lookup[value] = i
        print("[%s] => %d" % (value, i)) #przewidywanie wartosci
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

#musimy dane rozdzielic wzgledem klas
def separate_by_class(dataset):
    separated = dict() 
    for i in range(len(dataset)):
        vector = dataset[i] #konretny wiersz z bazy
        class_value = vector[-1]
        if class_value not in separated:
            separated[class_value] = list()
        separated[class_value].append(vector)
    return separated

#obliczenia
def mean(numbers): 
    return sum(numbers) / float(len(numbers))


def stdev(numbers):
    avg = mean(numbers)
    variance = sum([(x - avg) ** 2 for x in numbers]) / float(len(numbers) - 1)
    return sqrt(variance)

#podsumowanie danych z bazy
def summarize_dataset(dataset):
    summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
    del summaries[-1] #usuwanie ostatniego atrybutu ktory przetrzymuje wynik rzeczywisty
    return summaries


def summarize_by_class(dataset):
    separated = separate_by_class(dataset)
    summaries = dict()
    for class_value, rows in separated.items():
        summaries[class_value] = summarize_dataset(rows)
    return summaries

#obliczenie prawdopodobienstwa
def calculate_probability(x, mean, stdev):
    exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent


def calculate_class_probabilities(summaries, row):
    total_rows = sum([summaries[label][0][2] for label in summaries])
    probabilities = dict()
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = summaries[class_value][0][2] / float(total_rows)
        for i in range(len(class_summaries)):
            mean, stdev, _ = class_summaries[i]
            probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
    return probabilities

#przewidywanie wyniku
def predict(summaries, row):
    probabilities = calculate_class_probabilities(summaries, row)
    best_label, best_prob = None, -1
    for class_value, probability in probabilities.items():
        if best_label is None or probability > best_prob:
            best_prob = probability
            best_label = class_value
    return best_label

In [3]:
filename = "train_data.data" #baza danych
dataset = load_csv(filename) 
random.shuffle(dataset)


for i in range(len(dataset[0])-1): #wczytujemy dane bez ostatniej kolumny
    str_column_to_float(dataset, i)

str_column_to_int(dataset, len(dataset[0])-1) #pomijamy ostatnia kolumne do wyniku

model = summarize_by_class(dataset)

test_file = "test_data.data"


with open(test_file) as fo:
    line = fo.readline()
    while line:
        tab = [float(x) for x in line.split(",")]
        label = predict(model, (tab[:-1]))
        print(f"Baza={tab[:-1]}, Przewidziane: {label}, Wartość dokładna: {tab[-1]}")
        line = fo.readline()

[ 1.0] => 0
[ 0.0] => 1
Baza=[0.7632, 0.0, 1.0, 0.8148, 0.5058, 0.0, 0.5, 0.4074, 1.0, 0.16, nan, nan, nan], Przewidziane: 0, Wartość dokładna: 0.0
Baza=[0.6316, 1.0, 0.3333, 0.2593, 0.3842, 0.0, 0.0, 0.3333, 0.0, 0.0, nan, nan, nan], Przewidziane: 0, Wartość dokładna: 0.0
Baza=[0.8158, 0.0, 1.0, 0.3519, 0.4884, 1.0, 0.5, 0.4444, 1.0, 0.3, nan, nan, nan], Przewidziane: 0, Wartość dokładna: 1.0
Baza=[0.2632, 1.0, 1.0, 0.1667, nan, 0.0, 0.0, 0.6296, 1.0, 0.2, nan, nan, nan], Przewidziane: 0, Wartość dokładna: 1.0
Baza=[0.5526, 0.0, 0.3333, 0.2963, 0.2239, 0.0, 0.0, 0.7593, 0.0, 0.0, nan, nan, nan], Przewidziane: 0, Wartość dokładna: 0.0
Baza=[0.1316, 1.0, 0.6667, 0.2593, 0.4112, 0.0, 0.0, 0.9537, 0.0, 0.0, nan, nan, nan], Przewidziane: 0, Wartość dokładna: 0.0
Baza=[0.5, 0.0, 0.6667, 0.3981, 0.3147, 1.0, 0.0, 0.8148, 0.0, 0.0, nan, nan, nan], Przewidziane: 0, Wartość dokładna: 1.0
Baza=[0.4737, 1.0, 0.6667, 0.2593, 0.2799, 0.0, 0.0, 0.6296, 0.0, 0.0, nan, nan, nan], Przewidziane: 0, Wart