In [90]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import seaborn as sns
import numpy as np
import random as rn

# Load the raw BTC price table; the numeric columns are read as text.
data = pd.read_csv("btc_data.csv")

# Price-like columns use ',' as a thousands separator: strip it, then cast.
for price_column in ('Price', 'Open', 'High', 'Low'):
    data[price_column] = data[price_column].str.replace(',', '').astype(float)

# 'Change %' carries a trailing percent sign that has to go before casting.
data['Change %'] = data['Change %'].str.rstrip('%').astype(float)

def trans_vol(volume):
    """Convert a textual trading volume such as ``'86.85K'`` into a number.

    Parameters
    ----------
    volume : str or number or None
        ``'-'`` stands for "no volume". A trailing ``K``/``M``/``B`` scales
        the number by 1e3/1e6/1e9. Surrounding whitespace and thousands
        commas are ignored. Values that are not strings (numbers, NaN from a
        blank CSV cell) are passed through ``float``; ``None`` becomes NaN.

    Returns
    -------
    float or int
        The numeric volume; ``0`` for ``'-'``.

    Raises
    ------
    ValueError
        If the text is not a number with an optional K/M/B suffix.
    """
    multipliers = {'K': 1e3, 'M': 1e6, 'B': 1e9}

    if volume is None:
        return float('nan')
    if not isinstance(volume, str):
        # Already numeric (or a missing value read as NaN): nothing to parse.
        return float(volume)

    text = volume.strip().replace(',', '')
    if text == '-':
        return 0
    if text and text[-1] in multipliers:
        return float(text[:-1]) * multipliers[text[-1]]
    return float(text)
    
# Turn the textual volume labels into plain numbers.
data['Vol.'] = data['Vol.'].map(trans_vol)

# The date column is parsed and then removed, so it never reaches the models;
# rows that still contain missing values are discarded afterwards.
data['Date'] = pd.to_datetime(data['Date'])
data = data.drop(columns=['Date']).dropna()

def categorize_price_change(change_percentage):
    """Map a daily percentage change to one of six bucket labels.

    Buckets (bounds in percent):

    * ``'>+3%'``  : change > 3
    * ``'>+1%'``  : 1 < change <= 3
    * ``'+0-1%'`` : 0 <= change <= 1
    * ``'-0-1%'`` : -1 <= change < 0
    * ``'<-1%'``  : -3 <= change < -1
    * ``'<-3%'``  : change < -3

    A change of exactly 0 is treated as a flat, non-negative day and lands
    in ``'+0-1%'`` rather than falling through to the crash bucket.
    A value that fails every comparison (e.g. NaN) still returns ``'<-3%'``.

    Parameters
    ----------
    change_percentage : float
        Percentage change, e.g. ``2.15`` for +2.15 %.

    Returns
    -------
    str
        The bucket label.
    """
    # Thresholds are checked from the top down, so each test only needs the
    # lower bound of its bucket.
    if change_percentage > 3:
        return '>+3%'
    if change_percentage > 1:
        return '>+1%'
    if change_percentage >= 0:
        return '+0-1%'
    if change_percentage >= -1:
        return '-0-1%'
    if change_percentage >= -3:
        return '<-1%'
    return '<-3%'

# Label every row with the bucket its daily change falls into, then preview
# the first rows of the prepared table.
data['Price Range'] = data['Change %'].map(categorize_price_change)
print(data.head(5))

class DataProcessing:
    """Static helpers for shuffling, min-max scaling and splitting a DataFrame."""

    @staticmethod
    def shuffle(dataset):
        """Randomly permute the rows of ``dataset`` in place.

        A single uniformly random permutation is drawn with the ``random``
        module and applied to every column, so the values of a row stay
        together. Index labels stay where they are; only the values move.
        Column dtypes are preserved. Column labels are assumed to be unique.

        Parameters
        ----------
        dataset : pandas.DataFrame
            Frame to shuffle; modified in place.

        Returns
        -------
        None
        """
        order = list(range(len(dataset)))
        rn.shuffle(order)
        for column in dataset.columns:
            # `.values` yields a plain array / extension array without an
            # index, so the assignment is positional rather than label-aligned.
            dataset[column] = dataset[column].iloc[order].values

    @staticmethod
    def NormalizeData(dataset):
        """Min-max scale every non-object column of ``dataset`` to [0, 1] in place.

        A column whose minimum equals its maximum carries no information; it
        is set to 0.0 instead of being divided by zero (which would fill it
        with NaN).

        Parameters
        ----------
        dataset : pandas.DataFrame
            Frame to rescale; modified in place. Object-dtype columns are
            left untouched.

        Returns
        -------
        None
        """
        columnNames = dataset.select_dtypes(exclude='object').columns.tolist()
        for column in columnNames:
            values = dataset[column]
            min_val = values.min()
            span = values.max() - min_val
            if span == 0:
                dataset[column] = 0.0
            else:
                dataset[column] = (values - min_val) / span

    @staticmethod
    def split(dataset, k):
        """Cut ``dataset`` into a leading and a trailing part.

        Parameters
        ----------
        dataset : pandas.DataFrame
            Frame to cut; not modified.
        k : float
            Fraction of rows (rounded down) that goes into the first part.

        Returns
        -------
        tuple
            ``(first, second)`` where ``first`` holds the first
            ``int(len(dataset) * k)`` rows and ``second`` the remainder.
        """
        cut = int(len(dataset) * k)
        return dataset[:cut], dataset[cut:]

# Seed the stdlib RNG so the shuffle, and therefore both splits, can be
# reproduced on a fresh kernel.
rn.seed(1234)
DataProcessing.shuffle(data)

# Snapshot the raw-scale partitions before `data` is rescaled in place below:
# the slices returned by `split` may share memory with `data`, so without a
# copy the "unnormalized" sets could end up normalized as well.
trainingSet, validatingSet = (
    part.copy() for part in DataProcessing.split(data, 0.6)
)

# NOTE(review): min/max are taken over the whole table, validation rows
# included; confirm that this is intended.
DataProcessing.NormalizeData(data)
trainingSetNormalized, validatingSetNormalized = DataProcessing.split(data, 0.6)

class NaiveBayes:
    """Naive Bayes classifier that models each feature with a triangular density.

    For every class, each feature is described by its mean and population
    standard deviation. The likelihood of a value is read off a symmetric
    triangular density centred on the mean with half-width ``sqrt(6) * std``.
    The class label is taken from the ``'Price Range'`` column, and every
    column except the last one of the frame is used as a feature.
    """

    @staticmethod
    def mean(atr):
        """Return the arithmetic mean of the values in ``atr``.

        Raises ``ZeroDivisionError`` when ``atr`` is empty.
        """
        return sum(atr) / len(atr)

    @staticmethod
    def stdev(atr, mean):
        """Return the population standard deviation of ``atr`` around ``mean``.

        Raises ``ZeroDivisionError`` when ``atr`` is empty.
        """
        squared_error = sum((value - mean) ** 2 for value in atr)
        return np.sqrt(squared_error / len(atr))

    @staticmethod
    def triangular(x, mean, std):
        """Evaluate the triangular density with the given mean and std at ``x``.

        The density is zero outside ``mean +/- sqrt(6) * std`` and peaks at
        ``1 / (sqrt(6) * std)`` on the mean.

        When ``std`` is zero the density is undefined; the feature is then
        treated as a constant: a matching ``x`` contributes a neutral factor
        of 1.0 and any other value contributes 0, instead of dividing by zero.
        """
        half_width = np.sqrt(6) * std
        if half_width == 0:
            return 1.0 if x == mean else 0
        if x < mean - half_width or x > mean + half_width:
            return 0
        # Both slopes of the triangle collapse into one expression via abs().
        return 1 / half_width - abs(x - mean) / (6 * std ** 2)

    @staticmethod
    def calculate_class_probability(tmp, sample):
        """Return the product of per-feature densities of ``sample`` within one class.

        Parameters
        ----------
        tmp : pandas.DataFrame
            Rows of a single class; the last column is skipped (it is
            expected to hold the label).
        sample : mapping
            Row to score, indexable by the feature column names.
        """
        prob = 1
        for columnName in tmp.columns.tolist()[:-1]:
            values = tmp[columnName]
            mu = NaiveBayes.mean(values)
            sigma = NaiveBayes.stdev(values, mu)
            prob *= NaiveBayes.triangular(sample[columnName], mu, sigma)
            if prob == 0:
                # The product can no longer recover; skip remaining features.
                break
        return prob

    @staticmethod
    def classify(dataset, sample):
        """Return the ``'Price Range'`` class with the highest posterior score.

        Each class score is the likelihood from
        ``calculate_class_probability`` multiplied by the class frequency in
        ``dataset``. On ties the class that appears first in ``dataset`` wins.

        Raises
        ------
        ValueError
            If ``dataset`` has no rows.
        """
        classNames = dataset['Price Range'].unique().tolist()
        if not classNames:
            raise ValueError("dataset contains no rows to learn from")

        probability = []
        for className in classNames:
            members = dataset[dataset['Price Range'] == className]
            likelihood = NaiveBayes.calculate_class_probability(members, sample)
            probability.append(likelihood * (len(members) / len(dataset)))
        return classNames[probability.index(max(probability))]

def _count_correct(training_set, validating_set):
    """Count validation rows whose predicted class matches their 'Price Range'.

    Each row of ``validating_set`` is classified with ``NaiveBayes.classify``
    using ``training_set`` as the reference data.
    """
    hits = 0
    for position in range(len(validating_set)):
        sample = validating_set.iloc[position]
        if NaiveBayes.classify(training_set, sample) == sample['Price Range']:
            hits += 1
    return hits


# Accuracy on the min-max scaled data.
counter = _count_correct(trainingSetNormalized, validatingSetNormalized)
accuracy = (counter/len(validatingSetNormalized))*100
print("Dokładność dla danych znormalizowanych wynosi", accuracy, "%.")

# Accuracy on the data in its original scale.
counter2 = _count_correct(trainingSet, validatingSet)
accuracy2 = (counter2/len(validatingSet))*100
print("Dokładność dla danych nieznormalizowanych wynosi", accuracy2, "%.")

     Price     Open     High      Low     Vol.  Change % Price Range
0  47545.4  45293.3  47710.2  45254.2  86850.0      4.97        >+3%
1  45293.3  44346.2  45579.2  44336.4  66380.0      2.15        >+1%
2  44339.8  43088.4  44367.9  42783.5  48570.0      2.91        >+1%
3  43087.7  42697.6  43375.5  42566.8  33320.0      0.91       +0-1%
4  42697.2  42581.4  43532.2  42272.5  39260.0      0.27       +0-1%
Dokładność dla danych znormalizowanych wynosi 66.39757820383451 %.
Dokładność dla danych nieznormalizowanych wynosi 66.39757820383451 %.


In [91]:
import numpy as np
from collections import Counter

def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``."""
    delta = x1 - x2
    return np.sqrt(np.sum(delta * delta))

class KNN:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote."""

    def __init__(self, k=3):
        """Create a classifier that votes among the ``k`` closest training rows.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be at least 1")
        self.k = k

    def fit(self, X, y):
        """Store the training data.

        Parameters
        ----------
        X : DataFrame or 2-D array-like
            Numeric feature rows.
        y : Series or 1-D array-like
            One label per row of ``X``, in the same order.

        Raises
        ------
        ValueError
            If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        if len(X) == 0:
            raise ValueError("cannot fit on an empty training set")
        if len(X) != len(y):
            raise ValueError(
                "X and y must have the same number of rows "
                f"(got {len(X)} and {len(y)})"
            )
        self.X_train = X
        self.y_train = y
        # Positional numpy copies, converted once here instead of on every
        # prediction.
        self._train_values = np.asarray(X, dtype=float)
        self._train_labels = np.asarray(y)

    def predict(self, X):
        """Return a list with the predicted label for each row of ``X``."""
        return [self._predict(x) for x in np.asarray(X, dtype=float)]

    def _predict(self, x):
        """Return the majority label among the ``k`` training rows nearest to ``x``."""
        # compute the distance to every training row in one vectorised step
        distances = np.sqrt(np.sum((self._train_values - x) ** 2, axis=1))

        # get the closest k; a stable sort keeps equidistant rows in
        # training order so ties are resolved deterministically
        k_indices = np.argsort(distances, kind='stable')[:self.k]
        k_nearest_labels = self._train_labels[k_indices].tolist()

        # majority vote
        return Counter(k_nearest_labels).most_common(1)[0][0]

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# Assemble the modelling table from the frame prepared in the previous cell.
df = pd.DataFrame(
    data,
    columns=['Price', 'Open', 'High', 'Low', 'Vol.', 'Change %', 'Price Range'],
)

# 'Price Range' is the label; every other column is used as a feature.
# NOTE(review): 'Change %' stays among the features although the label is
# derived from it. Confirm that this is intended.
y = df['Price Range']
X = df.drop(columns=['Price Range'])

# Hold out 20 % of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)

clf = KNN(k=5)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

# Share of test rows whose predicted label equals the true one.
acc = np.sum(y_test == predictions) / len(y_test)
print(acc)

0.7991927346115035
