In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from prettytable import PrettyTable
import datetime
import numpy as np

In [2]:
def train_and_check(model, Xtrain, Xtest, ytrain, ytest):
    classifier = model
    start = datetime.datetime.now()
    classifier.fit(Xtrain, ytrain)
    end = datetime.datetime.now()
    time = (end - start).microseconds
    evaluation = np.round(classifier.score(Xtest, ytest), 4)
    return evaluation, time

In [14]:
data = pd.read_csv('daily-bike-share.csv', parse_dates=['dteday'])
data.drop(columns=['instant', 'dteday', 'yr'], inplace=True)
y = data['rentals']
numeric_features = ['temp', 'atemp', 'hum', 'windspeed']
categorical_features = ['season','mnth','holiday','weekday','workingday','weathersit']

data['difference_temp'] = (data['atemp'] - data['temp']) / data['temp']
data.drop(columns=['atemp'], axis=1, inplace=True)
numeric_features = ['temp', 'difference_temp', 'hum', 'windspeed']

X = pd.get_dummies(data[numeric_features + categorical_features])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
print(f"X_train.shape: {X_train.shape}, y_train.shape: {y_train.shape}")
print(f"X_test.shape: {X_test.shape}, y_test.shape: {y_test.shape}")

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

pca = PCA(random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

y_test.value_counts()

knn = KNeighborsRegressor(n_neighbors=1)

results = PrettyTable(['Model',
                       'Dokładność',
                       'Czas trenowania (microseconds)'])

# Trenowanie modelu na nieprzetworzonym zbiorze
not_scaled_data = train_and_check(knn, X_train, X_test, y_train, y_test)
results.add_row(['Nieskalowane dane - KNN', not_scaled_data[0], not_scaled_data[1]])

# Trenowanie modelu na przetworzonym zbiorze
scaled_data = train_and_check(knn, X_train_scaled, X_test_scaled, y_train, y_test)
results.add_row(['Skalowane dane - KNN', scaled_data[0], scaled_data[1]])

# Trenowanie modelu na czterech Głównych Składowych
PC9_data = train_and_check(knn, X_train_pca[:,:9], X_test_pca[:,:9], y_train, y_test)
results.add_row(['9 PC - KNN', PC9_data[0], PC9_data[1]])

# Trenowanie modelu na czterech Głównych Składowych
PC4_data = train_and_check(knn, X_train_pca[:,:8], X_test_pca[:,:8], y_train, y_test)
results.add_row(['8 PC - KNN', PC4_data[0], PC4_data[1]])

# Trenowanie modelu na czterech Głównych Składowych
PC4_data = train_and_check(knn, X_train_pca[:,:7], X_test_pca[:,:7], y_train, y_test)
results.add_row(['7 PC - KNN', PC4_data[0], PC4_data[1]])

# Trenowanie modelu na czterech Głównych Składowych
PC4_data = train_and_check(knn, X_train_pca[:,:6], X_test_pca[:,:6], y_train, y_test)
results.add_row(['6 PC - KNN', PC4_data[0], PC4_data[1]])

# Trenowanie modelu na czterech Głównych Składowych
PC4_data = train_and_check(knn, X_train_pca[:,:5], X_test_pca[:,:5], y_train, y_test)
results.add_row(['5 PC - KNN', PC4_data[0], PC4_data[1]])

# Trenowanie modelu na czterech Głównych Składowych
PC4_data = train_and_check(knn, X_train_pca[:,:4], X_test_pca[:,:4], y_train, y_test)
results.add_row(['4 PC - KNN', PC4_data[0], PC4_data[1]])

# Trenowanie modelu na trzech Głównych Składowych
PC3_data = train_and_check(knn, X_train_pca[:, :3], X_test_pca[:, :3], y_train, y_test)
results.add_row(['3 PC - KNN', PC3_data[0], PC3_data[1]])

# Trenowanie modelu na dwóch Głównych Składowych
PC2_data = train_and_check(knn, X_train_pca[:, :2], X_test_pca[:, :2], y_train, y_test)
results.add_row(['2 PC - KNN', PC2_data[0], PC2_data[1]])

# Trenowanie modelu na jednej Głównej Składowej
PC1_data = train_and_check(knn, X_train_pca[:, :1], X_test_pca[:, :1],  y_train, y_test)
results.add_row(['1 PC - KNN', PC1_data[0], PC1_data[1]])
print(results)




X_train.shape: (584, 10), y_train.shape: (584,)
X_test.shape: (147, 10), y_test.shape: (147,)
+-------------------------+------------+--------------------------------+
|          Model          | Dokładność | Czas trenowania (microseconds) |
+-------------------------+------------+--------------------------------+
| Nieskalowane dane - KNN |   0.504    |              3091              |
|   Skalowane dane - KNN  |   0.6026   |              998               |
|        9 PC - KNN       |   0.5949   |              1947              |
|        8 PC - KNN       |   0.5066   |              1994              |
|        7 PC - KNN       |   0.4901   |              1957              |
|        6 PC - KNN       |   0.4856   |              997               |
|        5 PC - KNN       |   0.2237   |               0                |
|        4 PC - KNN       |   0.2846   |               0                |
|        3 PC - KNN       |   -0.164   |               0                |
|        2 PC - KN