In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [2]:
# Read the training data from disk.
file_path = 'training_data.csv'
data = pd.read_csv(file_path)
# Derive a 0/1 target: 1 where the label is 'high_bike_demand', 0 for any other value.
is_high_demand = data['increase_stock'] == 'high_bike_demand'
data['increase_stock_binary'] = is_high_demand.astype('int64')


In [3]:
from sklearn.preprocessing import MinMaxScaler

# Helper function to convert to cyclical features
def encode_cyclical(data, col, max_val):
    """Attach sine/cosine encodings of a periodic column to ``data``.

    Two columns named ``<col>_sin`` and ``<col>_cos`` are written onto
    ``data`` itself (the frame is modified in place), and the same object
    is returned so the call can be used in an assignment.

    Args:
        data: DataFrame that contains the column ``col``.
        col: Name of the numeric column to encode.
        max_val: Length of one full cycle (e.g. 24 for an hour of the day).

    Returns:
        The input ``data`` with the two new columns added.
    """
    # Map each value to an angle on the unit circle.
    angle = 2 * np.pi * data[col] / max_val
    for suffix, trig in (('_sin', np.sin), ('_cos', np.cos)):
        data[col + suffix] = trig(angle)
    return data

# Encode the hour as sine/cosine components of a 24-hour cycle.
data = encode_cyclical(data, 'hour_of_day', 24)

# Interaction feature: product of the weekday index and the hour.
data['day_hour_interaction'] = data['day_of_week'] * data['hour_of_day']

# Min-max scale ten numeric weather/season columns into matching '<name>_norm' columns.
weather_cols = [
    'temp', 'humidity', 'windspeed', 'snow', 'snowdepth',
    'summertime', 'dew', 'precip', 'cloudcover', 'visibility',
]
norm_cols = [name + '_norm' for name in weather_cols]
scaler = MinMaxScaler()
data[norm_cols] = scaler.fit_transform(data[weather_cols])

# Seasonal factor: scaled summertime value minus the two scaled snow measures.
snow_total = data['snow_norm'] + data['snowdepth_norm']
data['seasonal_factor'] = data['summertime_norm'] - snow_total

# Weather score: weighted blend of scaled temperature (0.4), humidity (0.3) and wind speed (0.3).
data['weather_score'] = (
    data['temp_norm'] * 0.4
    + data['humidity_norm'] * 0.3
    + data['windspeed_norm'] * 0.3
)

# Special-day feature: month and weekday are divided by 12 and 7 respectively,
# while 'holiday' is added unscaled so it carries the largest weight.
data['special_day'] = data['month'] / 12 + data['holiday'] + data['weekday'] / 7

# Remove source columns that were encoded or folded into the derived features,
# plus the intermediate '_norm' columns that only fed 'seasonal_factor'.
consumed_cols = [
    'hour_of_day', 'day_of_week', 'temp', 'humidity', 'windspeed',
    'month', 'holiday', 'weekday', 'snow', 'snowdepth', 'summertime',
    'snow_norm', 'snowdepth_norm', 'summertime_norm',
]
data = data.drop(columns=consumed_cols)

# Preview the engineered frame.
data.head()

Unnamed: 0,dew,precip,cloudcover,visibility,increase_stock,increase_stock_binary,hour_of_day_sin,hour_of_day_cos,day_hour_interaction,temp_norm,humidity_norm,windspeed_norm,dew_norm,precip_norm,cloudcover_norm,visibility_norm,seasonal_factor,weather_score,special_day
0,-15.0,0.0,31.6,16.0,low_bike_demand,0,0.965926,0.258819,25,0.042506,0.450143,0.372146,0.079625,0.0,0.316,1.0,0.0,0.263689,0.083333
1,-12.8,0.0,85.7,16.0,low_bike_demand,0,-0.707107,0.707107,84,0.174497,0.298905,0.545662,0.131148,0.0,0.857,1.0,0.0,0.323169,0.22619
2,21.8,0.0,81.1,16.0,low_bike_demand,0,-0.707107,0.707107,63,0.805369,0.684674,0.0,0.941452,0.0,0.811,1.0,1.0,0.52755,0.809524
3,-4.0,0.0,0.0,16.0,low_bike_demand,0,0.258819,0.965926,6,0.272931,0.522251,0.438356,0.337237,0.0,0.0,1.0,0.0,0.397355,0.083333
4,-11.4,0.0,44.6,16.0,low_bike_demand,0,-0.965926,-0.258819,0,0.465324,0.034031,0.239726,0.163934,0.0,0.446,1.0,0.0,0.268257,0.392857


In [4]:
# Split the engineered frame into a feature matrix and the binary target.
# Both label columns are excluded, together with the raw weather columns whose
# '_norm' counterparts remain in the frame.
non_feature_cols = [
    'increase_stock', 'increase_stock_binary',
    'dew', 'precip', 'cloudcover', 'visibility',
]
X = data.drop(columns=non_feature_cols)
y = data['increase_stock_binary']

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [5]:
# Build one pipeline that standardizes the features and then applies KNN.
# Keeping the scaler inside the pipeline means it is fitted on training rows
# only, both for the final fit and inside every cross-validation fold, and it
# leaves X_train / X_test untouched so this cell can be re-run safely.
knn = KNeighborsClassifier(n_neighbors=2)
pipeline = make_pipeline(StandardScaler(), knn)

# Fit on the unscaled training split; the pipeline performs the scaling.
pipeline.fit(X_train, y_train)

# Predict on the held-out test split.
y_pred = pipeline.predict(X_test)

# 10-fold cross-validation (cv = number of folds) on the TRAINING data.
# The test split is not used here, so it remains a clean hold-out set.
# `scores` holds one accuracy per fold; inspect it with e.g. scores.mean().
scores = cross_val_score(pipeline, X_train, y_train, cv=10)

# Hold-out metrics: classification report, confusion matrix and accuracy.
classification_report_result = classification_report(y_test, y_pred)
confusion_matrix_result = confusion_matrix(y_test, y_pred)
accuracy_score_data = accuracy_score(y_test, y_pred)

# Print the results.
print("Classification Report:\n", classification_report_result)
#print("Confusion Matrix:\n", confusion_matrix_result)
print("accuracy_score:\n",accuracy_score_data)

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.98      0.93       270
           1       0.74      0.34      0.47        50

    accuracy                           0.88       320
   macro avg       0.81      0.66      0.70       320
weighted avg       0.87      0.88      0.86       320

accuracy_score:
 0.878125


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Replace the text labels in 'increase_stock' with LabelEncoder's integer codes.
# Note: this column is dropped from X below and the target is read from
# 'increase_stock_binary', so the encoded values do not feed this cell's split.
le = LabelEncoder()
encoded_labels = le.fit_transform(data['increase_stock'])
data['increase_stock'] = encoded_labels

# Separate the feature matrix from the binary target.
non_feature_cols = [
    'increase_stock', 'increase_stock_binary',
    'dew', 'precip', 'cloudcover', 'visibility',
]
X = data.drop(columns=non_feature_cols)
y = data['increase_stock_binary']

# Re-split with a 25% test share and a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [7]:
# Display the feature matrix X as this cell's output.
X

Unnamed: 0,hour_of_day_sin,hour_of_day_cos,day_hour_interaction,temp_norm,humidity_norm,windspeed_norm,dew_norm,precip_norm,cloudcover_norm,visibility_norm,seasonal_factor,weather_score,special_day
0,0.965926,0.258819,25,0.042506,0.450143,0.372146,0.079625,0.000000,0.316,1.000000,0.000000,0.263689,0.083333
1,-0.707107,0.707107,84,0.174497,0.298905,0.545662,0.131148,0.000000,0.857,1.000000,0.000000,0.323169,0.226190
2,-0.707107,0.707107,63,0.805369,0.684674,0.000000,0.941452,0.000000,0.811,1.000000,1.000000,0.527550,0.809524
3,0.258819,0.965926,6,0.272931,0.522251,0.438356,0.337237,0.000000,0.000,1.000000,0.000000,0.397355,0.083333
4,-0.965926,-0.258819,0,0.465324,0.034031,0.239726,0.163934,0.000000,0.446,1.000000,0.000000,0.268257,0.392857
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,0.707107,0.707107,15,0.684564,0.854712,0.242009,0.885246,0.000000,0.244,1.000000,1.000000,0.602842,0.500000
1596,-0.500000,-0.866025,0,0.722595,0.792242,0.223744,0.901639,0.085694,0.921,0.647799,1.000000,0.593834,0.642857
1597,-0.258819,-0.965926,0,0.514541,0.203237,0.415525,0.379391,0.000000,0.793,1.000000,0.701937,0.391445,0.392857
1598,-0.500000,-0.866025,70,0.465324,0.074250,0.132420,0.213115,0.000000,0.244,1.000000,1.000000,0.248131,0.250000


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
import numpy as np

# A simple fully connected neural network with one hidden layer
class SimpleDNN(nn.Module):
    """Feed-forward classifier: Linear -> ReLU -> Linear.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Width of the hidden layer.
        num_classes: Number of output scores produced per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the raw class scores for ``x`` (no softmax is applied here)."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Network and training hyper-parameters
input_size = X_train.shape[1]
hidden_size = 128
num_classes = len(np.unique(y_train))
num_epochs = 200
batch_size = 64
learning_rate = 0.001

# Seed PyTorch so that weight initialisation and batch shuffling are
# repeatable from one run of this cell to the next.
torch.manual_seed(0)

# Standardize the features; the scaler is fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors (float32 features, int64 class labels as
# required by CrossEntropyLoss).
X_train_torch = torch.tensor(X_train_scaled.astype(np.float32))
y_train_torch = torch.tensor(y_train.values.astype(np.int64))
X_test_torch = torch.tensor(X_test_scaled.astype(np.float32))
y_test_torch = torch.tensor(y_test.values.astype(np.int64))

# Wrap the training tensors in a shuffled mini-batch loader.
train_dataset = TensorDataset(X_train_torch, y_train_torch)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# Initialise the network, the loss and the optimizer.
model = SimpleDNN(input_size, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train the model.
model.train()
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

# Evaluate on the held-out test split. Gradient tracking is disabled because
# no backward pass is needed for inference.
model.eval()
with torch.no_grad():
    outputs = model(X_test_torch)
# Predicted class = index of the largest score in each row.
predicted = torch.max(outputs, 1).indices
accuracy_dnn = (predicted == y_test_torch).sum().item() / y_test_torch.size(0)


In [9]:
accuracy_dnn

0.895