In [332]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold

导入数据

In [333]:
# Load the dataset
file_path = 'training_data.csv'
data = pd.read_csv(file_path)
data['increase_stock_binary'] = data['increase_stock'].apply(lambda x: 1 if x == 'high_bike_demand' else 0)
# Separating features and the target variable


特征工程

In [334]:
#高峰时间段
def peak_hours(hour):
    return 1 if (7 <= hour <= 9) or (16 <= hour <= 18) else 0

data['peak_hour'] = data['hour_of_day'].apply(peak_hours)

In [335]:
#判断今天是不是工作日
def is_workday(row):
    if row['day_of_week'] >= 5:  # If the day is Saturday (5) or Sunday (6), it's not a workday
        return 0
    if row['holiday'] == 1:  # If it's a holiday, it's not a workday
        return 0
    return 1  # Otherwise, it's a workday

# Apply the function to the dataset
data['is_workday'] = data.apply(is_workday, axis=1)

In [336]:
#http://www.qxkp.net/qxbk/qxsy/202103/t20210301_2787280.html
def categorize_precipitation(mm):
    if mm < 10:
        return 1
    elif 10 <= mm < 25:
        return 2
    elif 25 <= mm < 50:
        return 3
    elif 50 <= mm < 100:
        return 4
    elif 100 <= mm < 250:
        return 5
    elif mm >= 250:
        return 6
    else:
        return 0

# Snowfall: Categorizing based on the description
def categorize_snow(snowfall, snowdepth):
    if snowfall == 0 and snowdepth == 0:
        return 0
    elif snowfall == 0 and snowdepth > 0:
        return 1
    elif snowfall > 0 and snowdepth == 0:
        return 2
    elif snowfall > 0 and snowdepth > 0:
        return 3
    else:
        return 4

# Apply the function to the rows of the dataframe
data['snow_condition'] = data.apply(lambda row: categorize_snow(row['snow'], row['snowdepth']), axis=1)

# Now let's drop the original 'snow' and 'snowdepth' columns

# Atmospheric pressure: No categories needed, it's a continuous variable

# Wind speed: The description is related to the Beaufort scale which is different from the wind speed.
# Hence, no categorization is needed. However, if there is a specific request to categorize wind speed based on
# a given scale, that could be done.

# Applying the categorization functions to the precipitation and snowfall columns
data['precip_category'] = data['precip'].apply(categorize_precipitation)

In [337]:
# #处理风速
# def categorize_wind_speed(windspeed):
#     if windspeed < 0.3:
#         return 0
#     elif windspeed < 1.6:
#         return 1
#     elif windspeed < 3.4:
#         return 2
#     elif windspeed < 5.5:
#         return 3
#     elif windspeed < 8.0:
#         return 4
#     elif windspeed < 10.8:
#         return 5
#     elif windspeed < 13.9:
#         return 6
#     elif windspeed < 17.2:
#         return 7
#     elif windspeed < 20.8:
#         return 8
#     elif windspeed < 24.5:
#         return 9
#     elif windspeed < 28.5:
#         return 10
#     elif windspeed < 32.7:
#         return 11
#     else:
#         return 12

# # Apply the function to the 'windspeed' column
# data['wind_level'] = data['windspeed'].apply(categorize_wind_speed)
apparent_temp = 1.07 * data['temp'] + 0.2 * data['dew'] - 0.65 * data['windspeed'] - 2.7

# Add the calculated apparent temperatures to the dataframe
data['apparent_temp'] = apparent_temp

In [338]:
#一年四季
# Function to categorize months into seasons
def categorize_season(month):
    if month in [12, 1, 2]:
        return 0
    elif month in [3, 4, 5]:
        return 1
    elif month in [6, 7, 8]:
        return 2
    else:  # months 9, 10, 11
        return 3

# Apply the function to categorize seasons
data['season'] = data['month'].apply(categorize_season)

# Merge 'season' and 'summertime' into a single column
# If summertime is 1, then it overrides the season to 'Summer'
data['season_summertime'] = data.apply(lambda row: 2 if row['summertime'] == 1 else row['season'], axis=1)

In [339]:
data

Unnamed: 0,hour_of_day,day_of_week,month,holiday,weekday,summertime,temp,dew,humidity,precip,...,visibility,increase_stock,increase_stock_binary,peak_hour,is_workday,snow_condition,precip_category,apparent_temp,season,season_summertime
0,5,5,1,0,0,0,-7.2,-15.0,53.68,0.000,...,16.0,low_bike_demand,0,0,0,0,1,-23.999,0,0
1,21,4,1,0,1,0,-1.3,-12.8,40.97,0.000,...,16.0,low_bike_demand,0,0,1,0,1,-22.186,0,0
2,21,3,8,0,1,1,26.9,21.8,73.39,0.000,...,16.0,low_bike_demand,0,0,1,0,1,30.443,2,2
3,1,6,1,0,0,0,3.1,-4.0,59.74,0.000,...,16.0,low_bike_demand,0,0,0,0,1,-12.663,0,0
4,17,0,3,0,1,0,11.7,-11.4,18.71,0.000,...,16.0,low_bike_demand,0,1,1,0,1,0.714,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,3,5,6,0,0,1,21.5,19.4,87.68,0.000,...,16.0,low_bike_demand,0,0,0,0,1,17.295,2,2
1596,14,0,6,0,1,1,23.2,20.1,82.43,2.217,...,10.4,low_bike_demand,0,0,1,0,1,19.774,2,2
1597,13,0,3,0,1,1,13.9,-2.2,32.93,0.000,...,16.0,low_bike_demand,0,0,1,1,1,-0.097,1,2
1598,14,5,3,0,0,1,11.7,-9.3,22.09,0.000,...,16.0,high_bike_demand,1,0,0,0,1,4.189,1,2


In [340]:
#去除掉已经合并好的特征列
data = data.drop(columns=['hour_of_day', 'day_of_week', 'holiday', 'weekday','precip','snow','snowdepth','month','summertime','temp','season']) # windspeed 非常重要

In [341]:
data

Unnamed: 0,dew,humidity,windspeed,cloudcover,visibility,increase_stock,increase_stock_binary,peak_hour,is_workday,snow_condition,precip_category,apparent_temp,season_summertime
0,-15.0,53.68,16.3,31.6,16.0,low_bike_demand,0,0,0,0,1,-23.999,0
1,-12.8,40.97,23.9,85.7,16.0,low_bike_demand,0,0,1,0,1,-22.186,0
2,21.8,73.39,0.0,81.1,16.0,low_bike_demand,0,0,1,0,1,30.443,2
3,-4.0,59.74,19.2,0.0,16.0,low_bike_demand,0,0,0,0,1,-12.663,0
4,-11.4,18.71,10.5,44.6,16.0,low_bike_demand,0,1,1,0,1,0.714,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,19.4,87.68,10.6,24.4,16.0,low_bike_demand,0,0,0,0,1,17.295,2
1596,20.1,82.43,9.8,92.1,10.4,low_bike_demand,0,0,1,0,1,19.774,2
1597,-2.2,32.93,18.2,79.3,16.0,low_bike_demand,0,0,1,1,1,-0.097,2
1598,-9.3,22.09,5.8,24.4,16.0,high_bike_demand,1,0,0,0,1,4.189,2


In [342]:
# X = data.drop(['increase_stock', 'increase_stock_binary'], axis=1)
# y = data['increase_stock_binary']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
 # 将数据集分割为特征和目标
X = data.drop(['increase_stock','increase_stock_binary'], axis=1)
y = data['increase_stock_binary']

 # 将数据分割为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=0)

In [343]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Assuming X_train, X_test, y_train, y_test are already defined

# Creating a pipeline with a standard scaler and a KNN classifier
pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Defining the parameter grid: testing different values for n_neighbors
param_grid = {'kneighborsclassifier__n_neighbors': range(1, 31)}

# Creating a GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid, cv=5)

# Fitting grid_search on the training data
grid_search.fit(X_train, y_train)

# Best parameters found by grid search
print("Best parameters:", grid_search.best_params_)

# Using the best model to make predictions
y_pred = grid_search.predict(X_test)

# Generating a classification report, confusion matrix, and accuracy score
classification_report_result = classification_report(y_test, y_pred)
confusion_matrix_result = confusion_matrix(y_test, y_pred)
accuracy_score_data = accuracy_score(y_test, y_pred)

# Printing the results
print("Classification Report:\n", classification_report_result)
print("Confusion Matrix:\n", confusion_matrix_result)
print("Accuracy Score:\n", accuracy_score_data)

# The cross-validation scores for the best model can also be accessed
print("Best Model Cross-Validation Scores:", grid_search.best_score_)


Best parameters: {'kneighborsclassifier__n_neighbors': 23}
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.96      0.91       324
           1       0.70      0.41      0.52        76

    accuracy                           0.85       400
   macro avg       0.79      0.68      0.72       400
weighted avg       0.84      0.85      0.84       400

Confusion Matrix:
 [[311  13]
 [ 45  31]]
Accuracy Score:
 0.855
Best Model Cross-Validation Scores: 0.8625
