In [None]:
# Standard library
import gc
import os
import pickle
from datetime import datetime

# Third-party
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression

# Project-local helpers
from data import Data
from tools import DataTools

## 🕒 Step 1: Define the Date for Analysis

In [None]:
# Reference moment for the analysis, parsed straight from its "YYYY-MM-DD HH:MM" text
specified_time = datetime.strptime("2022-11-29 10:00", "%Y-%m-%d %H:%M")
# Four-digit year of that moment as a string; later cells interpolate it into
# the weather/pollution/bike file names they ask for
formatted_time = format(specified_time, "%Y")

## 📂 Step 2: Load Bike Station Data

In [None]:
# Input locations, relative to the notebook's working directory.
# os.path.join picks the separator of the host OS: on Windows the resulting
# strings are identical to the previous hard-coded backslash literals, while on
# Linux/macOS they now resolve instead of being read as one odd file name.
path = os.path.join("..", "data_files")
data_directory = os.path.join(path, "data")
# Column names applied to the bike measurement files when they are loaded later
correct_header_data_bikes = ["city", "id", "request_date", "datetime", "bikes"]

# Read the "bike_station" file(s) under `path`
# (the "\t" argument is presumably the field separator - confirm in DataTools)
station_data = DataTools.open_files_in_directory(path, "bike_station", "\t")
bike_station = Data()
# Hand the read result over to the Data wrapper
bike_station.get_data(station_data)
# Restrict the station table to the two cities studied in this notebook
bike_station.filter_dataframes("city", ["amiens", "marseille"])
# The raw read result is not used again; drop the name so it can be reclaimed
del station_data

## 🧪 Step 3: Load Pollution Stations, Weather Data, and Pollution Data

In [None]:
def _load_filtered(directory, file_label, separator, column, wanted):
    """Read files into a fresh ``Data`` object and filter it.

    Forwards ``directory``, ``file_label`` and ``separator`` unchanged to
    ``DataTools.open_files_in_directory``, passes the result to
    ``Data.get_data`` and then calls ``filter_dataframes(column, wanted)``.
    The raw read result is only a local name, so it is released on return.
    """
    raw = DataTools.open_files_in_directory(directory, file_label, separator)
    holder = Data()
    holder.get_data(raw)
    holder.filter_dataframes(column, wanted)
    return holder


# Pollution station table, restricted to the two studied cities
pollution_station = _load_filtered(
    path, "pollution_station", ",", "city", ["amiens", "marseille"]
)

# Weather files for the analysis year, restricted to both cities
data_weather = _load_filtered(
    data_directory, f"weather_{formatted_time}", ",", "name", ["Amiens", "Marseille"]
)

# Pollution files for the analysis year; only "Amiens" is kept here
data_pollution = _load_filtered(
    data_directory, f"pollution_{formatted_time}", ",", "name", ["Amiens"]
)

## 🚲 Step 4: Load and Merge Bike Data

In [None]:
# Read the bike files for the analysis year and relabel their columns in a single
# expression, so only the relabelled result is ever bound to a name
bike_data = DataTools.rename_header(
    DataTools.open_files_in_directory(data_directory, f"bike_{formatted_time}", "\t"),
    correct_header_data_bikes,
    keep_old_header=True,
)

# Wrap the relabelled data and keep the two studied cities
data_bike = Data()
data_bike.get_data(bike_data)
data_bike.filter_dataframes("city", ["amiens", "marseille"])
del bike_data

# Enrich the bike measurements from the bike-station table, matching on "id"
data_bike.data = DataTools.merge_dataframes(
    data_bike.data,
    bike_station.data,
    "id",
    "id",
    ["bike_stands", "latitude", "longitude", "id_pollution"],
)

# Enrich the pollution measurements from the pollution-station table, matching on "id"
data_pollution.data = DataTools.merge_dataframes(
    data_pollution.data,
    pollution_station.data,
    "id",
    "id",
    ["latitude", "longitude"],
)

# The station tables are not referenced after the merges
del pollution_station, bike_station

## 📊 Step 5: Calculate Bike Usage Capacity and Statistics

In [None]:
def _rows_for_city(city_name):
    """Return the rows of ``data_bike.data`` whose "city" value equals ``city_name``."""
    frame = data_bike.data
    return frame[frame["city"] == city_name]


# Slot capacity per city, computed from the full bike table
bike_count_amiens = DataTools.calul_capacity(data_bike.data, "amiens")
bike_count_marseille = DataTools.calul_capacity(data_bike.data, "marseille")

print(f"Number of slots in Amiens: {bike_count_amiens}")
print(f"Number of slots in Marseille: {bike_count_marseille}")

# Usage statistics per city; calculate_use returns three results that are
# unpacked here as daily, per-period and per-hour usage
dailyuse_amiens, period_use_amiens, useperhour_amiens = DataTools.calculate_use(
    _rows_for_city("amiens")
)
dailyuse_marseille, period_use_marseille, useperhour_marseille = DataTools.calculate_use(
    _rows_for_city("marseille")
)

## 🔍 Step 6: Perform Correlation Analysis and Predict Bike Usage

In [None]:
# Columns handed to corr_analysis: the usage column on one side, the
# environmental columns on the other
usage_column = "total_bikes_used"
weather_columns = ("temp", "temp_max", "temp_min", "humidity", "speed", "clouds")
pollution_columns = ("NO", "NO2", "NOX as NO2", "O3", "PM10", "PM2.5")

# NOTE(review): data_weather holds rows for both "Amiens" and "Marseille", and
# data_pollution was filtered to "Amiens" only, yet both are paired with each
# city's usage below - confirm corr_analysis aligns them as intended.

# Amiens usage vs. weather
DataTools.corr_analysis(
    [dailyuse_amiens, data_weather.data], [usage_column, list(weather_columns)]
)

# Amiens usage vs. pollution
DataTools.corr_analysis(
    [dailyuse_amiens, data_pollution.data], [usage_column, list(pollution_columns)]
)

# Marseille usage vs. weather
DataTools.corr_analysis(
    [dailyuse_marseille, data_weather.data], [usage_column, list(weather_columns)]
)

# Marseille usage vs. pollution
DataTools.corr_analysis(
    [dailyuse_marseille, data_pollution.data], [usage_column, list(pollution_columns)]
)

In [None]:
# Fit the usage models for Amiens from daily usage plus weather and pollution data.
# NOTE(review): data_weather was filtered to both "Amiens" and "Marseille";
# confirm predict_bike_usage only pairs Amiens usage with Amiens weather rows.
results = DataTools.predict_bike_usage(
    usage_data=dailyuse_amiens,
    weather_data=data_weather.data,
    pollution_data=data_pollution.data,
    threshold=100,  # Optional: value above which usage is considered "high"
)

# Bound before any branch: the classification sections below read these, and
# without a default they raise NameError (or silently reuse values left over
# from an earlier run of this cell) whenever results has no 'regression' entry.
reg_metrics = None
scaled_data = None

# Only use the results when something was returned and no 'error' key is present
if results and 'error' not in results:
    print("\n===== USING PREDICTION RESULTS =====")

    # 1. Regression model: report its metrics and predict one example day
    if 'regression' in results:
        reg_metrics = results['regression']
        print(f"Regression model R² score: {reg_metrics['r2']:.4f}")
        print(f"RMSE: {reg_metrics['rmse']:.2f} bikes")

        # Example data point to predict for.
        # NOTE(review): assumes 'temp' is in °C like the printed label - confirm
        # against the units used in the weather files.
        new_data = pd.DataFrame({
            'day_of_week': [0],  # Monday
            'month': [6],        # June
            'is_weekend': [0],   # Not weekend
            'temp': [25],        # 25°C
            'humidity': [50]     # 50% humidity
            # Add other features as needed
        })

        # Keep exactly the columns the model was trained on, in training order;
        # any feature not supplied above is filled with the default value 0.
        new_data = new_data.reindex(columns=reg_metrics['features'], fill_value=0)

        # Scale the data using the same scaler used for training
        scaled_data = reg_metrics['scaler'].transform(new_data)

        # Make prediction
        predicted_usage = reg_metrics['model'].predict(scaled_data)[0]
        print(f"Predicted bike usage for a 25°C Monday in June: {predicted_usage:.0f} bikes")

    # 2. Binary classification model (high / low usage)
    if 'classification' in results:
        print("\nBinary classification performance:")
        print(f"Accuracy: {results['classification']['accuracy']:.2f}")

        # Classify the example day only when the regression branch produced
        # its scaled feature row
        if scaled_data is not None and 'model' in results['classification']:
            high_usage = results['classification']['model'].predict(scaled_data)[0]
            print(f"High usage day? {'Yes' if high_usage else 'No'}")

    # 3. Category classification model
    if 'category_classification' in results:
        print("\nCategory classification performance:")
        print(f"Accuracy: {results['category_classification']['accuracy']:.2f}")

        # Predict usage category (low/medium/high) for the example day
        if scaled_data is not None and 'model' in results['category_classification']:
            category = results['category_classification']['model'].predict(scaled_data)[0]
            print(f"Usage category: {category}")

    # 4. Persist the regression model together with its scaler and feature list.
    # Pickle files can execute code when loaded: only reload this file from a
    # location you trust.
    if reg_metrics is not None:
        with open('bike_usage_regression_model.pkl', 'wb') as f:
            pickle.dump({
                'model': reg_metrics['model'],
                'scaler': reg_metrics['scaler'],
                'features': reg_metrics['features']
            }, f)
        print("\nRegression model saved to 'bike_usage_regression_model.pkl'")

    print("\n===== END OF RESULTS USAGE =====")
else:
    print("Prediction failed or returned no results.")

## ✅ Final Step: Clean Up

In [None]:
# Run a garbage-collection pass, then tell the reader the notebook reached its
# final cell
completion_message = "Program completed successfully. Memory freed."
gc.collect()
print(completion_message)