In [44]:
import pandas as pd
import numpy as np
import plotly.express as px
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [45]:
# Define the date range from 2024-01-01 to 2024-08-31
date_range = pd.date_range(start='2024-01-01', end='2024-08-31')

# Initialize random seed for reproducibility
np.random.seed(42)

# Generate initial random data
certificates_issued = np.random.poisson(lam=50, size=len(date_range))

# Introduce variability
for i in range(1, len(certificates_issued)):
    fluctuation = np.random.normal(loc=0, scale=10)
    if np.random.rand() > 0.95:
        fluctuation += np.random.randint(30, 60)
    elif np.random.rand() < 0.05:
        fluctuation -= np.random.randint(20, 40)
    certificates_issued[i] = max(0, certificates_issued[i-1] + fluctuation)

# Create DataFrame
df_certificates = pd.DataFrame({
    'Date': date_range,
    'Certificates_Issued': certificates_issued
})

# Show the size of the data
print(f"Size of the data: {df_certificates.shape}")


Size of the data: (244, 2)


In [46]:
# Adding more lag features
df_certificates['Previous_Week_Issued'] = df_certificates['Certificates_Issued'].shift(7)
df_certificates['Previous_Month_Issued'] = df_certificates['Certificates_Issued'].shift(30)

# Fill missing values
df_certificates.fillna(method='bfill', inplace=True)

# Fill any missing values caused by the shift
df_certificates.fillna(method='bfill', inplace=True)

# Split data into features (X) and target (y)
X = df_certificates[['Previous_Day_Issued']]
y = df_certificates['Certificates_Issued']

# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)



DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.



KeyError: "None of [Index(['Previous_Day_Issued'], dtype='object')] are in the [columns]"

In [48]:
# Line Plot
fig_line = px.line(df_certificates, x='Date', y='Certificates_Issued',
                   title='Simulated Certificates Issued per Day (2024)',
                   labels={'Date': 'Date', 'Certificates_Issued': 'Certificates Issued'})
fig_line.show()

# Ensure 'Date' column is in datetime format
df_certificates['Date'] = pd.to_datetime(df_certificates['Date'])

# Set 'Date' as the index
df_certificates.set_index('Date', inplace=True)

# Resample the data by month and sum the 'Certificates_Issued'
monthly_data = df_certificates.resample('M').sum().reset_index()


# Import Plotly Express if not already imported
import plotly.express as px

# Create the bar chart
fig_monthly = px.bar(monthly_data, x='Date', y='Certificates_Issued',
                    title='Monthly Certificates Issued (2024)',
                    labels={'Date': 'Month', 'Certificates_Issued': 'Certificates Issued'},
                    text='Certificates_Issued')

# Update layout to show text labels on bars
fig_monthly.update_traces(texttemplate='%{text:.0f}', textposition='outside')

# Show the bar chart
fig_monthly.show()



ValueError: Value of 'x' is not the name of a column in 'data_frame'. Expected one of ['Certificates_Issued', 'Previous_Week_Issued', 'Previous_Month_Issued'] but received: Date

In [None]:
# Create lag features for prediction
df_certificates['Previous_Month_Issued'] = df_certificates['Certificates_Issued'].shift(30)

# Fill missing values
df_certificates.fillna(method='bfill', inplace=True)

# Split data into features and target
X = df_certificates[['Previous_Month_Issued']]
y = df_certificates['Certificates_Issued']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.



'scaler = StandardScaler()\nX_train_scaled = scaler.fit_transform(X_train)\nX_test_scaled = scaler.transform(X_test)'

In [None]:
# Initialize and train the XGBoost model
model = xgb.XGBRegressor(n_estimators=200, learning_rate=0.1, max_depth=5)
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')


Mean Squared Error: 29761.814566849982


In [None]:
# Prepare the last month's data for prediction
last_month_data = np.array([[df_certificates['Certificates_Issued'].iloc[-30]]])
last_month_data_scaled = scaler.transform(last_month_data)

# Predict for September 2024
predicted_september = model.predict(last_month_data_scaled)
print(f'Predicted Certificates Issued for September 2024: {predicted_september[0]}')


Predicted Certificates Issued for September 2024: 245.6870574951172



X does not have valid feature names, but StandardScaler was fitted with feature names

