In [None]:
!pip install pygam

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pickle
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline

from pygam import LinearGAM, s, f

# Data Loading and Merging
Load all the CSV files generated by the Data Extraction step and merge them into a single DataFrame.

In [None]:
# Month-end dates identifying the monthly extracts; each one maps to a file
# named ./traffic_data/traffic_data_<date>.csv.
dates = [
    '2022-01-31',
    '2022-02-28',
    '2022-03-31',
    '2022-04-30',
    '2022-05-31',
    '2022-06-30',
    '2022-07-31',
    '2022-08-31',
    '2022-09-30',
    '2022-10-31',
    '2022-11-30',
    '2022-12-31',
]

# Read one DataFrame per date. Iterating over the dates directly (instead of
# indexing through range(len(dates))) keeps the loop readable, and the
# comprehension avoids leaking a throwaway global named `df`.
s3dfs = [
    pd.read_csv(f'./traffic_data/traffic_data_{date}.csv')
    for date in tqdm(dates, desc="Processing dates")
]

In [None]:
# Stack the monthly frames row-wise and renumber the index from 0.
s3df = pd.concat(s3dfs, axis=0, ignore_index=True)
# Preview the first five rows.
s3df.head()

In [None]:
# Total number of rows across all monthly extracts.
s3df.shape[0]

In [None]:
# Inspect the column dtypes of the merged raw frame.
s3df.dtypes

In [None]:
# Parse the raw `datetime` column and keep only its calendar-date part, so
# that the later group-by aggregates per day.
s3df['datetime'] = pd.to_datetime(s3df['datetime']).dt.date

# Data Aggregation
The goal is to group the data by `container_group`, `datetime`, and `disk_capacity_tb` and count the `chunk_id` values in each group, to understand how they are distributed across dates and disk configurations. This gives the number of transactions recorded in the past, which can later be used to predict future transaction counts.

In [None]:
# Number of non-null `chunk_id` values per (container group, date, disk capacity).
s3df_agg = (
    s3df
    .groupby(['container_group', 'datetime', 'disk_capacity_tb'])['chunk_id']
    .count()
)

In [None]:
# Flatten the grouped Series into a frame with a `count` column and turn the
# date keys back into pandas datetimes so the `.dt` accessor is available.
df = (
    s3df_agg
    .reset_index(name='count')
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
)
df.head()

In [None]:
# Inspect the column dtypes of the aggregated frame (after the datetime conversion).
df.dtypes

# EDA

In [None]:
# Summary statistics of the aggregated frame
print(df.describe())

# Univariate view: one histogram per column that `DataFrame.hist` can plot
df.hist(figsize=(15, 10), bins=15)
plt.show()

# Bivariate view: every numeric feature plotted against the target `count`
numeric_columns = df.select_dtypes(include=['number']).columns
for column in numeric_columns:
    if column == 'count':
        # The target is the y-axis, so it is not plotted against itself.
        continue
    df.plot(x=column, y='count', kind='scatter')
    plt.show()

# Target over time, drawn as individual black markers
plt.plot(df['datetime'], df['count'], 'o', color='black')

# Outlier detection with the 1.5 * IQR rule, applied column by column
numeric_values = df[numeric_columns]
Q1 = numeric_values.quantile(0.25)
Q3 = numeric_values.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outlier_condition = (numeric_values < lower_fence) | (numeric_values > upper_fence)
# A row counts as an outlier if any of its numeric columns is outside the fences.
outliers = df[outlier_condition.any(axis=1)]
print(f"Number of outliers detected: {outliers.shape[0]}")

In [None]:
# Box plot of the target to make extreme `count` values visible
sns.boxplot(data=df, x='count')
plt.show()

In [None]:
class DataModel:
    """Regression wrapper that predicts the per-day transaction `count`.

    The wrapper bundles feature engineering, training/evaluation, single-row
    prediction and persistence around one of four regressors selected by
    ``model_type``: ``'random_forest'``, ``'lasso'``, ``'gam'`` or (default,
    also used for any unrecognised value) XGBoost.

    Attributes set while training:
        feature_columns_: column order of the training matrix, reused by
            ``predict`` so inference rows match the fitted model.
        reference_year_: latest year seen in the training data; used as the
            default year when a GAM prediction needs a timestamp.
        metrics_: dict with the hold-out metrics of the last ``train`` call.
    """

    # Fixed periods for the cyclical encodings. The day period is a constant
    # (not the maximum day present in the data) so that training and
    # prediction always encode a given day identically.
    DAYS_IN_CYCLE = 31
    MONTHS_IN_CYCLE = 12
    # Rows whose |z-score| of `count` reaches this value are dropped as outliers.
    OUTLIER_Z_THRESHOLD = 3

    def __init__(self, model_type='xgboost'):
        """Create the underlying estimator for ``model_type``."""
        self.model_type = model_type
        if model_type == 'random_forest':
            self.model = RandomForestRegressor(n_estimators=100, random_state=42)
        elif model_type == 'lasso':
            self.model = make_pipeline(StandardScaler(), Lasso(alpha=0.1, random_state=42))
        elif model_type == 'gam':
            self.model = LinearGAM()
        else:
            self.model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
        self.label_encoder = LabelEncoder()
        self.feature_columns_ = None
        self.reference_year_ = None
        self.metrics_ = None

    @classmethod
    def _cyclical_features(cls, day, month):
        """Return the sin/cos encodings of ``day`` and ``month``.

        Works for scalars and for pandas Series; shared by ``preprocess`` and
        ``predict`` so both code paths use exactly the same formula.
        """
        day_angle = day * (2. * np.pi / cls.DAYS_IN_CYCLE)
        month_angle = (month - 1) * (2. * np.pi / cls.MONTHS_IN_CYCLE)
        return {
            'day_sin': np.sin(day_angle),
            'day_cos': np.cos(day_angle),
            'month_sin': np.sin(month_angle),
            'month_cos': np.cos(month_angle),
        }

    def preprocess(self, df):
        """Turn the aggregated frame into a numeric training frame.

        Args:
            df: frame with columns ``container_group``, ``datetime``
                (datetime64), ``disk_capacity_tb`` and ``count``.

        Returns:
            A new frame (the input is not modified) holding ``count`` plus the
            engineered features, with outlier rows and rows containing NaN
            removed. As a side effect the label encoder is (re)fitted on
            ``container_group`` and ``reference_year_`` is updated.
        """
        xdf = df.copy(deep=True)
        xdf = xdf.sort_values(by=['container_group', 'disk_capacity_tb', 'datetime'])

        # Encoding container_group
        xdf['container_group_encoded'] = self.label_encoder.fit_transform(xdf['container_group'])

        # Remember the latest training year; GAM predictions fall back to it
        # when the caller does not pass an explicit year.
        latest = xdf['datetime'].max()
        if pd.notna(latest):
            self.reference_year_ = int(latest.year)

        if self.model_type == 'gam':
            # The GAM additionally receives the date as seconds since the epoch.
            xdf['timestamp'] = xdf['datetime'].apply(lambda x: x.timestamp())

        # Extracting datetime features
        xdf['month'] = xdf['datetime'].dt.month
        xdf['day'] = xdf['datetime'].dt.day

        # Cyclical encoding of day and month
        for name, values in self._cyclical_features(xdf['day'], xdf['month']).items():
            xdf[name] = values

        # Dropping original columns
        xdf = xdf.drop(['datetime', 'container_group'], axis=1)

        # Remove outliers. With zero variance every z-score would be NaN and
        # the filter would silently discard all rows, so it is skipped then.
        counts = xdf['count'].to_numpy(dtype=float)
        if counts.size and np.nanstd(counts) > 0:
            z = np.abs(stats.zscore(counts))
            xdf = xdf[z < self.OUTLIER_Z_THRESHOLD]

        return xdf.dropna()

    def train(self, df):
        """Fit the model on an 80/20 split of ``df`` and print hold-out metrics.

        The metrics (MAE, MSE, MAPE, R2) are printed and also stored in
        ``metrics_``. MAPE is computed over rows whose actual value is
        non-zero, because the percentage error is undefined for zero actuals.
        """
        xdf = self.preprocess(df)
        X = xdf.drop('count', axis=1)
        y = xdf['count']
        # Recorded so that predict() can build rows in the same column order.
        self.feature_columns_ = list(X.columns)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=9)

        if self.model_type == 'gam':
            # pygam is given plain arrays for both fitting and prediction.
            self.model.gridsearch(X_train.values, y_train.values)
            y_pred = self.model.predict(X_test.values)
        else:
            self.model.fit(X_train, y_train)
            y_pred = self.model.predict(X_test)

        # Calculate metrics
        y_pred = np.asarray(y_pred, dtype=float)
        y_true = y_test.to_numpy(dtype=float)
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        nonzero = y_true != 0
        if nonzero.any():
            mape = np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100
        else:
            mape = float('nan')
        r2 = r2_score(y_test, y_pred)
        self.metrics_ = {'mae': mae, 'mse': mse, 'mape': mape, 'r2': r2}

        print(f'Mean Absolute Error (MAE): {mae}')
        print(f'Mean Squared Error (MSE): {mse}')
        print(f'Mean Absolute Percentage Error (MAPE): {mape}%')
        print(f'R-squared (R2): {r2}')

    def predict(self, day, month, disk_capacity, container_group, year=None):
        """Predict the transaction count for a single day.

        Args:
            day: day of month.
            month: month number (1-12).
            disk_capacity: value for the ``disk_capacity_tb`` feature.
            container_group: container group label seen during training.
            year: only used by the ``'gam'`` model, whose training features
                include a timestamp; defaults to the latest training year.

        Returns:
            The model output truncated to ``int``.

        Raises:
            ValueError: if ``container_group`` was not seen during training,
                or (GAM only) if no year is available or the day/month/year
                combination is not a valid calendar date.
        """
        container_group_encoded = self.label_encoder.transform([container_group])[0]

        # Build the row in the same column order that preprocess() produces.
        features = {
            'disk_capacity_tb': [disk_capacity],
            'container_group_encoded': [container_group_encoded],
        }
        if self.model_type == 'gam':
            if year is None:
                year = getattr(self, 'reference_year_', None)
            if year is None:
                raise ValueError("a 'year' is required to predict with the 'gam' model")
            moment = pd.Timestamp(year=int(year), month=int(month), day=int(day))
            features['timestamp'] = [moment.timestamp()]
        features['month'] = [month]
        features['day'] = [day]
        for name, value in self._cyclical_features(day, month).items():
            features[name] = [value]
        sample_input = pd.DataFrame(features)

        # Align with the exact training column order when it is known.
        columns = getattr(self, 'feature_columns_', None)
        if columns:
            sample_input = sample_input[columns]

        model_input = sample_input.values if self.model_type == 'gam' else sample_input
        return int(self.model.predict(model_input)[0])

    def save_model(self, filename='SamplePredictor.pickle'):
        """Pickle this whole wrapper (model and label encoder) to ``filename``."""
        # `handle` rather than `f`, which this notebook imports from pygam.
        with open(filename, 'wb') as handle:
            pickle.dump(self, handle)


In [None]:
# One wrapper per candidate regressor.
xg_data_model = DataModel(model_type='xgboost')
rf_data_model = DataModel(model_type='random_forest')
l_data_model = DataModel(model_type='lasso')
gam_data_model = DataModel(model_type='gam')

# Train each model in turn; the banner printed before each run separates the
# metric reports in the cell output.
for banner, data_model in (
    ('XGBoost', xg_data_model),
    ('\n\nRandom Forest', rf_data_model),
    ('\n\nLasso', l_data_model),
    ('\n\nGAM', gam_data_model),
):
    print(banner)
    data_model.train(df)

In [None]:
# Sample query: day of month, month, disk capacity and container group.
day, month, disk_capacity = 15, 7, 20
container_group = '02892102A8F17B5A551466B444222F4C3D9A399F'
print(f'For sample input day : {day}, month : {month}, disk capacity : {disk_capacity}, container group : {container_group}')

# Ask the random-forest model for its prediction and show it.
sample_prediction = rf_data_model.predict(
    day=day,
    month=month,
    disk_capacity=disk_capacity,
    container_group=container_group,
)
print(sample_prediction)

# Persist the random-forest wrapper under its default file name.
rf_data_model.save_model()

In [None]:
# Random-forest prediction for 1 January with a disk capacity of 16.
rf_january_prediction = rf_data_model.predict(
    day=1,
    month=1,
    disk_capacity=16,
    container_group='02892102A8F17B5A551466B444222F4C3D9A399F',
)
print(rf_january_prediction)

In [None]:
# XGBoost prediction for the same query, for comparison with the random forest.
xg_january_prediction = xg_data_model.predict(
    day=1,
    month=1,
    disk_capacity=16,
    container_group='02892102A8F17B5A551466B444222F4C3D9A399F',
)
print(xg_january_prediction)

# Time Series Analysis : Prophet

In [None]:
!pip install prophet

In [None]:
import pandas as pd
from prophet import Prophet
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Prophet expects the time column to be called `ds` and the target `y`;
# `rename` returns a new frame, so `df` itself is left untouched.
pdf = df.rename(columns={'datetime': 'ds', 'count': 'y'})

# One-hot encode both grouping columns so they can be passed to Prophet as
# extra regressors.
categorical_columns = ['container_group', 'disk_capacity_tb']
encoder = OneHotEncoder()
categorical_features = encoder.fit_transform(pdf[categorical_columns])
categorical_features_df = pd.DataFrame(
    categorical_features.toarray(),
    columns=encoder.get_feature_names_out(categorical_columns),
)

# Attach the indicator columns (index-aligned join) and drop the originals.
pdf = pdf.join(categorical_features_df).drop(columns=categorical_columns)

In [None]:
# Chronological split: rows before the cutoff train the model, rows on or
# after it are held out for evaluation.
cutoff_date = '2022-11-01'
train_df = pdf.loc[pdf['ds'] < cutoff_date]
test_df = pdf.loc[pdf['ds'] >= cutoff_date]

In [None]:
m = Prophet()

# Register every one-hot indicator column as an additional regressor;
# iterating a DataFrame yields its column labels.
for regressor_name in categorical_features_df:
    m.add_regressor(regressor_name)

m.fit(train_df)

In [None]:
regressor_columns = list(categorical_features_df.columns)

# NOTE(review): `future_dates` is not referenced by any later cell visible
# here; the forecast below is made on the held-out rows instead.
future_dates = m.make_future_dataframe(periods=len(test_df))

# Prediction input: the dates plus the regressor columns registered on the model.
test_df_prepared = test_df[['ds', *regressor_columns]]

forecast = m.predict(test_df_prepared)

In [None]:
# Plot the forecast with Prophet's built-in plotting helper; the figure is kept in `fig1`.
fig1 = m.plot(forecast)

In [None]:
# `test_df` keeps the row order of the aggregated frame (grouped by container
# group first), while Prophet orders its prediction input by `ds`, so the
# forecast rows come back in date order. Attaching `yhat` positionally without
# re-ordering would pair predictions with the wrong actuals. Both sides are
# therefore put into the same stable date order before being combined.
# NOTE(review): exact pairing of rows that share a date relies on Prophet
# sorting stably (mergesort) - confirm for the installed Prophet version.
temp_test_df = (
    test_df[['ds', 'y']]
    .sort_values('ds', kind='mergesort')
    .reset_index(drop=True)
)
forecast_df = (
    forecast[['ds', 'yhat']]
    .sort_values('ds', kind='mergesort')
    .reset_index(drop=True)
)

# Guard against any remaining misalignment before the metrics are computed.
assert len(temp_test_df) == len(forecast_df), "forecast and test set differ in length"
assert (temp_test_df['ds'].to_numpy() == forecast_df['ds'].to_numpy()).all(), \
    "forecast dates do not line up with the test set"

# `temp_test_df` is a fresh frame (not a view of `test_df`), so this assignment
# does not trigger a SettingWithCopyWarning.
temp_test_df['yhat'] = forecast_df['yhat'].to_numpy()

In [None]:
# Actual and forecast values of the held-out period
actual_values = temp_test_df['y']
forecast_values = temp_test_df['yhat']

# Error metrics: MSE, its square root (RMSE), MAE, and MAPE in percent
mse = mean_squared_error(actual_values, forecast_values)
rmse = np.sqrt(mse)
mae = mean_absolute_error(actual_values, forecast_values)
mape = np.mean(np.abs((actual_values - forecast_values) / actual_values)) * 100

# Report
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Absolute Percentage Error (MAPE): {mape}%')