## Gaussian Copula Synthesizer

In [None]:
!pip install pyathena

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
from pyathena import connect
from pyathena.pandas.cursor import PandasCursor
import pandas as pd
import numpy as np

### Querying the S3 bucket

In [None]:
def run_athena_query(query, print_out=False):
    """Execute an Athena query and return its result as a pandas DataFrame.

    Args:
        query: SQL text handed to the Athena cursor.
        print_out: when true, also print the result as a markdown table
            (without the index).

    Returns:
        The DataFrame produced by the cursor's ``as_pandas()``.
    """
    connection = connect(
        region_name='us-west-2',
        work_group="primary",
        cursor_class=PandasCursor,
    )
    result = connection.cursor().execute(query).as_pandas()

    if not print_out:
        return result

    print(result.to_markdown(index=False))
    return result

In [None]:
query = "select * from AwsDataCatalog.uwdatascience2023.full_harddrivetraffic limit 3000000"

In [None]:
athena_df = run_athena_query(query, print_out=False)

In [None]:
athena_df.head()

### Data cleaning and preprocessing

In [None]:
# Order rows by chunk and time so that a forward fill inside each chunk
# carries the most recent earlier value.
df = athena_df.sort_values(by=['chunk_id', 'timestamp_nano'])

# Fill gaps in the per-chunk attributes from the preceding row of the same chunk.
for column in ('container_group', 'container_encoding', 'chunk_size'):
    df[column] = df.groupby('chunk_id')[column].ffill()

# Rows that still contain a missing value are discarded.
df.dropna(inplace=True)

In [None]:
# Replace the nanosecond timestamp column with a datetime column:
# pop() removes 'timestamp_nano' from df and hands its values to the conversion.
df['datetime_column'] = pd.to_datetime(df.pop('timestamp_nano'), unit='ns')

In [None]:
df['chunk_size'] = df['chunk_size'].astype(int)

In [None]:
df.describe()

In [None]:
# Column names of interest.
# NOTE(review): 'timestamp_nano' has already been removed from df by the
# preprocessing above, and this list does not appear to be referenced by any
# later cell - confirm before relying on it.
features = ['timestamp_nano', 'location_id', 'server_id', 'config_id', 'disk_id', 'container_id', 'container_group', 'container_encoding', 'operation', 'chunk_id', 'chunk_size']

In [None]:
# Fraction of the rows of df (taken from the top, in df's current order)
# that goes into the training set; 1 keeps every row.
train_percentage = 1
split_index = int(len(df) * train_percentage)
# Take an explicit copy: train_data is modified in later cells, and editing a
# positional slice of df is ambiguous (view vs. copy) in pandas.
train_data = df.iloc[:split_index, :].copy()

In [None]:
# Dropping month_end (datetime_column is kept; later cells still use it)
columns_to_delete = ['month_end']
# Rebind instead of dropping in place, so the result is a frame of its own
# rather than an in-place edit of a slice of df.
train_data = train_data.drop(columns=columns_to_delete)

In [None]:
# Grouping by 'container_group' and collecting the DISTINCT 'container_id'
# values of each group into a list. Keeping one entry per row would let
# random.sample (which draws list positions) return the same id twice.
unique_ids_per_group = train_data.groupby('container_group')['container_id'].unique()
grouped_data = unique_ids_per_group.apply(lambda ids: ids.tolist()).reset_index()

# Creating a dictionary {container_group: [container_id, ...]} from the grouped data
container_dict = dict(zip(grouped_data['container_group'], grouped_data['container_id']))

In [None]:
# Keep only the date part (time set to midnight) while leaving the column name
# and its datetime dtype unchanged. normalize() does this directly, without
# formatting every value as a string and parsing it back.
train_data['datetime_column'] = train_data['datetime_column'].dt.normalize()

In [None]:
train_data.head()

### Metadata analysis

In [None]:
!pip install sdv

In [None]:
from sdv.metadata import SingleTableMetadata

In [None]:
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(train_data)

In [None]:
metadata.update_column(
    column_name='container_id',
    sdtype='categorical')

In [None]:
metadata.update_column(
    column_name='chunk_id',
    sdtype='categorical')

In [None]:
metadata.update_column(
    column_name='disk_capacity_tb',
    sdtype='categorical')

In [None]:
metadata

### Raw synthetic data

In [None]:
from sdv.single_table import GaussianCopulaSynthesizer

In [None]:
synthesizer = GaussianCopulaSynthesizer(metadata,
                                        enforce_min_max_values=False,
                                        enforce_rounding=True,
                                        locales='en_US'
                                       )

In [None]:
synthesizer.fit(train_data)

In [None]:
synthetic_data = synthesizer.sample(num_rows=2000)

In [None]:
synthetic_data.head()

### Visual inspection

In [None]:
train_data.groupby('container_group').count()

In [None]:
synthetic_data.groupby('container_group').count()

### Sanity checks
These checks ensure that all primary keys are unique and non-null, that discrete values in the synthetic data adhere to the same categories as the real data, and that every column present in the training data is also present in the synthetic data.

In [None]:
from sdv.evaluation.single_table import run_diagnostic

In [None]:
# perform basic validity checks
diagnostic = run_diagnostic(
    real_data=train_data,
    synthetic_data=synthetic_data,
    metadata=metadata
)

In [None]:
from sdv.evaluation.single_table import get_column_plot

In [None]:
# plot the data
fig = get_column_plot(
    real_data=train_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
    column_name='disk_capacity_tb'
)
    
fig.show()

In [None]:
# plot the data
fig = get_column_plot(
    real_data=train_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
    column_name='container_group'
)
    
fig.show()

In [None]:
# plot the data
fig_operation = get_column_plot(
    real_data=train_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
    column_name='operation'
)
    
fig_operation.show()

In [None]:
# plot the data
fig = get_column_plot(
    real_data=train_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
    column_name='chunk_size'
)
    
fig.show()

### Data quality reports

In [None]:
!pip install sdmetrics

In [None]:
from sdmetrics.reports.single_table import QualityReport

In [None]:
report = QualityReport()

The metrics below are best compared between unconditioned synthetic data and the training data, because conditional synthetic data has fixed parameters. For example, with disk capacity fixed at 16 TB, we can't expect the disk capacity column in the synthetic data to match the distribution of the corresponding column in the original data, which ranges from 10 to 32 TB.

#### Column shape
The shape of a column describes its overall distribution. The higher the score, the more similar the distributions of real and synthetic data

#### Column pair trends
The trend between two columns describes how they vary in relation to each other, for example the correlation. The higher the score, the more the trends are alike. In our case, we don't pay attention to this score since most of our columns are categorical and unrelated by nature.

In [None]:
report.generate(train_data.sample(n=2000), synthetic_data, metadata.to_dict())

In [None]:
report.get_properties()

This score needs to be looked at on a per-attribute level as opposed to the overall score since a lot of columns do not make sense when 'synthesized', for example ID based columns.

In [None]:
report.get_details(property_name='Column Shapes')

In [None]:
report.save(filepath='results/quality_report_mar_04.pkl')

In [None]:
from sdmetrics.reports.single_table import QualityReport

In [None]:
# quality_report = QualityReport.load('results/quality_report_feb_22.pkl')

In [None]:
fig = report.get_visualization(property_name='Column Shapes')

# Set the plot title (note that the score shown in it is a fixed string)
fig.update_layout(title='Data Quality for Gaussian Copula: Column Shapes (Average Score = 0.74)')

# Position of the first trace named TVComplement, or None when there is none
tvcomplement_index = next(
    (position for position, trace in enumerate(fig.data) if trace.name == 'TVComplement'),
    None,
)

# Recolor the bars drawn by the TVComplement trace
if tvcomplement_index is not None:
    fig.data[tvcomplement_index].marker.color = '#00e1c9'

fig.show()

### Conditional synthetic data generation

In [None]:
import pickle
import random
from datetime import timedelta

#### Model to predict number of transactions based on day, month, disk capacity and container_group

In [None]:
class DataModel:
    """Wrapper that turns a (day, month, disk capacity, container group) query
    into a one-row feature frame and asks the wrapped regressor for a count.

    The class itself never sets ``label_encoder`` or ``model``; both must
    already exist on the instance (presumably restored by unpickling - confirm).
    """

    def predict(self, day, month, disk_capacity, container_group):
        """Return the model's prediction for one query, truncated to int.

        Args:
            day: day of the month.
            month: month number; the cyclic encoding uses ``month - 1``.
            disk_capacity: value stored in the 'disk_capacity_tb' feature.
            container_group: label passed through ``self.label_encoder``.

        Returns:
            ``int`` of the first element returned by ``self.model.predict``.
        """
        encoded_group = self.label_encoder.transform([container_group])[0]

        # Angles for the cyclic (sin/cos) encodings of day and month.
        day_angle = day * (2. * np.pi / 31)
        month_angle = (month - 1) * (2. * np.pi / 12)

        # Insertion order fixes the column order of the frame given to the model.
        feature_values = {
            'disk_capacity_tb': disk_capacity,
            'container_group_encoded': encoded_group,
            'month': month,
            'day': day,
            'day_sin': np.sin(day_angle),
            'day_cos': np.cos(day_angle),
            'month_sin': np.sin(month_angle),
            'month_cos': np.cos(month_angle),
        }
        sample_input = pd.DataFrame(
            {name: [value] for name, value in feature_values.items()}
        )

        raw_prediction = self.model.predict(sample_input)
        return int(raw_prediction[0])

In [None]:
# Load the pickled predictor used by the condition generator below.
# NOTE(review): pickle.load can run arbitrary code while deserializing, so
# only load a file that comes from a trusted source.
# Presumably the file holds a DataModel instance - confirm.
with open('SamplePredictor.pickle', 'rb') as f:
    model = pickle.load(f)

#### Condition generator

In [None]:
def generate_conditions(start_date, end_date, disk_capacity, container_groups):
    """Build the SDV conditions for every day, container group and container.

    For each requested container group a fixed set of container ids is drawn
    once from the notebook-level ``container_dict``. Then, for every day from
    ``start_date`` to ``end_date`` (both included), the notebook-level
    ``model`` predicts a row count per group, which is split evenly (integer
    division, remainder discarded) across that group's containers.

    Args:
        start_date: first day (a datetime) to generate conditions for.
        end_date: last day (a datetime), inclusive.
        disk_capacity: disk capacity passed to ``model.predict``.
        container_groups: mapping {container_group: number of distinct
            container ids to use for that group}.

    Returns:
        A list of ``Condition`` objects fixing 'datetime_column',
        'container_group' and 'container_id'.

    Raises:
        KeyError: a requested container group is not in ``container_dict``.
        ValueError: more container ids are requested for a group than
            distinct ids are known (raised by ``random.sample``).
    """
    # Imported here because no other cell brings Condition into scope.
    from sdv.sampling import Condition

    one_day = timedelta(days=1)

    # Draw the container ids once so every day uses the same containers.
    # dict.fromkeys removes repeated ids (keeping first-seen order); otherwise
    # the sample could contain the same container more than once.
    chosen_ids = {}
    for container_group, container_id_types in container_groups.items():
        candidates = list(dict.fromkeys(container_dict[container_group]))
        chosen_ids[container_group] = random.sample(candidates, container_id_types)

    conditions = []
    current_date = start_date
    while current_date <= end_date:
        for container_group, container_ids in chosen_ids.items():
            if not container_ids:
                continue

            num_rows = model.predict(
                day=current_date.day,
                month=current_date.month,
                disk_capacity=disk_capacity,
                container_group=container_group,
            )
            rows_per_container = int(num_rows // len(container_ids))
            if rows_per_container <= 0:
                # A zero or negative count asks for no rows; skip it rather
                # than hand the sampler a condition it cannot satisfy.
                continue

            for container_id in container_ids:
                conditions.append(
                    Condition(
                        num_rows=rows_per_container,
                        column_values={
                            'datetime_column': current_date,
                            'container_group': container_group,
                            'container_id': container_id,
                        },
                    )
                )
        current_date += one_day
    return conditions

In [None]:
list(train_data['container_group'].unique())

In [None]:
train_data['datetime_column'].describe()

In [None]:
# strptime lives on the datetime class, which no earlier cell imports
# (only timedelta is imported), so bring it into scope here.
from datetime import datetime

# defining variables to be passed to the synthesizer
start_date = datetime.strptime('2022-01-27', '%Y-%m-%d')
end_date = datetime.strptime('2022-01-28', '%Y-%m-%d')
disk_capacity_tb = 20  # Numerical value
# this translates to I want 2 container_ids of type X and 3 container_ids of type Y
# (each key has to be a container group present in container_dict)
container_groups = {'X': 2, 'Y': 3}

In [None]:
# The capacity variable defined for this run is disk_capacity_tb; no variable
# named disk_capacity exists at notebook level.
conditions = generate_conditions(start_date, end_date, disk_capacity_tb, container_groups)

In [None]:
conditional_synthetic_data = synthesizer.sample_from_conditions(conditions=conditions)

In [None]:
conditional_synthetic_data['disk_capacity_tb'] = disk_capacity_tb

In [None]:
conditional_synthetic_data

### Experimented with different models that predict the number of transactions

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
# mean_absolute_error is used below but is not imported by any earlier cell.
from sklearn.metrics import mean_absolute_error

# Add the calendar features on a new frame: assign() leaves train_data itself
# untouched, so no 'month'/'day' columns leak into it for later cells.
df = train_data.assign(
    month=train_data['datetime_column'].dt.month,
    day=train_data['datetime_column'].dt.day,
)

# one row per (day, month, container_group, disk capacity) with its number of transactions
agg_data = df.groupby(['day', 'month', 'container_group', 'disk_capacity_tb']).size().reset_index(name='transactions')

xgb_model = XGBRegressor(n_estimators=320, learning_rate=0.009, max_depth=10, random_state=42)

# label encoding for categorical variables
label_encoder = LabelEncoder()
agg_data['container_group_encoded'] = label_encoder.fit_transform(agg_data['container_group'])

X = agg_data[['day', 'month', 'container_group_encoded', 'disk_capacity_tb']]
y = agg_data['transactions']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

# standardize features (the scaler is fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# fit the model
xgb_model.fit(X_train_scaled, y_train)

# model evaluation
y_pred = xgb_model.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error = {mae}')

In [None]:
# mean_absolute_error is used below but is not imported by any earlier cell.
from sklearn.metrics import mean_absolute_error

# Define the hyperparameter grid to search
# (3 * 3 * 3 * 2 * 2 * 3 = 324 combinations, each fitted on 5 folds)
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'min_child_weight': [1, 3, 5]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='neg_mean_absolute_error', cv=5, verbose=1)

# Perform grid search
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test_scaled)

# Evaluate the model using Mean Absolute Error
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error on Test Set: {mae}')

In [None]:
# sample input, in the column order the scaler was fitted on:
# [day, month, container_group_encoded, disk_capacity_tb]
new_input = [6,10,4,20]
new_input_scaled = scaler.transform([new_input])

# making a prediction
# NOTE(review): this calls xgb_model, not the grid-searched best_model -
# confirm that this is the intended estimator.
prediction = xgb_model.predict(new_input_scaled)
# abs() keeps the printed row count non-negative
print(f'Predicted number of rows = {abs(int(prediction[0]))}')

In [None]:
import pickle
class DataModel:
    """Wrapper that turns a (day, month, disk capacity, container group) query
    into a one-row feature frame and asks the wrapped regressor for a count.

    The class itself never sets ``label_encoder`` or ``model``; both must
    already exist on the instance (presumably restored by unpickling - confirm).
    """

    def predict(self, day, month, disk_capacity, container_group):
        """Return the model's prediction for one query, truncated to int.

        Args:
            day: day of the month.
            month: month number; the cyclic encoding uses ``month - 1``.
            disk_capacity: value stored in the 'disk_capacity_tb' feature.
            container_group: label passed through ``self.label_encoder``.

        Returns:
            ``int`` of the first element returned by ``self.model.predict``.
        """
        encoded_group = self.label_encoder.transform([container_group])[0]

        # Angles for the cyclic (sin/cos) encodings of day and month.
        day_angle = day * (2. * np.pi / 31)
        month_angle = (month - 1) * (2. * np.pi / 12)

        # Insertion order fixes the column order of the frame given to the model.
        feature_values = {
            'disk_capacity_tb': disk_capacity,
            'container_group_encoded': encoded_group,
            'month': month,
            'day': day,
            'day_sin': np.sin(day_angle),
            'day_cos': np.cos(day_angle),
            'month_sin': np.sin(month_angle),
            'month_cos': np.cos(month_angle),
        }
        sample_input = pd.DataFrame(
            {name: [value] for name, value in feature_values.items()}
        )

        raw_prediction = self.model.predict(sample_input)
        return int(raw_prediction[0])
# Read the pickled predictor; the file is only needed while unpickling.
with open('SamplePredictor.pickle', 'rb') as f:
    data_model = pickle.load(f)

# Smoke test: print the prediction for one day and one container group.
print(data_model.predict(day=1, month=1, disk_capacity=20, container_group='02892102A8F17B5A551466B444222F4C3D9A399F'))

### Validation of data quality

In [None]:
import hashlib

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ks_2samp
from tqdm import tqdm

In [None]:
class DataQValidation:
    """Data-quality checks comparing synthetic rows with the original rows.

    Every check reports its verdict with ``print``; a failed check neither
    raises nor returns a value.
    """

    # Largest tolerated absolute gap, per container group, between the row
    # count given by the predictor and the number of synthetic rows.
    COUNT_THRESHOLD = 3

    # Capacity used by check_capacity when no disk capacity condition is set.
    DEFAULT_DISK_CAP_TB = 16

    def __init__(self, original, synthetic, start_date, end_date, disk_capacity_tb, container_groups):
        """Store the conditions and date-sorted copies of both tables.

        Args:
            original: DataFrame with the real data.
            synthetic: DataFrame with the generated data.
            start_date: first date the synthetic data was conditioned on.
            end_date: last date the synthetic data was conditioned on.
            disk_capacity_tb: conditioned disk capacity (falsy to skip the
                disk capacity value check).
            container_groups: mapping {container_group: number of containers}
                used as condition (falsy to skip the group checks).
        """
        self.start_date = start_date
        self.end_date = end_date
        self.disk_cap = disk_capacity_tb
        self.container_groups = container_groups

        self.original_df = self._with_sorted_dates(original)
        self.synthetic_df = self._with_sorted_dates(synthetic)

    @staticmethod
    def _with_sorted_dates(frame):
        """Return a copy of frame with 'datetime_column' parsed and sorted on."""
        # assign() builds a new frame, so the caller's DataFrame is not modified.
        parsed = pd.to_datetime(frame['datetime_column'], errors='coerce')
        return frame.assign(datetime_column=parsed).sort_values(by='datetime_column')

    def unique_data(self):
        """Print how many synthetic rows duplicate an original row or each other.

        Rows are compared through the SHA-256 digest of their string form.
        """

        def row_hash(row):
            return hashlib.sha256(row.to_string().encode('utf-8')).hexdigest()

        # Each table is hashed exactly once, with a progress bar.
        tqdm.pandas(desc="Hashing original rows")
        original_hashes = self.original_df.progress_apply(row_hash, axis=1)

        tqdm.pandas(desc="Hashing synthetic rows")
        synthetic_hashes = self.synthetic_df.progress_apply(row_hash, axis=1)

        original_set = set(original_hashes)
        synthetic_set = set(synthetic_hashes)

        exact_matches = original_set.intersection(synthetic_set)

        print("Number of exact matches with original data:", len(exact_matches))

        print("Number of exact matches within synthetic data:", len(synthetic_hashes) - len(synthetic_set))

    def check_capacity(self):
        """Check chunk sizes are non-negative and never overfill a container.

        Walks the synthetic rows in date order, adding the chunk size on
        'WRITE' and subtracting it on 'DELETE_PERFORMED', and compares the
        running total per container with the capacity limit.
        """
        cap_cumulative_test = True
        positive_chunk = True

        cumulative_sizes = {container_id: 0 for container_id in self.synthetic_df['container_id'].unique()}

        # Use the conditioned capacity; fall back to the default when none is set.
        # NOTE(review): the 1e9 factor assumes chunk_size is expressed in units
        # of one billionth of a TB - confirm the unit of chunk_size.
        disk_cap_tb = self.disk_cap if self.disk_cap else self.DEFAULT_DISK_CAP_TB
        fixed_disk_cap_gb = disk_cap_tb * 1e9

        rows = zip(
            self.synthetic_df['container_id'].tolist(),
            self.synthetic_df['operation'].tolist(),
            self.synthetic_df['chunk_size'].tolist(),
        )
        for container_id, operation, chunk_size in rows:
            if chunk_size < 0:
                positive_chunk = False

            if operation == 'WRITE':
                cumulative_sizes[container_id] += chunk_size
            elif operation == 'DELETE_PERFORMED':
                cumulative_sizes[container_id] -= chunk_size

            # Assuming we get 100% of the theoretical space
            if cumulative_sizes[container_id] > fixed_disk_cap_gb:
                cap_cumulative_test = False

        if cap_cumulative_test:
            print("Cumulative Data Test Passed : Size did not exceed disk capacity for any container at any given time")
        else:
            print("Cumulative Data Test Failed : Size did exceeded disk capacity for some container at a given time")

        if positive_chunk:
            print("Positive chunk size test passed")
        else:
            print("Positive chunk size test failed")

        print(cumulative_sizes)
        print(fixed_disk_cap_gb)

    def null_check(self):
        """Print whether the synthetic data contains any null value."""
        if self.synthetic_df.isnull().values.any():
            print("Null Test Failed : Null values found in the generated data")
        else:
            print("Null Test Passed")

    def range_check(self):
        """Run the value-range checks of the synthetic data, in a fixed order.

        Covers operations, disk capacity, container groups, row and container
        counts per group, per-container id consistency and the date range.
        """
        self._check_operations()

        print('\n')
        if self.disk_cap:
            self._check_disk_capacity_value()

        print('\n')
        if self.container_groups:
            self._check_container_groups()

        # Number of synthetic rows per (container_group, container_id) pair.
        per_container = (
            self.synthetic_df
            .groupby(['container_group', 'container_id'])['chunk_id']
            .count()
            .reset_index()
        )

        self._check_counts(per_container)
        self._check_container_placement(list(per_container['container_id']))
        self._check_date_range()

    def _check_operations(self):
        """Check that every synthetic operation also occurs in the original data."""
        operations = set(self.synthetic_df['operation'])
        ops_master = set(self.original_df['operation'])
        if not operations.issubset(ops_master):
            print('Operation Range Test Failed : Unkown Operation found')
        else:
            print('Operation Range Test Passed')

    def _check_disk_capacity_value(self):
        """Check that the synthetic data holds only the conditioned disk capacity."""
        disk_caps = set(self.synthetic_df['disk_capacity_tb'])
        if len(disk_caps) != 1:
            print("Disk Capacity Range Test Failed : 0 or more than 1 unique disk capacities found in synthetic data")
        elif next(iter(disk_caps)) == self.disk_cap:
            print("Disk Capacity Range Test Passed : Synthetic data generated for the conditioned disk capacity")
        else:
            print("Disk Capacity Range Test Failed : Synthetic data generated has different disk capacity than the condition")

    def _check_container_groups(self):
        """Check that the synthetic container groups are exactly the requested ones."""
        requested = set(self.container_groups.keys())
        container_group_master = set(self.original_df['container_group'].unique())

        if not requested.issubset(container_group_master):
            print("Container Group Range Test cannot be perfomed, synthetic data generated on different container groups than original data")
            return

        synthetic_cg_master = set(self.synthetic_df['container_group'].unique())
        if not requested.issubset(synthetic_cg_master):
            print("Container Group Range Test Failed : synthetic data is missing some conatiner groups")
        elif not synthetic_cg_master.issubset(requested):
            print("Container Group Range Test Failed : synthetic data has extra conatiner groups not passed in condition")
        else:
            print("Container Group Range Test Passed : synthetic data and conditions have exact container groups")

    def _check_counts(self, per_container):
        """Compare row and container counts per group with the conditions."""
        transaction_counts = per_container.groupby('container_group')['chunk_id'].sum().to_dict()
        container_counts = per_container.groupby('container_group')['container_id'].count().to_dict()

        predictor = self.load_predictor_model()
        delta = timedelta(days=1)

        transactions_test = True
        container_count_test = True
        for cg, ccount in (self.container_groups or {}).items():
            # Sum the predicted rows over this instance's own date range.
            current_date = self.start_date
            predicted_count = 0
            while current_date <= self.end_date:
                predicted_count += predictor.predict(day=current_date.day, month=current_date.month, disk_capacity=self.disk_cap, container_group=cg)
                current_date += delta

            # A group with no synthetic rows counts as zero instead of raising.
            if abs(predicted_count - transaction_counts.get(cg, 0)) > self.COUNT_THRESHOLD:
                transactions_test = False

            if ccount != container_counts.get(cg, 0):
                container_count_test = False

        print('\n')
        if transactions_test:
            print('Transaction Number Test Passed : Number of samples generated is in lieu with models prediction')
        else:
            print('Transaction Number Test Failed : Number of samples generated is not in lieu with models prediction')

        print('\n')
        if container_count_test:
            print('Container Count Test Passed : Number of unique containers found in synthetic data is equal to condition')
        else:
            print('Container Count Test Failed : Number of unique containers found in synthetic data is not equal to condition')

    def _check_container_placement(self, containers):
        """Check each synthetic container only uses ids seen for it in the original data."""
        labels = {
            'location_id': 'Location',
            'server_id': 'Server',
            'config_id': 'Config',
            'disk_id': 'Disk',
        }
        consistent = dict.fromkeys(labels, True)

        for container in containers:
            real_rows = self.original_df[self.original_df['container_id'] == container]
            fake_rows = self.synthetic_df[self.synthetic_df['container_id'] == container]
            for column in labels:
                real_values = set(real_rows[column].unique())
                fake_values = set(fake_rows[column].unique())
                if not fake_values.issubset(real_values):
                    consistent[column] = False

        for column, label in labels.items():
            print('\n')
            kind = label.lower()
            if consistent[column]:
                print(f'{label} Uniqueness Test Passed : Container in synthetic data and original data belong to same {kind} id')
            else:
                print(f'{label} Uniqueness Test Failed : Container in synthetic data and original data do not belong to same {kind} id')

    def _check_date_range(self):
        """Check that every synthetic date lies between start_date and end_date."""
        dates = self.synthetic_df['datetime_column']
        outside_range = self.synthetic_df[(dates < self.start_date) | (dates > self.end_date)]

        print('\n')
        if outside_range.empty:
            print('Date Range Test Passed : All dates are within the start and end date')
        else:
            print('Date Range Test Failed : Some dates are outside start and end date')

    def load_predictor_model(self):
        """Load and return the pickled predictor from 'SamplePredictor.pickle'.

        pickle looks the stored class up by its module-qualified name, so the
        class the object was saved with has to be defined at module level; a
        class declared inside this method would not be found.

        NOTE(review): pickle.load can run arbitrary code while deserializing;
        only load a file that comes from a trusted source.
        """
        with open('SamplePredictor.pickle', 'rb') as f:
            data_model = pickle.load(f)
        return data_model

    def stat_dist(self, col, isCat=False):
        """Show the distribution of col in both tables.

        Args:
            col: column to describe.
            isCat: when true, print normalized value counts; otherwise plot
                overlaid histograms with a summary-statistics text box.
        """
        if isCat:
            print(f"Original {col} distribution:\n", self.original_df[col].value_counts(normalize=True))
            print(f"Synthetic {col} distribution:\n", self.synthetic_df[col].value_counts(normalize=True))
            return

        fig, ax = plt.subplots(figsize=(10, 6))

        # Plotting histograms
        sns.histplot(self.original_df[col], color="skyblue", label='Original', kde=True, ax=ax)
        sns.histplot(self.synthetic_df[col], color="red", label='Synthetic', kde=True, ax=ax, alpha=0.6)

        plt.legend()
        plt.title(f'Distribution of {col}')

        # Summary statistics
        stats_df = pd.DataFrame({
            'Original': self.original_df[col].describe(),
            'Synthetic': self.synthetic_df[col].describe(),
        })
        stats_text = stats_df.to_string()

        # Adding text box for summary statistics
        plt.text(1.05, 0.95, stats_text, transform=ax.transAxes, fontsize=10, verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.5))

        plt.show()

    def unique_data_by_col(self, col):
        """Print the number of distinct values of col in each table."""
        print(f"Unique values in original data {col}:", self.original_df[col].nunique())
        print(f"Unique values in synthetic data {col}:", self.synthetic_df[col].nunique())

    @staticmethod
    def parse_datetime(df):
        """Parse 'datetime_column' of df in place ('%Y-%m-%d %H:%M:%S') and return df."""
        df['datetime_column'] = pd.to_datetime(df['datetime_column'], format='%Y-%m-%d %H:%M:%S')
        return df

    def compare_numerical_distributions(self, col):
        """
        Compare the distributions of a numerical column using histograms and the Kolmogorov-Smirnov test.
        """
        plt.figure(figsize=(12, 6))
        sns.histplot(self.original_df[col], color="skyblue", label="Original", kde=True, stat="density", bins=30)
        sns.histplot(self.synthetic_df[col], color="red", label="Synthetic", kde=True, stat="density", bins=30)
        plt.legend()
        plt.title(f'Distribution Comparison for {col}')
        plt.show()

        stat, p = ks_2samp(self.original_df[col], self.synthetic_df[col])
        print(f"Kolmogorov-Smirnov test for {col}: Statistic={stat:.4f}, P-value={p:.4g}")

    def compare_categorical_distributions(self, col):
        """
        Compare the distributions of a categorical column using count plots on the same graph
        for both original and synthetic datasets.
        :param col: The column name for the categorical data.
        """
        original_df_copy = self.original_df.copy()
        synthetic_df_copy = self.synthetic_df.copy()
        original_df_copy['Dataset'] = 'Original'
        synthetic_df_copy['Dataset'] = 'Synthetic'

        combined_df = pd.concat([original_df_copy, synthetic_df_copy], ignore_index=True)

        # Plotting
        plt.figure(figsize=(10, 6))
        sns.countplot(x=col, hue='Dataset', data=combined_df, palette='viridis')
        plt.title(f'Comparison of {col} Distribution Between Original and Synthetic Data')
        plt.xticks(rotation=45)  # Rotate the x-axis labels for better readability
        plt.legend(title='Dataset')
        plt.tight_layout()
        plt.show()

    def temporal_distribution_check(self):
        """
        Validate the consistency of event distribution over time
        by plotting the number of rows per month for both datasets.
        """
        original_timeseries = self.original_df.set_index('datetime_column').resample('M').size()
        synthetic_timeseries = self.synthetic_df.set_index('datetime_column').resample('M').size()

        plt.figure(figsize=(12, 6))
        original_timeseries.plot(label='Original', color='blue')
        synthetic_timeseries.plot(label='Synthetic', color='red')
        plt.legend()
        plt.title('Temporal Distribution Comparison')
        plt.xlabel('Month')
        plt.ylabel('Count')
        plt.show()

    def plot_cdf(self, col):
        """
        Plots the CDF for a numerical column for both original and synthetic datasets.
        :param col: The column name to analyze.
        """
        x_original = np.sort(self.original_df[col])
        y_original = np.arange(1, len(x_original)+1) / len(x_original)
        x_synthetic = np.sort(self.synthetic_df[col])
        y_synthetic = np.arange(1, len(x_synthetic)+1) / len(x_synthetic)

        plt.figure(figsize=(10, 6))
        plt.plot(x_original, y_original, marker='.', linestyle='none', label='Original')
        plt.plot(x_synthetic, y_synthetic, marker='.', linestyle='none', label='Synthetic')
        plt.legend()
        plt.title(f'Cumulative Distribution Function (CDF) of {col}')
        plt.xlabel(col)
        plt.ylabel('CDF')
        plt.show()

    def run_test_suite(self):
        """Run the checks one after another, printing a separator between them."""
        steps = (
            self.null_check,
            lambda: self.stat_dist(col='chunk_size'),
            self.unique_data,
            self.range_check,
            self.check_capacity,
            lambda: self.compare_numerical_distributions(col='chunk_size'),
            lambda: self.plot_cdf(col='chunk_size'),
        )
        for position, step in enumerate(steps):
            if position:
                print('\n')
                print('-'*50)
            step()

# Validate the conditionally generated data against the training data,
# using the same conditions that were given to the sampler.
validation = DataQValidation(
    original=train_data,
    synthetic=conditional_synthetic_data,
    start_date=start_date,
    end_date=end_date,
    disk_capacity_tb=disk_capacity_tb,
    container_groups=container_groups,
)
validation.run_test_suite()