# Installing Packages & Environment Setup

In [None]:
!pip install pandas-gbq --quiet
!pip install google-cloud-bigquery pandas
!pip install --quiet google-cloud-bigquery
# Colab auth helper; the call runs here, before any BigQuery client is created
# further down in the notebook.
from google.colab import auth
auth.authenticate_user()
import pandas as pd
# NOTE(review): `gbq` is not referenced in the visible part of this notebook —
# confirm whether it is still needed.
from pandas.io import gbq
import matplotlib.pyplot as plt
import seaborn as sns
# Plotly Express draws the EDA line charts (`px.line`).
import plotly.express as px
# BigQuery client library used to run every SQL query in this notebook.
from google.cloud import bigquery

# Executive Summary

In [None]:
def executive_summary():
    """Print the project's executive summary to standard output.

    The text covers the project goal, the modeling approach, and the
    headline metrics. An empty entry in the table below prints as a blank
    separator line.
    """
    summary_lines = (
        "Executive Summary",
        "------------------",
        "This project explores the Bitcoin Cash blockchain by analyzing block-level data using BigQuery.",
        "Our primary objective was to build a predictive model that determines whether a block exceeds",
        "500KB in size based on metadata such as block version, number, and time since the previous block.",
        "",
        "After cleaning and preparing 5,000 rows of blockchain data, we trained a Decision Tree Classifier.",
        "The model achieved strong performance with an overall test accuracy of 91%.",
        "",
        "Key results:",
        "- Precision (for large blocks): 0.79",
        "- Recall (for large blocks): 0.80",
        "- F1-score (for large blocks): 0.79",
        "",
        "The model demonstrates solid predictive power and highlights potential for deeper analysis",
        "on blockchain behavior using machine learning techniques. Future improvements could include",
        "tuning model hyperparameters, testing ensemble models, or analyzing trends across time windows.",
    )
    for text in summary_lines:
        print(text)


executive_summary()


# Project Connection

In [None]:
# Google Cloud project ID handed to the BigQuery client.
project_id = 'proven-wavelet-457219-u4'

# Single client instance; the query cells below all go through it.
client = bigquery.Client(project=project_id)

# Dataset Description & Preview

In [None]:
def describe_dataset():
    """Print a description of the source table, the variables, and the target label.

    An empty entry in the table below prints as a blank separator line.
    """
    description_lines = (
        "Dataset Description",
        "--------------------",
        "The dataset used in this project is sourced from BigQuery’s public dataset:",
        "`bigquery-public-data.crypto_bitcoin_cash.blocks`.",
        "",
        "Each row in the dataset represents a block on the Bitcoin Cash blockchain.",
        "The following variables were selected for modeling:",
        "- size: The size of the block in bytes (used to create the target variable).",
        "- version: The block version number.",
        "- number: The block height (its position in the chain).",
        "- nonce: A value miners vary to find a valid hash (excluded from final model).",
        "- time_since_last_block: Time difference (in seconds) between this and the previous block.",
        "",
        "The target variable `label` was engineered as a binary indicator:",
        "- 1 if the block size > 500,000 bytes",
        "- 0 otherwise",
        "",
        "After filtering and cleaning, the final dataset contains 5,000 rows and 4 predictor features.",
        "This dataset was suitable for a binary classification task.",
    )
    for text in description_lines:
        print(text)


describe_dataset()


In [None]:
# Preview query: ten rows, all columns, from the public Bitcoin Cash blocks table.
query = """
SELECT *
FROM `bigquery-public-data.crypto_bitcoin_cash.blocks`
LIMIT 10
"""

# Submit the job, wait for it to finish, then load the rows into a DataFrame.
preview_job = client.query(query)
result = preview_job.result().to_dataframe()

# Last expression of the cell, so the notebook renders the preview table.
result.head()

## EDA Results and Visuals

# Query 1: Block sizes over time

In [None]:
# Daily mean block size: one output row per calendar date, oldest first.
query1 = """
SELECT
  DATE(timestamp) AS date,
  AVG(size) AS avg_block_size
FROM `bigquery-public-data.crypto_bitcoin_cash.blocks`
GROUP BY date
ORDER BY date
"""

# Wait for the job to complete before converting its rows to a DataFrame.
daily_size_rows = client.query(query1).result()
df1 = daily_size_rows.to_dataframe()

In [None]:
# Line chart of the daily average block size from df1.
avg_size_fig = px.line(
    df1,
    x='date',
    y='avg_block_size',
    title='Average Block Size Over Time',
)
avg_size_fig.show()

# Query 2: Number of blocks per day

In [None]:
# Daily block count: one output row per calendar date, oldest first.
query2 = """
SELECT
  DATE(timestamp) AS date,
  COUNT(*) AS block_count
FROM `bigquery-public-data.crypto_bitcoin_cash.blocks`
GROUP BY date
ORDER BY date
"""

# Wait for the job to complete before converting its rows to a DataFrame.
daily_count_rows = client.query(query2).result()
df2 = daily_count_rows.to_dataframe()

In [None]:
# Line chart of how many blocks were recorded on each date in df2.
block_count_fig = px.line(
    df2,
    x='date',
    y='block_count',
    title='Number of Blocks Per Day',
)
block_count_fig.show()

# Query 3: Average time between blocks

In [None]:
# Daily average gap between consecutive blocks, expressed in minutes.
#
# The gap is measured in SECONDS and divided by 60 rather than measured with
# the MINUTE unit: TIMESTAMP_DIFF returns an integer count of the requested
# unit, so a MINUTE diff would discard the sub-minute part of every gap before
# averaging and bias the daily mean downward. Integer / integer division in
# BigQuery yields FLOAT64, so fractional minutes are preserved.
#
# The first block (by timestamp) has no predecessor, so LAG yields NULL for
# it; that row is removed by the WHERE clause.
query3 = """
WITH block_times AS (
  SELECT
    timestamp,
    TIMESTAMP_DIFF(timestamp, LAG(timestamp) OVER (ORDER BY timestamp), SECOND) / 60 AS time_diff_min
  FROM
    `bigquery-public-data.crypto_bitcoin_cash.blocks`
)

SELECT
  DATE(timestamp) AS date,
  AVG(time_diff_min) AS avg_time_diff_min
FROM block_times
WHERE time_diff_min IS NOT NULL
GROUP BY date
ORDER BY date
"""
# Wait for the job to complete, then load the per-day averages into a DataFrame.
df3 = client.query(query3).result().to_dataframe()

In [None]:
# Line chart of the daily average inter-block time from df3.
block_gap_fig = px.line(
    df3,
    x='date',
    y='avg_time_diff_min',
    title='Average Time Between Blocks (Minutes)',
)
block_gap_fig.show()

# Predictive Modeling

Query and Prepare the Data

In [None]:
# Blocks larger than this many bytes are labelled 1 ("large"), all others 0.
LARGE_BLOCK_BYTES = 500000

# Pull the block metadata used for modeling plus the gap, in seconds, to the
# previous block (previous = preceding row when ordered by timestamp).
#
# `weight` is deliberately not selected: it is used neither as a feature nor
# for the label, so fetching it would only add a column to scan and then drop.
#
# NOTE(review): LIMIT is applied without an ORDER BY, so which 5,000 rows come
# back is not guaranteed to be the same from run to run; metrics computed on
# this sample can therefore vary between executions.
query = """
WITH block_data AS (
  SELECT
    size,
    version,
    number,
    nonce,
    TIMESTAMP_DIFF(timestamp, LAG(timestamp) OVER (ORDER BY timestamp), SECOND) AS time_since_last_block
  FROM `bigquery-public-data.crypto_bitcoin_cash.blocks`
)
SELECT *
FROM block_data
WHERE time_since_last_block IS NOT NULL
  AND size IS NOT NULL

LIMIT 5000
"""
df = client.query(query).result().to_dataframe()

# Remove any row that still has a missing value in one of the selected columns.
df = df.dropna()

# Binary target: 1 when the block is larger than the threshold, else 0.
df['label'] = (df['size'] > LARGE_BLOCK_BYTES).astype(int)

# Sanity checks: final row/column count and the class balance.
print(df.shape)
print(df['label'].value_counts())

Train/Test Split and Feature Prep

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Predictor columns fed to the model, and the binary target column.
feature_columns = ['version', 'number', 'time_since_last_block']
X = df[feature_columns]
y = df['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same fitted
# scaler to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Train & Evaluate a Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Fit a decision tree on the scaled training features (seeded for repeatability).
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train_scaled, y_train)

# Predicted labels for the held-out test rows.
y_pred = tree_model.predict(X_test_scaled)

# Text evaluation: raw confusion matrix first, then the per-class report.
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)
test_report = classification_report(y_test, y_pred)
print(test_report)

# Graphical confusion matrix; kept as the cell's last expression so the
# notebook renders it.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)


In [None]:
def evaluate_model_performance():
    """Print a fixed-text summary of the decision tree's test-set results.

    The figures are literals written into this function; nothing is read
    from the fitted model or the predictions. An empty entry in the table
    below prints as a blank separator line.
    """
    evaluation_lines = (
        "Model Evaluation Summary",
        "-------------------------",
        "We trained a Decision Tree Classifier to predict whether a block size exceeds 500KB.",
        "The model achieved an accuracy of approximately 91% on the test set.",
        "",
        "Key metrics:",
        "- Precision (Class 1): 0.79 — Of all blocks predicted as large, 79% were actually large.",
        "- Recall (Class 1): 0.80 — The model correctly identified 80% of actual large blocks.",
        "- F1-score (Class 1): 0.79 — Balanced performance on precision and recall for large blocks.",
        "",
        "The confusion matrix shows:",
        "- True Negatives: 733",
        "- False Positives: 47",
        "- False Negatives: 45",
        "- True Positives: 175",
        "",
        "The model performs well overall, with stronger accuracy on the majority class.",
        "It can be improved by trying techniques like hyperparameter tuning or using ensemble models.",
    )
    for text in evaluation_lines:
        print(text)


evaluate_model_performance()


# Managerial Insights and Takeaways

In [None]:
def managerial_takeaways():
    """Print the five managerial takeaways drawn from the analysis.

    Each numbered point is a bold-marked heading line followed by two
    indented detail lines. An empty entry in the table below prints as a
    blank separator line.
    """
    takeaway_lines = (
        "Managerial Insights and Takeaways",
        "----------------------------------",
        "1. **Blockchain block size is predictable using metadata:**",
        "   Variables like block version, position (number), and time between blocks",
        "   offer meaningful signals that can help anticipate whether a block will be large.",
        "",
        "2. **Machine learning can effectively support blockchain analysis:**",
        "   The decision tree model achieved 91% accuracy, suggesting that predictive models",
        "   can be used for monitoring, optimization, or anomaly detection in blockchain operations.",
        "",
        "3. **Operational planning opportunities for network scalability:**",
        "   Knowing which blocks are likely to be large may help miners or network operators",
        "   better manage bandwidth, node performance, and transaction prioritization.",
        "",
        "4. **Model interpretability supports decision-making:**",
        "   The decision tree model is transparent, making it easier to explain to stakeholders",
        "   and adapt into rule-based systems or dashboards.",
        "",
        "5. **Data from public sources like BigQuery can power real insights:**",
        "   This project demonstrates how publicly available blockchain data can be",
        "   leveraged for business intelligence and innovation.",
    )
    for text in takeaway_lines:
        print(text)


managerial_takeaways()
