1. Data Collection: Gather historical cryptocurrency price, volume, and liquidity-related data

In [None]:
import pandas as pd

# Read the two daily CoinGecko snapshots that the rest of the notebook compares.
df_16 = pd.read_csv("coin_gecko_2022-03-16.csv")
df_17 = pd.read_csv("coin_gecko_2022-03-17.csv")

# Show the schema once, then summary statistics for each day.
print("Columns in dataset:", df_16.columns)
for day_label, snapshot in (("16th", df_16), ("17th", df_17)):
    print(f"\nSummary of {day_label} March dataset:")
    print(snapshot.describe())

# Join the two days on 'symbol'; columns present in both frames receive a
# _16 / _17 suffix so each day's value stays addressable.
merged_df = pd.merge(df_16, df_17, on="symbol", suffixes=("_16", "_17"))

# Day-over-day absolute differences (17th minus 16th).
for change_col, base_col in (
    ("price_change", "price"),
    ("volume_change", "24h_volume"),
    ("mkt_cap_change", "mkt_cap"),
):
    merged_df[change_col] = merged_df[f"{base_col}_17"] - merged_df[f"{base_col}_16"]

# Five largest and five smallest absolute price moves.
ranking_cols = ["coin_17", "price_change"]
top_gainers = merged_df.nlargest(5, "price_change")[ranking_cols]
top_losers = merged_df.nsmallest(5, "price_change")[ranking_cols]

for heading, table in (("\nTop 5 Gainers:", top_gainers), ("\nTop 5 Losers:", top_losers)):
    print(heading)
    print(table)

# Persist the comparison table for later inspection.
merged_df.to_csv("crypto_comparison.csv", index=False)
print("\nProcessed dataset saved as crypto_comparison.csv")

# Visualization (optional): histogram of the absolute price changes.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(merged_df["price_change"], bins=30, edgecolor="black")
ax.set_title("Price Change Distribution")
ax.set_xlabel("Price Change")
ax.set_ylabel("Frequency")
plt.show()

2. Data Preprocessing: Handle missing values, clean data, and normalize numerical features.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Load the raw daily snapshots.
df_16 = pd.read_csv("coin_gecko_2022-03-16.csv")
df_17 = pd.read_csv("coin_gecko_2022-03-17.csv")

# Identify missing values
print("Missing values:\n", df_16.isnull().sum())

# Handle missing values - fill each numeric column with that day's median.
num_cols = ["price", "1h", "24h", "7d", "24h_volume", "mkt_cap"]
df_16[num_cols] = df_16[num_cols].fillna(df_16[num_cols].median())
df_17[num_cols] = df_17[num_cols].fillna(df_17[num_cols].median())

# Drop duplicates (if any)
df_16 = df_16.drop_duplicates()
df_17 = df_17.drop_duplicates()

# Normalize numerical features using Min-Max Scaling.
# The scaled values go into separate frames: df_16 / df_17 stay in their
# original units because the later sections label these columns as USD,
# divide the percentage columns by 100 and build volume / market-cap ratios,
# none of which is meaningful on [0, 1]-scaled data (the scaled minimum is
# exactly 0, which would also zero out ratio denominators).
# The scaler is fitted on the 16th and reused for the 17th so that both days
# share one scale.
scaler = MinMaxScaler()
df_16_scaled = df_16.copy()
df_17_scaled = df_17.copy()
df_16_scaled[num_cols] = scaler.fit_transform(df_16[num_cols])
df_17_scaled[num_cols] = scaler.transform(df_17[num_cols])

# Save cleaned + normalized datasets (file contents are the scaled frames).
df_16_scaled.to_csv("cleaned_crypto_16.csv", index=False)
df_17_scaled.to_csv("cleaned_crypto_17.csv", index=False)

print("Data cleaning and normalization complete! Processed files saved.")

3. Exploratory Data Analysis (EDA): Analyze data patterns, trends, and correlations

In [None]:
import seaborn as sns

# Inspect datasets. DataFrame.info() writes its report itself and returns
# None, so it is called as a statement; wrapping it in print() emitted the
# report before the heading and then printed the literal "None".
print("\nDataset Structure:")
print("March 16 Data:")
df_16.info()
print("\nMarch 17 Data:")
df_17.info()

# Handle missing values - fill numerical columns with each day's median
num_cols = ["price", "1h", "24h", "7d", "24h_volume", "mkt_cap"]
df_16[num_cols] = df_16[num_cols].fillna(df_16[num_cols].median())
df_17[num_cols] = df_17[num_cols].fillna(df_17[num_cols].median())

# Merge both datasets on 'symbol' for comparative analysis; shared columns
# are suffixed _16 / _17.
merged_df = df_16.merge(df_17, on="symbol", suffixes=("_16", "_17"))

# Compute changes in price, volume, and market cap (17th minus 16th)
merged_df["price_change"] = merged_df["price_17"] - merged_df["price_16"]
merged_df["volume_change"] = merged_df["24h_volume_17"] - merged_df["24h_volume_16"]
merged_df["mkt_cap_change"] = merged_df["mkt_cap_17"] - merged_df["mkt_cap_16"]

# Correlation matrix across the two days' price, volume and market cap
corr_matrix = merged_df[["price_16", "price_17", "24h_volume_16", "24h_volume_17", "mkt_cap_16", "mkt_cap_17"]].corr()
print("\nCorrelation Matrix:\n", corr_matrix)

# Visualization - Price Change Distribution
plt.figure(figsize=(10,6))
sns.histplot(merged_df["price_change"], bins=30, edgecolor="black")
plt.title("Price Change Distribution")
plt.xlabel("Price Change")
plt.ylabel("Frequency")
plt.show()

# Visualization - Market Cap vs. Volume Change
# NOTE(review): hue="symbol" creates one legend entry per coin; if the
# snapshots hold many coins, consider legend=False to keep the axes readable.
plt.figure(figsize=(10,6))
sns.scatterplot(data=merged_df, x="mkt_cap_change", y="volume_change", hue="symbol")
plt.title("Market Cap vs Volume Change")
plt.xlabel("Market Cap Change (USD)")
plt.ylabel("Trading Volume Change (USD)")
plt.show()

# Save processed dataset
merged_df.to_csv("crypto_comparison.csv", index=False)
print("\nProcessed dataset saved as crypto_comparison.csv")

4. Feature Engineering: Create relevant liquidity-related features such as moving averages, volatility, and liquidity ratios.

In [None]:
# Merge the two daily snapshots on 'coin'; every other shared column is
# suffixed _16 / _17 so both days sit side by side on one row per coin.
df = df_16.merge(df_17, on="coin", suffixes=("_16", "_17"))

# Convert percentages to decimals
# (assumes the 1h / 24h / 7d columns hold percentage points - TODO confirm).
for col in ["1h_16", "24h_16", "7d_16", "1h_17", "24h_17", "7d_17"]:
    df[col] = df[col] / 100.0

# A zero denominator would write +/-inf into the feature file; mapping zeros
# to NaN marks those ratios as undefined instead.
NAN = float("nan")
mkt_cap_16 = df["mkt_cap_16"].replace(0, NAN)
mkt_cap_17 = df["mkt_cap_17"].replace(0, NAN)

# --- Liquidity Ratios ---
df["turnover_ratio_16"] = df["24h_volume_16"] / mkt_cap_16
df["turnover_ratio_17"] = df["24h_volume_17"] / mkt_cap_17

df["liquidity_score_16"] = (df["24h_volume_16"] * df["price_16"]) / mkt_cap_16
df["liquidity_score_17"] = (df["24h_volume_17"] * df["price_17"]) / mkt_cap_17

# --- Moving Averages ---
# Only two daily observations are merged here, so these are two-day means;
# the "_7d" column names are kept unchanged for compatibility.
df["price_ma_7d"] = (df["price_16"] + df["price_17"]) / 2
df["volume_ma_7d"] = (df["24h_volume_16"] + df["24h_volume_17"]) / 2

# --- Volatility Measures ---
# Absolute day-over-day change of each return column. Series.abs() is used so
# the cell does not depend on numpy, which no earlier cell imports.
df["volatility_1h"] = (df["1h_17"] - df["1h_16"]).abs()
df["volatility_24h"] = (df["24h_17"] - df["24h_16"]).abs()
df["volatility_7d"] = (df["7d_17"] - df["7d_16"]).abs()

# --- Price Momentum ---
df["momentum_24h"] = df["price_17"] - df["price_16"]
df["momentum_7d"] = df["price_17"] - df["price_ma_7d"]

# --- Relative Market Strength (RMS) ---
# Distance of the latest price from its mean, per unit of 7d volatility;
# undefined (NaN) when that volatility is exactly zero.
df["RMS"] = (df["price_17"] - df["price_ma_7d"]) / df["volatility_7d"].replace(0, NAN)

# Save the engineered dataset
# NOTE(review): later sections read this file expecting long-format columns
# such as 'date', 'price' and 'liquidity_ratio'; this output is one row per
# coin with _16 / _17 suffixed columns - reconcile the two schemas.
df.to_csv("crypto_liquidity_features.csv", index=False)

print("Feature Engineering Complete! Results saved in 'crypto_liquidity_features.csv'")

5. Model Selection: Choose appropriate machine learning models such as time-series forecasting, regression, or deep learning approaches

In [None]:
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Load datasets
df_16 = pd.read_csv("coin_gecko_2022-03-16.csv")
df_17 = pd.read_csv("coin_gecko_2022-03-17.csv")

# Combine datasets for time-series analysis: stack both days, index by date
# and order chronologically.
df = pd.concat([df_16, df_17])
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date').sort_index()

# Select top cryptocurrency for forecasting (Bitcoin example).
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `df` (which triggers SettingWithCopyWarning).
btc_df = df.loc[df['coin'] == 'Bitcoin', ['price', '24h_volume', 'mkt_cap']].copy()

# Create moving averages
btc_df['ma_7d'] = btc_df['price'].rolling(window=7).mean()
btc_df['ma_30d'] = btc_df['price'].rolling(window=30).mean()

# Calculate volatility
btc_df['volatility_7d'] = btc_df['price'].rolling(window=7).std()

# Liquidity ratio
btc_df['liquidity_ratio'] = btc_df['24h_volume'] / btc_df['mkt_cap']

# Remove NaN values (the first 29 rows have no 30-row moving average)
btc_df = btc_df.dropna()

# The 30-row window needs at least 30 Bitcoin observations to leave any row
# behind; fail here with a clear message instead of deep inside model fitting.
if btc_df.empty:
    raise ValueError(
        "No Bitcoin rows remain after computing the 30-row rolling features; "
        "the price series must contain at least 30 observations."
    )

# Plot price trend
plt.figure(figsize=(10,5))
plt.plot(btc_df.index, btc_df['price'], label="Price")
plt.plot(btc_df.index, btc_df['ma_7d'], label="7-Day MA", linestyle="--")
plt.plot(btc_df.index, btc_df['ma_30d'], label="30-Day MA", linestyle="--")
plt.legend()
plt.title("Bitcoin Price Trend")
plt.show()

# --- ARIMA MODEL ---
# predict() with no arguments returns in-sample predictions aligned to the
# training index, so they can be stored as a column.
model_arima = ARIMA(btc_df['price'], order=(5,1,0))
arima_result = model_arima.fit()
btc_df['arima_forecast'] = arima_result.predict()

# --- SARIMA MODEL (Seasonality Consideration) ---
# Seasonal period of 7 rows.
sarima_model = SARIMAX(btc_df['price'], order=(1,1,1), seasonal_order=(1,1,1,7))
sarima_result = sarima_model.fit()
btc_df['sarima_forecast'] = sarima_result.predict()

# --- LSTM MODEL ---
# Chronological 80/20 split of the price column, kept 2-D (n_samples, 1).
data = btc_df[['price']].values
train_size = int(len(data) * 0.8)
train, test = data[:train_size], data[train_size:]

# Prepare LSTM sequences
def create_sequences(data, seq_length=5):
    """Slice a series into overlapping windows and their next-step targets.

    Parameters
    ----------
    data : array-like, first axis is time
        Observations in chronological order, e.g. shape (n_samples, n_features).
    seq_length : int, default 5
        Number of consecutive observations in each input window.

    Returns
    -------
    X : numpy.ndarray, shape (n_windows, seq_length, ...)
        Window i holds data[i : i + seq_length].
    y : numpy.ndarray, shape (n_windows, ...)
        Target i is data[i + seq_length], the observation right after window i.

    n_windows is max(len(data) - seq_length, 0). When the input is too short
    for a single window, both arrays are empty but keep their full rank, so
    callers can still read X.shape[1] and X.shape[2].
    """
    # Local import: no cell before this one imports numpy on a fresh kernel.
    import numpy as np

    data = np.asarray(data)
    n_windows = max(len(data) - seq_length, 0)
    feature_shape = data.shape[1:]

    if n_windows == 0:
        empty_X = np.empty((0, seq_length) + feature_shape, dtype=data.dtype)
        empty_y = np.empty((0,) + feature_shape, dtype=data.dtype)
        return empty_X, empty_y

    X = np.stack([data[start:start + seq_length] for start in range(n_windows)])
    # The targets are the series shifted by seq_length; copy so the result
    # does not alias the caller's array.
    y = data[seq_length:seq_length + n_windows].copy()
    return X, y

# Window the train and test partitions into (samples, timesteps, features) arrays.
(X_train, y_train), (X_test, y_test) = create_sequences(train), create_sequences(test)

# LSTM Model: two stacked 50-unit LSTM layers feeding one linear output unit.
n_timesteps, n_features = X_train.shape[1], X_train.shape[2]
lstm_model = Sequential()
lstm_model.add(LSTM(50, return_sequences=True, input_shape=(n_timesteps, n_features)))
lstm_model.add(LSTM(50))
lstm_model.add(Dense(1))

lstm_model.compile(loss='mse', optimizer='adam')

# Train on the training windows, then predict the test windows.
lstm_model.fit(X_train, y_train, batch_size=16, epochs=10)
pred_lstm = lstm_model.predict(X_test)

# Plot forecast results: LSTM predictions are drawn against the last
# len(pred_lstm) index entries, the in-sample ARIMA / SARIMA columns against
# the whole index.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(btc_df.index[-len(pred_lstm):], pred_lstm, label="LSTM Forecast")
for forecast_col, legend_label in (
    ("arima_forecast", "ARIMA Forecast"),
    ("sarima_forecast", "SARIMA Forecast"),
):
    ax.plot(btc_df.index, btc_df[forecast_col], label=legend_label, linestyle="--")
ax.legend()
ax.set_title("Bitcoin Liquidity Forecast")
plt.show()

6. Model Training: Train the selected model using the processed dataset.

In [None]:
import statsmodels.api as sm
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Load processed dataset
df = pd.read_csv("crypto_liquidity_features.csv")

# Fail early, with the full list of problems, when the file does not have the
# long-format columns this cell needs.
# NOTE(review): section 4 writes this file from a per-coin merge whose shared
# columns carry _16 / _17 suffixes, so these names may be absent - reconcile
# the two schemas.
required_cols = {'date', 'coin', 'price', '24h_volume', 'liquidity_ratio'}
missing_cols = sorted(required_cols - set(df.columns))
if missing_cols:
    raise KeyError(
        f"crypto_liquidity_features.csv is missing required columns: {missing_cols}"
    )

# Convert date column to datetime format, index by it and sort chronologically
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date').sort_index()

# Select top cryptocurrency for forecasting (Example: Bitcoin).
# .copy() gives an independent frame, so adding the forecast columns below
# does not write into a slice of `df` (which triggers SettingWithCopyWarning).
btc_df = df.loc[df['coin'] == 'Bitcoin', ['price', '24h_volume', 'liquidity_ratio']].copy()

# --- ARIMA Model for Price Forecasting ---
# predict() with no arguments returns in-sample predictions aligned to the
# training index.
model_arima = ARIMA(btc_df['price'], order=(5,1,0))
arima_result = model_arima.fit()
btc_df['arima_forecast'] = arima_result.predict()

# --- SARIMA Model (Seasonality Consideration) ---
# Seasonal period of 7 rows.
sarima_model = SARIMAX(btc_df['price'], order=(1,1,1), seasonal_order=(1,1,1,7))
sarima_result = sarima_model.fit()
btc_df['sarima_forecast'] = sarima_result.predict()

# --- LSTM Model ---
# Chronological 80/20 split of the price column, kept 2-D (n_samples, 1).
data = btc_df[['price']].values
train_size = int(len(data) * 0.8)
train, test = data[:train_size], data[train_size:]

# Prepare LSTM sequences
def create_sequences(data, seq_length=5):
    """Slice a series into overlapping windows and their next-step targets.

    Parameters
    ----------
    data : array-like, first axis is time
        Observations in chronological order, e.g. shape (n_samples, n_features).
    seq_length : int, default 5
        Number of consecutive observations in each input window.

    Returns
    -------
    X : numpy.ndarray, shape (n_windows, seq_length, ...)
        Window i holds data[i : i + seq_length].
    y : numpy.ndarray, shape (n_windows, ...)
        Target i is data[i + seq_length], the observation right after window i.

    n_windows is max(len(data) - seq_length, 0). When the input is too short
    for a single window, both arrays are empty but keep their full rank, so
    callers can still read X.shape[1] and X.shape[2].
    """
    # Local import: no cell before this one imports numpy on a fresh kernel.
    import numpy as np

    data = np.asarray(data)
    n_windows = max(len(data) - seq_length, 0)
    feature_shape = data.shape[1:]

    if n_windows == 0:
        empty_X = np.empty((0, seq_length) + feature_shape, dtype=data.dtype)
        empty_y = np.empty((0,) + feature_shape, dtype=data.dtype)
        return empty_X, empty_y

    X = np.stack([data[start:start + seq_length] for start in range(n_windows)])
    # The targets are the series shifted by seq_length; copy so the result
    # does not alias the caller's array.
    y = data[seq_length:seq_length + n_windows].copy()
    return X, y

# Window the train and test partitions into (samples, timesteps, features) arrays.
(X_train, y_train), (X_test, y_test) = create_sequences(train), create_sequences(test)

# Define LSTM Model: two stacked 50-unit LSTM layers, 20% dropout, then one
# linear output unit.
n_timesteps, n_features = X_train.shape[1], X_train.shape[2]
lstm_model = Sequential()
lstm_model.add(LSTM(50, return_sequences=True, input_shape=(n_timesteps, n_features)))
lstm_model.add(LSTM(50))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(1))

lstm_model.compile(loss='mse', optimizer='adam')

# Train on the training windows, then predict the test windows.
lstm_model.fit(X_train, y_train, batch_size=16, epochs=10)
pred_lstm = lstm_model.predict(X_test)

# --- Plot Forecast Results ---
# LSTM predictions are drawn against the last len(pred_lstm) index entries,
# the in-sample ARIMA / SARIMA columns against the whole index.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(btc_df.index[-len(pred_lstm):], pred_lstm, label="LSTM Forecast")
for forecast_col, legend_label in (
    ("arima_forecast", "ARIMA Forecast"),
    ("sarima_forecast", "SARIMA Forecast"),
):
    ax.plot(btc_df.index, btc_df[forecast_col], label=legend_label, linestyle="--")
ax.legend()
ax.set_title("Bitcoin Liquidity Forecast")
plt.show()

7. Model Evaluation: Assess model performance using metrics such as RMSE, MAE, and R² score.

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Read the engineered feature file; forecast columns are generated below when
# the file does not already carry them.
df = pd.read_csv("crypto_liquidity_features.csv")

# Row mask for the coin under evaluation (adjust for your target cryptocurrency)
is_bitcoin = df['coin'] == 'Bitcoin'
forecast_cols = ['arima_forecast', 'sarima_forecast']

# Fit both models whenever at least one forecast column is absent.
# NOTE(review): predict() here is in-sample, so the metrics computed from
# these columns describe fit on the training data, not out-of-sample accuracy.
if not set(forecast_cols).issubset(df.columns):
    from statsmodels.tsa.arima.model import ARIMA
    from statsmodels.tsa.statespace.sarimax import SARIMAX

    btc_df = df[is_bitcoin]

    # ARIMA(5, 1, 0) on the Bitcoin price; predictions are written back onto
    # the Bitcoin rows of the full frame.
    arima_model = ARIMA(btc_df['price'], order=(5, 1, 0))
    arima_result = arima_model.fit()
    df.loc[is_bitcoin, 'arima_forecast'] = arima_result.predict()

    # SARIMA(1, 1, 1)x(1, 1, 1, 7) on the same series.
    sarima_model = SARIMAX(btc_df['price'], order=(1, 1, 1), seasonal_order=(1, 1, 1, 7))
    sarima_result = sarima_model.fit()
    df.loc[is_bitcoin, 'sarima_forecast'] = sarima_result.predict()

# Keep only the actual price and the two forecasts for the Bitcoin rows.
btc_df = df.loc[is_bitcoin, ['price'] + forecast_cols]

# Rows lacking any of the three values cannot be scored.
btc_df.dropna(inplace=True)

# Define evaluation function
def evaluate_model(y_true, y_pred, model_name):
    """Print RMSE, MAE and R² for one model's predictions.

    y_true and y_pred are passed straight to the scikit-learn metric
    functions; model_name only appears in the printed heading. Nothing is
    returned.
    """
    report_lines = (
        ("  RMSE: ", np.sqrt(mean_squared_error(y_true, y_pred))),
        ("  MAE:  ", mean_absolute_error(y_true, y_pred)),
        ("  R² Score: ", r2_score(y_true, y_pred)),
    )

    print(f"Performance of {model_name}:")
    for prefix, score in report_lines:
        print(f"{prefix}{score:.4f}")
    print("-" * 40)

# Score the two statistical models against the observed price.
for model_label, forecast_col in (("ARIMA", "arima_forecast"), ("SARIMA", "sarima_forecast")):
    evaluate_model(btc_df['price'], btc_df[forecast_col], model_label)

# LSTM predictions are optional: they are scored only when a CSV of them
# exists, and are compared against the last len(lstm_preds) prices.
try:
    lstm_preds = pd.read_csv("lstm_forecasts.csv")
except FileNotFoundError:
    print("LSTM forecast file not found, skipping LSTM evaluation.")
else:
    evaluate_model(btc_df['price'][-len(lstm_preds):], lstm_preds['lstm_forecast'], "LSTM")

8. Hyperparameter Tuning: Optimize model parameters for better accuracy.

In [None]:
import itertools
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import keras_tuner

# Load dataset
df = pd.read_csv("crypto_liquidity_features.csv")
df['date'] = pd.to_datetime(df['date'])
# Index by date and order chronologically (as sections 5 and 6 do) so the
# models below are fitted on a time-ordered series.
df = df.set_index('date').sort_index()

# Select Bitcoin data (modify for other assets)
btc_df = df[df['coin'] == 'Bitcoin']['price']

# --- ARIMA Hyperparameter Tuning ---
# Exhaustive grid over p, d, q in 0..3; the order with the lowest AIC wins.
p = d = q = range(0, 4)
pdq_combinations = list(itertools.product(p, d, q))

best_aic = float("inf")
best_order = None
last_arima_error = None

for order in pdq_combinations:
    try:
        model = ARIMA(btc_df, order=order)
        results = model.fit()
        if results.aic < best_aic:
            best_aic = results.aic
            best_order = order
    except Exception as exc:
        # Some orders cannot be estimated; skip them. Catching Exception
        # (not a bare except) lets KeyboardInterrupt / SystemExit through, so
        # the search can still be interrupted.
        last_arima_error = exc
        continue

if best_order is None:
    # Every candidate failed: surface the reason instead of silently
    # reporting "None".
    print(f"No ARIMA order could be fitted; last error: {last_arima_error!r}")

print(f"Best ARIMA Order: {best_order} with AIC: {best_aic}")

# --- SARIMA Hyperparameter Tuning ---
# Grid over seasonal P, D, Q in 0..2 with a fixed period of 7 and a fixed
# non-seasonal order (1, 1, 1); selection is by in-sample residual MSE.
# NOTE(review): in-sample MSE tends to favour the most flexible model -
# consider AIC, as used for ARIMA above.
P = D = Q = range(0, 3)
seasonal_pdq = list(itertools.product(P, D, Q, [7]))

best_mse = float("inf")
best_seasonal_order = None
last_sarima_error = None

for seasonal_order in seasonal_pdq:
    try:
        model = SARIMAX(btc_df, order=(1,1,1), seasonal_order=seasonal_order)
        results = model.fit()
        mse = np.mean(np.square(results.resid))
        if mse < best_mse:
            best_mse = mse
            best_seasonal_order = seasonal_order
    except Exception as exc:
        # Same policy as the ARIMA loop: skip failing candidates, stay
        # interruptible.
        last_sarima_error = exc
        continue

if best_seasonal_order is None:
    print(f"No SARIMA seasonal order could be fitted; last error: {last_sarima_error!r}")

print(f"Best SARIMA Order: {best_seasonal_order} with MSE: {best_mse}")

# --- LSTM Hyperparameter Tuning ---
def build_lstm_model(hp):
    """Build and compile the LSTM network for one KerasTuner trial.

    `hp` is the tuner-supplied hyperparameter object. Both LSTM layers request
    the integer hyperparameter named 'units' (50 to 150 in steps of 50);
    presumably KerasTuner resolves the repeated name to one shared value -
    confirm. The input is fixed at 5 timesteps of 1 feature. Returns the model
    compiled with the Adam optimizer and MSE loss.
    """
    unit_range = dict(min_value=50, max_value=150, step=50)
    network = Sequential([
        LSTM(units=hp.Int('units', **unit_range), return_sequences=True, input_shape=(5, 1)),
        LSTM(units=hp.Int('units', **unit_range)),
        Dropout(0.2),
        Dense(1),
    ])
    network.compile(loss="mse", optimizer="adam")
    return network

# Random search over the LSTM hyperparameters. Trials are ranked by
# validation loss: ranking by training loss would simply reward the
# configuration that memorises the training windows best.
tuner = keras_tuner.RandomSearch(
    build_lstm_model,
    objective="val_loss",
    max_trials=5
)

# Prepare LSTM data: chronological 80/20 split of the price series, reshaped
# to a single feature column.
data = btc_df.values.reshape(-1, 1)
train_size = int(len(data) * 0.8)
train, test = data[:train_size], data[train_size:]

# Each sample is `window` consecutive prices; its target is the next price.
# `window` must equal the timestep count in build_lstm_model's
# input_shape=(5, 1).
window = 5
n_windows = len(train) - window
X_train = np.array([train[i:i + window] for i in range(n_windows)])
y_train = np.array([train[i + window] for i in range(n_windows)])

# validation_split holds out the final 20% of the training windows, which
# supplies the val_loss the tuner ranks on and keeps the hold-out later in
# time than the data used for fitting.
tuner.search(X_train, y_train, epochs=10, batch_size=16, validation_split=0.2)

best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best LSTM Hyperparameters: {best_hyperparameters.values}")

9. Model Testing & Validation: Test the model on unseen data and analyze predictions.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load processed dataset
df = pd.read_csv("crypto_liquidity_features.csv")

# Fail early, with the full list of problems, when the file does not have the
# long-format columns this cell needs.
# NOTE(review): section 4 writes this file from a per-coin merge whose shared
# columns carry _16 / _17 suffixes, so these names may be absent - reconcile
# the two schemas.
required_cols = {'date', 'coin', 'price'}
missing_cols = sorted(required_cols - set(df.columns))
if missing_cols:
    raise KeyError(
        f"crypto_liquidity_features.csv is missing required columns: {missing_cols}"
    )

df['date'] = pd.to_datetime(df['date'])
# Sort chronologically before splitting: the 80/20 split below is positional,
# so an unsorted frame would put later dates into the training set.
df = df.set_index('date').sort_index()

# Select Bitcoin data for testing (Modify for other cryptocurrencies)
btc_df = df.loc[df['coin'] == 'Bitcoin', ['price']]

# Split into training & test sets (80% training, 20% test), oldest rows first
train_size = int(len(btc_df) * 0.8)
train, test = btc_df.iloc[:train_size], btc_df.iloc[train_size:]

# --- ARIMA Model Testing ---
arima_model = ARIMA(train, order=(5,1,0))
arima_result = arima_model.fit()

# Predict ARIMA on test set: one forecast step per held-out row
arima_preds = arima_result.forecast(steps=len(test))

# --- SARIMA Model Testing ---
sarima_model = SARIMAX(train, order=(1,1,1), seasonal_order=(1,1,1,7))
sarima_result = sarima_model.fit()

# Predict SARIMA on test set
sarima_preds = sarima_result.forecast(steps=len(test))

# --- LSTM Model Testing ---
# Training prices as a 2-D array (n_samples, 1) and the window length used
# for the LSTM sequences.
data = train.values
seq_length = 5

def create_sequences(data, seq_length=5):
    """Slice a series into overlapping windows and their next-step targets.

    Parameters
    ----------
    data : array-like, first axis is time
        Observations in chronological order, e.g. shape (n_samples, n_features).
    seq_length : int, default 5
        Number of consecutive observations in each input window. The default
        matches the notebook's other create_sequences definitions and lets
        the function be called with the data alone.

    Returns
    -------
    X : numpy.ndarray, shape (n_windows, seq_length, ...)
        Window i holds data[i : i + seq_length].
    y : numpy.ndarray, shape (n_windows, ...)
        Target i is data[i + seq_length], the observation right after window i.

    n_windows is max(len(data) - seq_length, 0). When the input is too short
    for a single window, both arrays are empty but keep their full rank, so
    callers can still read X.shape[1] and X.shape[2].
    """
    data = np.asarray(data)
    n_windows = max(len(data) - seq_length, 0)
    feature_shape = data.shape[1:]

    if n_windows == 0:
        empty_X = np.empty((0, seq_length) + feature_shape, dtype=data.dtype)
        empty_y = np.empty((0,) + feature_shape, dtype=data.dtype)
        return empty_X, empty_y

    X = np.stack([data[start:start + seq_length] for start in range(n_windows)])
    # The targets are the series shifted by seq_length; copy so the result
    # does not alias the caller's array.
    y = data[seq_length:seq_length + n_windows].copy()
    return X, y

# Window both partitions with the cell's seq_length. The argument is passed
# explicitly: create_sequences is declared with seq_length as a parameter,
# and omitting it raises TypeError when that parameter has no default.
X_train, y_train = create_sequences(data, seq_length)
X_test, y_test = create_sequences(test.values, seq_length)

# Define LSTM Model: two stacked 50-unit LSTM layers, 20% dropout, then one
# linear output unit. The input shape is read from the windowed training data.
lstm_model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    LSTM(50),
    Dropout(0.2),
    Dense(1)
])

lstm_model.compile(optimizer='adam', loss='mse')

# Train LSTM Model
lstm_model.fit(X_train, y_train, epochs=10, batch_size=16)

# Predict using LSTM: one prediction per test window, so the first
# seq_length test rows have no prediction.
lstm_preds = lstm_model.predict(X_test)

# --- Model Evaluation ---
def evaluate_model(y_true, y_pred, model_name):
    """Print RMSE, MAE and R² for one model's predictions.

    y_true and y_pred are passed straight to the scikit-learn metric
    functions; model_name only appears in the printed heading. Nothing is
    returned.
    """
    report_lines = (
        ("  RMSE: ", np.sqrt(mean_squared_error(y_true, y_pred))),
        ("  MAE:  ", mean_absolute_error(y_true, y_pred)),
        ("  R² Score: ", r2_score(y_true, y_pred)),
    )

    print(f"{model_name} Performance:")
    for prefix, score in report_lines:
        print(f"{prefix}{score:.4f}")
    print("-" * 40)

# Score every model on held-out data. ARIMA / SARIMA are compared with the
# whole test frame; the LSTM with the targets of its own test windows.
lstm_flat = lstm_preds.flatten()
for truth, forecast, model_label in (
    (test, arima_preds, "ARIMA"),
    (test, sarima_preds, "SARIMA"),
    (y_test, lstm_flat, "LSTM"),
):
    evaluate_model(truth, forecast, model_label)

# --- Visualization ---
# The LSTM line covers only the last len(lstm_preds) test dates.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(test.index, test, label="Actual Prices", color="black")
for forecast, legend_label, line_color in (
    (arima_preds, "ARIMA Predictions", "blue"),
    (sarima_preds, "SARIMA Predictions", "green"),
):
    ax.plot(test.index, forecast, label=legend_label, linestyle="--", color=line_color)
ax.plot(test.index[-len(lstm_preds):], lstm_flat, label="LSTM Predictions", linestyle="--", color="red")
ax.legend()
ax.set_title("Bitcoin Liquidity Forecast vs. Actual Prices")
plt.show()

10. Local Deployment: Deploy the trained model locally using Flask or Streamlit for testing.

In [None]:
from flask import Flask, request, jsonify
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA

app = Flask(__name__)

# Read the feature file and pull out the Bitcoin price series.
df = pd.read_csv("crypto_liquidity_features.csv")
is_bitcoin = df['coin'] == 'Bitcoin'
btc_df = df.loc[is_bitcoin, 'price']

# The ARIMA(5, 1, 0) model is fitted here, when this cell runs; the /predict
# route reuses the fitted `result` for every request.
model = ARIMA(btc_df, order=(5, 1, 0))
result = model.fit()

@app.route('/predict', methods=['POST'])
def predict():
    """Return an ARIMA price forecast as JSON.

    Request body (optional JSON object): {"steps": <int>} gives the number of
    future periods to forecast. It defaults to 5 when the body is absent, is
    not valid JSON, or has no "steps" key.

    Responses:
        200 {"prediction": [...]} with one value per forecast step.
        400 {"error": "..."} when the body is JSON but not an object, or when
        "steps" is not an integer between 1 and max_steps.
    """
    # Upper bound so a single request cannot demand an unbounded forecast.
    max_steps = 365

    # silent=True returns None instead of raising when the body is missing or
    # is not valid JSON.
    data = request.get_json(silent=True)
    if data is None:
        data = {}
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    steps = data.get("steps", 5)
    # bool is a subclass of int, so it is rejected explicitly.
    if isinstance(steps, bool) or not isinstance(steps, int) or not 1 <= steps <= max_steps:
        return jsonify({"error": f"'steps' must be an integer between 1 and {max_steps}."}), 400

    forecast = result.forecast(steps=steps).tolist()

    return jsonify({"prediction": forecast})

if __name__ == '__main__':
    # Start Flask's development server for local testing only.
    # use_reloader=False: debug mode otherwise starts the auto-reloader, which
    # re-executes the program in a child process - that cannot work inside a
    # notebook kernel and would refit the ARIMA model on every restart.
    # NOTE(review): debug=True also enables the interactive debugger, which
    # can execute arbitrary code; never expose this server beyond localhost.
    app.run(debug=True, use_reloader=False)