Library, Module, and Class Import and Environment Setup

In [None]:
import pandas as pd
import re
import itertools
from sklearn.linear_model import LinearRegression

# Read the player-count export, convert 'DateTime' to real timestamps and
# put the rows in chronological order. Nothing is validated explicitly: a
# missing 'DateTime' or 'Average Players' column surfaces as a KeyError.
df = (
    pd.read_csv("Limbus_Company_Active_Player.csv")
    .assign(DateTime=lambda frame: pd.to_datetime(frame['DateTime']))
    .sort_values('DateTime')
)

# The series analysed in every later cell (a_n in the recurrence notation)
players = df['Average Players']

# Largest lag examined. Kept at 30 so the combination searches stay cheap
# while still covering a diverse set of lags.
# NOTE(review): a lag is a row offset; it equals a number of days only if
# the file has exactly one row per day -- confirm against the data.
max_lag = 30

Lag Correlation Analysis

In [None]:
# Correlation between a_n and a_(n-lag) for every lag from 1 to max_lag,
# keyed by a label of the form 'a_n-<lag>'. Series.corr skips the rows
# where the shifted series is NaN.
correlations = {
    f'a_n-{lag}': players.corr(players.shift(lag))
    for lag in range(1, max_lag + 1)
}

# Rank the lags from strongest to weakest relationship, judged by |corr|
ranked_pairs = sorted(
    correlations.items(),
    key=lambda pair: abs(pair[1]),
    reverse=True,
)
correlations_sorted = dict(ranked_pairs)

# Show results
for lag, corr in correlations_sorted.items():
    print(f"{lag}: correlation = {corr:.4f}")

a_n-1: correlation = 0.9657
a_n-7: correlation = 0.9518
a_n-2: correlation = 0.9493
a_n-6: correlation = 0.9388
a_n-5: correlation = 0.9324
a_n-3: correlation = 0.9308
a_n-8: correlation = 0.9301
a_n-14: correlation = 0.9271
a_n-4: correlation = 0.9245
a_n-9: correlation = 0.9171
a_n-13: correlation = 0.9089
a_n-12: correlation = 0.9052
a_n-10: correlation = 0.9004
a_n-15: correlation = 0.8995
a_n-11: correlation = 0.8955
a_n-16: correlation = 0.8829
a_n-21: correlation = 0.8791
a_n-20: correlation = 0.8663
a_n-17: correlation = 0.8641
a_n-19: correlation = 0.8626
a_n-18: correlation = 0.8598
a_n-22: correlation = 0.8570
a_n-28: correlation = 0.8466
a_n-23: correlation = 0.8430
a_n-27: correlation = 0.8310
a_n-24: correlation = 0.8296
a_n-26: correlation = 0.8262
a_n-29: correlation = 0.8226
a_n-25: correlation = 0.8195
a_n-30: correlation = 0.8125


Single-Lag Regression Modeling

In [None]:
results = [] # holds tuples of (lag, alpha, beta, r2)

# Fit a_n ≈ alpha * a_(n-k) + beta separately for each lag k.
# The lags are iterated directly instead of being parsed back out of the
# string keys of `correlations`, so this cell only needs `players` and
# `max_lag` from the setup cell.
for k in range(1, max_lag + 1):
    df_lag = pd.DataFrame({
        'y': players, # y represents a_n
        f'x_{k}': players.shift(k) # x represents a_(n-k)
    }).dropna() # drops the first k rows (no lagged value) and any missing observations

    X = df_lag[[f'x_{k}']].values # 2-D array: one feature column
    y = df_lag['y'].values

    model = LinearRegression() # ordinary least squares with intercept
    model.fit(X, y)

    alpha = model.coef_[0] # slope
    beta = model.intercept_ # intercept
    r2 = model.score(X, y) # R-squared on the same rows used for fitting

    # Store tuple (lag, slope, intercept, r2)
    results.append((k, alpha, beta, r2))

# Sort the results by R-squared score in descending order
results.sort(key=lambda x: x[3], reverse=True)

# Print sorted results (top 5)
for k, alpha, beta, r2 in results[:5]:
    print(f"Lag {k:2d}: a_n ≈ {alpha:.4f} * a_(n-{k}) + {beta:.2f} | R² = {r2:.4f}")

Lag  1: a_n ≈ 0.9651 * a_(n-1) + 581.92 | R² = 0.9326
Lag  7: a_n ≈ 0.9510 * a_(n-7) + 832.15 | R² = 0.9059
Lag  2: a_n ≈ 0.9487 * a_(n-2) + 852.65 | R² = 0.9011
Lag  6: a_n ≈ 0.9380 * a_(n-6) + 1065.79 | R² = 0.8814
Lag  5: a_n ≈ 0.9318 * a_(n-5) + 1137.65 | R² = 0.8694


Multi-Lag Regression Modeling

* 2-Lag

In [None]:
r = 2 # number of lagged terms in each model
lags = list(range(1, max_lag + 1)) # candidate lags 1..max_lag
results = [] # holds tuples of (combination, slopes, intercept, r2)

# Shift the series once per lag rather than once per combination
lagged_columns = {k: players.shift(k) for k in lags}

# Fit a_n ≈ alpha_1 * a_(n-k1) + alpha_2 * a_(n-k2) + beta for every
# 2-lag combination.
# NOTE: dropna() removes at least the first max(combination) rows, so models
# using larger lags are fitted and scored on slightly fewer rows.
for combination in itertools.combinations(lags, r):
    lagged_df = pd.DataFrame({
        'a_n': players, # dependent variable
        **{f'a_n-{k}': lagged_columns[k] for k in combination}, # independent variables
    }).dropna()

    X = lagged_df.drop(columns='a_n')
    y = lagged_df['a_n']

    model = LinearRegression()
    model.fit(X, y)

    alpha = model.coef_ # one slope per lag, in combination order
    beta = model.intercept_ # intercept
    r2 = model.score(X, y) # R-squared on the same rows used for fitting

    # Store tuple (combination, slopes, intercept, r2)
    results.append((combination, alpha, beta, r2))

# Sort the results by r2 score in descending order
results.sort(key=lambda x: x[3], reverse=True)

# Print sorted results (top 5)
for combination, coefs, intercept, r2 in results[:5]:
    terms = " + ".join([f"{coef:.4f}·a_(n-{lag})" for coef, lag in zip(coefs, combination)])
    print(f"Combination {combination}: a_n ≈ {terms} + {intercept:.2f} | R² = {r2:.4f}")

Combination (1, 7): a_n ≈ 0.6105·a_(n-1) + 0.3792·a_(n-7) + 183.06 | R² = 0.9512
Combination (1, 14): a_n ≈ 0.7108·a_(n-1) + 0.2802·a_(n-14) + 174.49 | R² = 0.9474
Combination (1, 5): a_n ≈ 0.7176·a_(n-1) + 0.2687·a_(n-5) + 231.90 | R² = 0.9442
Combination (1, 6): a_n ≈ 0.7067·a_(n-1) + 0.2781·a_(n-6) + 259.47 | R² = 0.9438
Combination (1, 13): a_n ≈ 0.7880·a_(n-1) + 0.1968·a_(n-13) + 269.39 | R² = 0.9409


* 3-Lag

In [None]:
r = 3 # number of lagged terms in each model
lags = list(range(1, max_lag + 1)) # candidate lags 1..max_lag
results = [] # holds tuples of (combination, slopes, intercept, r2)

# Shift the series once per lag rather than three times for each of the
# several thousand combinations
lagged_columns = {k: players.shift(k) for k in lags}

# Fit a_n ≈ alpha_1 * a_(n-k1) + alpha_2 * a_(n-k2) + alpha_3 * a_(n-k3) + beta
# for every 3-lag combination.
# NOTE: dropna() removes at least the first max(combination) rows, so models
# using larger lags are fitted and scored on slightly fewer rows.
for combination in itertools.combinations(lags, r):
    lagged_df = pd.DataFrame({
        'a_n': players, # dependent variable
        **{f'a_n-{k}': lagged_columns[k] for k in combination}, # independent variables
    }).dropna()

    X = lagged_df.drop(columns='a_n')
    y = lagged_df['a_n']

    model = LinearRegression()
    model.fit(X, y)

    alpha = model.coef_ # one slope per lag, in combination order
    beta = model.intercept_ # intercept
    r2 = model.score(X, y) # R-squared on the same rows used for fitting

    # Store tuple (combination, slopes, intercept, r2)
    results.append((combination, alpha, beta, r2))

# Sort the results by r2 score in descending order
results.sort(key=lambda x: x[3], reverse=True)

# Print sorted results (top 5)
for combination, coefs, intercept, r2 in results[:5]:
    terms = " + ".join([f"{coef:.4f}·a_(n-{lag})" for coef, lag in zip(coefs, combination)])
    print(f"Combination {combination}: a_n ≈ {terms} + {intercept:.2f} | R² = {r2:.4f}")

Combination (1, 14, 15): a_n ≈ 0.8746·a_(n-1) + 0.6886·a_(n-14) + -0.5764·a_(n-15) + 225.30 | R² = 0.9652
Combination (1, 7, 8): a_n ≈ 0.7450·a_(n-1) + 0.5944·a_(n-7) + -0.3529·a_(n-8) + 228.67 | R² = 0.9574
Combination (1, 28, 29): a_n ≈ 0.9245·a_(n-1) + 0.5861·a_(n-28) + -0.5262·a_(n-29) + 267.77 | R² = 0.9570
Combination (1, 7, 15): a_n ≈ 0.6859·a_(n-1) + 0.4733·a_(n-7) + -0.1770·a_(n-15) + 307.59 | R² = 0.9549
Combination (1, 14, 22): a_n ≈ 0.7515·a_(n-1) + 0.4549·a_(n-14) + -0.2274·a_(n-22) + 367.66 | R² = 0.9542


Summation-Lag Correlation Analysis

In [None]:
correlation_sums = {} # maps 'sum_lag_<lag>' -> correlation with a_n

# NOTE(review): the earlier version seeded the running sum with a copy of
# `players`, so the "lagged" sum also contained a_n itself and the target
# leaked into the predictor. Outputs recorded before this fix are inflated
# and must be regenerated by re-running the cell.
for lag in range(2, max_lag + 1):
    # Sum of a_(n-1) ... a_(n-lag): shift by one row so a_n is excluded, then
    # total a trailing window of `lag` rows. The first `lag` rows are NaN
    # because the window is incomplete there.
    sum_lag = players.shift(1).rolling(window=lag).sum()

    # Pair the target with its lagged sum and keep only complete rows
    sum_players = pd.DataFrame({
        'a_n': players, # a_n as dependent variable
        'sum_lag': sum_lag # the sum from a_n-1 to a_n-lag as independent variable
    }).dropna()

    # Correlation between current a_n and the sum of its previous `lag` values
    corr = sum_players['a_n'].corr(sum_players['sum_lag'])
    correlation_sums[f'sum_lag_{lag}'] = corr

# Sort the dictionary of summation-lag correlations in descending order based on the absolute value of the correlation
sorted_corr = dict(sorted(correlation_sums.items(), key=lambda x: abs(x[1]), reverse=True))

# Show results
for label, corr in sorted_corr.items():
    print(f"{label}: correlation = {corr:.4f}")

sum_lag_2: correlation = 0.9849
sum_lag_3: correlation = 0.9785
sum_lag_4: correlation = 0.9740
sum_lag_7: correlation = 0.9733
sum_lag_5: correlation = 0.9725
sum_lag_6: correlation = 0.9720
sum_lag_8: correlation = 0.9719
sum_lag_9: correlation = 0.9697
sum_lag_10: correlation = 0.9666
sum_lag_11: correlation = 0.9639
sum_lag_12: correlation = 0.9622
sum_lag_14: correlation = 0.9616
sum_lag_13: correlation = 0.9612
sum_lag_15: correlation = 0.9603
sum_lag_16: correlation = 0.9582
sum_lag_17: correlation = 0.9555
sum_lag_18: correlation = 0.9527
sum_lag_19: correlation = 0.9506
sum_lag_20: correlation = 0.9490
sum_lag_21: correlation = 0.9481
sum_lag_22: correlation = 0.9464
sum_lag_23: correlation = 0.9443
sum_lag_24: correlation = 0.9417
sum_lag_25: correlation = 0.9392
sum_lag_26: correlation = 0.9372
sum_lag_27: correlation = 0.9356
sum_lag_28: correlation = 0.9347
sum_lag_29: correlation = 0.9330
sum_lag_30: correlation = 0.9310


Summation-Lag Regression Modeling

In [None]:
summation_models = [] # holds tuples of (k, alpha, beta, r2)

# Fit a_n ≈ alpha * SUM(a_(n-1) .. a_(n-k)) + beta for each window length k.
# NOTE(review): the earlier version seeded the running sum with a copy of
# `players`, so the regressor contained a_n itself (k + 1 terms) and the
# reported R-squared was inflated. Outputs recorded before this fix must be
# regenerated by re-running the cell.
for k in range(2, max_lag + 1):
    # Shift by one row so a_n is excluded, then total a trailing window of
    # k rows; the first k rows are NaN because the window is incomplete.
    sum_lag = players.shift(1).rolling(window=k).sum()

    # Pair the target with its lagged sum and keep only complete rows
    df_sum = pd.DataFrame({
        'a_n': players, # a_n as dependent variable
        'sum_k': sum_lag # the sum from a_n-1 to a_n-k as independent variable
    }).dropna()

    X = df_sum[['sum_k']].values # 2-D array: one feature column
    y = df_sum['a_n'].values

    model = LinearRegression() # ordinary least squares with intercept
    model.fit(X, y)

    alpha = model.coef_[0] # slope
    beta = model.intercept_ # intercept
    r2 = model.score(X, y) # R-squared on the same rows used for fitting

    # Store tuple (k, slope, intercept, r2)
    summation_models.append((k, alpha, beta, r2))

# Sort the results by R-squared score in descending order
summation_models.sort(key=lambda x: x[3], reverse=True)

# Print sorted results (top 5)
for k, alpha, beta, r2 in summation_models[:5]:
    print(f"k = {k:2d}: a_n ≈ {alpha:.4f} * SUM(a_(n-1) to a_(n-{k})) + {beta:.2f} | R² = {r2:.4f}")

k =  2: a_n ≈ 0.3327 * SUM(a_(n-1) to a_(n-2)) + 38.06 | R² = 0.9701
k =  3: a_n ≈ 0.2488 * SUM(a_(n-1) to a_(n-3)) + 80.74 | R² = 0.9574
k =  4: a_n ≈ 0.1988 * SUM(a_(n-1) to a_(n-4)) + 104.90 | R² = 0.9487
k =  7: a_n ≈ 0.1247 * SUM(a_(n-1) to a_(n-7)) + 52.02 | R² = 0.9473
k =  5: a_n ≈ 0.1658 * SUM(a_(n-1) to a_(n-5)) + 97.45 | R² = 0.9457
