Space for intro and the likes


In [1]:
# Install darts library
!pip install darts



In [2]:
import pandas as pd
from pathlib import Path
from darts import TimeSeries
import numpy as np
import pickle

# Define the path to the pickles directory
PICKLES_PATH = Path('pickles')

def load_pickle(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# Load all pickled data
ark_players = load_pickle(PICKLES_PATH / 'ark_players.pkl')
ark_reviews = load_pickle(PICKLES_PATH / 'ark_reviews.pkl')
ark_updates = load_pickle(PICKLES_PATH / 'ark_updates_analysis.pkl')

isle_merged = load_pickle(PICKLES_PATH / 'isle_merged.pkl')
isle_reviews = load_pickle(PICKLES_PATH / 'isle_reviews.pkl')

nms_players = load_pickle(PICKLES_PATH / 'noman_players.pkl')
nms_reviews = load_pickle(PICKLES_PATH / 'nms_reviews.pkl')
nms_updates = load_pickle(PICKLES_PATH / 'nms_updates_analysis.pkl')

In [3]:
#Checking the data that got brought in before merging to one for DARTS
def explore_data(data, name, level=0):
    indent = "  " * level
    print(f"\n{indent}{'='*50}")
    print(f"{indent}{name} Data Exploration")
    print(f"{indent}{'='*50}")
    
    if isinstance(data, pd.DataFrame):
        print(f"{indent}Type: pandas DataFrame")
        print(f"{indent}Shape: {data.shape}")
        print(f"{indent}Columns:")
        for col in data.columns:
            print(f"{indent}- {col}")
        print(f"{indent}Data Types:")
        print(data.dtypes)
        print(f"{indent}First few rows:")
        print(data.head())
        print(f"{indent}Descriptive Statistics:")
        print(data.describe(include='all'))
    
    elif isinstance(data, dict):
        print(f"{indent}Type: Dictionary")
        print(f"{indent}Number of keys: {len(data)}")
        print(f"{indent}Keys:")
        for key, value in data.items():
            print(f"{indent}- {key}")
            if isinstance(value, (pd.DataFrame, dict, list, np.ndarray)):
                explore_data(value, f"{name} - {key}", level+1)
            else:
                print(f"{indent}  Value: {value}")
    
    elif isinstance(data, list):
        print(f"{indent}Type: List")
        print(f"{indent}Length: {len(data)}")
        if len(data) > 0:
            print(f"{indent}First element type: {type(data[0])}")
            if len(data) > 5:
                print(f"{indent}First 5 elements: {data[:5]}")
            else:
                print(f"{indent}All elements: {data}")
    
    elif isinstance(data, np.ndarray):
        print(f"{indent}Type: NumPy Array")
        print(f"{indent}Shape: {data.shape}")
        print(f"{indent}Data Type: {data.dtype}")
        if data.ndim == 1 and len(data) <= 10:
            print(f"{indent}Values: {data}")
        elif data.ndim == 2 and data.shape[0] <= 5 and data.shape[1] <= 5:
            print(f"{indent}Values:\n{data}")
        else:
            print(f"{indent}Array is too large to display fully.")
    
    else:
        print(f"{indent}Type: {type(data)}")
        print(f"{indent}Value: {data}")
    
    print(f"{indent}{'='*50}\n")

# Explore each dataset before merging
print("Exploring datasets before merging:")

print("\nARK Datasets:")
explore_data(ark_players, "ARK Players")
explore_data(ark_reviews, "ARK Reviews")
explore_data(ark_updates, "ARK Updates")

print("\nThe Isle Datasets:")
explore_data(isle_merged, "The Isle Merged")
explore_data(isle_reviews, "The Isle Reviews")

print("\nNo Man's Sky Datasets:")
explore_data(nms_players, "No Man's Sky Players")
explore_data(nms_reviews, "No Man's Sky Reviews")
explore_data(nms_updates, "No Man's Sky Updates")

print("\nExploration of individual datasets complete.")

Exploring datasets before merging:

ARK Datasets:

ARK Players Data Exploration
Type: pandas DataFrame
Shape: (110, 3)
Columns:
- Month
- Avg Players
- Peak Players
Data Types:
Month           period[M]
Avg Players       float64
Peak Players        int64
dtype: object
First few rows:
     Month  Avg Players  Peak Players
0  2024-06     23631.25         39597
1  2024-05     21754.11         38243
2  2024-04     22072.68         38737
3  2024-03     21597.90         36713
4  2024-02     21550.53         37022
Descriptive Statistics:
          Month   Avg Players   Peak Players
count       110    110.000000     110.000000
unique      110           NaN            NaN
top     2024-06           NaN            NaN
freq          1           NaN            NaN
mean        NaN  43300.954818   75048.872727
std         NaN  13757.041360   28037.720389
min         NaN      5.280000      28.000000
25%         NaN  35413.797500   61141.750000
50%         NaN  42081.220000   72137.000000
75%         N

In [4]:
def prepare_nms_data(nms_players, nms_updates, nms_reviews):
    # Prepare player data
    if isinstance(nms_players['Month'].dtype, pd.PeriodDtype):
        nms_players['Month'] = nms_players['Month'].dt.to_timestamp()
    else:
        nms_players['Month'] = pd.to_datetime(nms_players['Month'])
    players = nms_players.set_index('Month')

    # Prepare update data
    if isinstance(nms_updates['Month'].dtype, pd.PeriodDtype):
        nms_updates['Month'] = nms_updates['Month'].dt.to_timestamp()
    else:
        nms_updates['Month'] = pd.to_datetime(nms_updates['Month'])
    
    # Group updates by month
    monthly_updates = nms_updates.groupby('Month').agg({
        'Category': 'first',
        'Title': lambda x: ' | '.join(x),
        'Update_Group': 'first',
        'Update_Title': 'first',
        'Patch_Count': 'sum'
    })

    # Prepare review data
    nms_reviews['Month'] = pd.to_datetime(nms_reviews['timestamp_created']).dt.to_period('M').dt.to_timestamp()
    
    # Aggregate review data by month
    monthly_reviews = nms_reviews.groupby('Month').agg({
        'review_id': 'count',
        'voted_up': 'sum',
        'review_length': 'mean',
        'weighted_vote_score': 'mean'
    }).rename(columns={
        'review_id': 'Review_Count',
        'voted_up': 'Positive_Reviews',
        'review_length': 'Avg_Review_Length',
        'weighted_vote_score': 'Avg_Weighted_Vote_Score'
    })
    
    # Calculate negative reviews and positive ratio
    monthly_reviews['Negative_Reviews'] = monthly_reviews['Review_Count'] - monthly_reviews['Positive_Reviews']
    monthly_reviews['Positive_Ratio'] = (monthly_reviews['Positive_Reviews'] / monthly_reviews['Review_Count'] * 100).round(2)

    # Merge all data
    merged = players.join(monthly_updates, how='left').join(monthly_reviews, how='left')

    # Fill missing values
    merged['Category'] = merged['Category'].ffill()
    merged['Title'] = merged['Title'].fillna('No Update')
    merged['Update_Group'] = merged['Update_Group'].ffill()
    merged['Update_Title'] = merged['Update_Title'].ffill()
    merged = merged.fillna({
        'Patch_Count': 0,
        'Review_Count': 0,
        'Positive_Reviews': 0,
        'Negative_Reviews': 0,
        'Positive_Ratio': 0,
        'Avg_Review_Length': 0,
        'Avg_Weighted_Vote_Score': 0
    })

    # Add 'Update' column (1 if there was an update that month, 0 otherwise)
    merged['Update'] = (merged['Patch_Count'] > 0).astype(int)

    # Reset index to make 'Month' a column
    merged = merged.reset_index()

    # Filter data for the specified date range
    start_date = pd.Timestamp('2019-01-01')
    end_date = pd.Timestamp('2024-06-30')
    merged = merged[(merged['Month'] >= start_date) & (merged['Month'] <= end_date)]

    # Reorder columns
    columns = ['Month', 'Avg Players', 'Peak Players', 'Update', 'Category', 'Title', 
               'Update_Group', 'Update_Title', 'Patch_Count', 'Review_Count', 
               'Positive_Reviews', 'Negative_Reviews', 'Positive_Ratio',
               'Avg_Review_Length', 'Avg_Weighted_Vote_Score']
    merged = merged.reindex(columns=columns)

    return merged

# Usage
nms_data = prepare_nms_data(nms_players, nms_updates, nms_reviews)

# Print the first few rows and last few rows to verify
print("No Man's Sky Data:")
print(nms_data.head())
print("\n")
print(nms_data.tail())

# Print a summary of updates with review data
print("\nUpdate Summary with Review Data:")
update_summary = nms_data[nms_data['Update'] == 1][['Month', 'Title', 'Patch_Count', 'Review_Count', 
                                                    'Positive_Reviews', 'Negative_Reviews', 'Positive_Ratio', 
                                                    'Avg_Review_Length', 'Avg_Weighted_Vote_Score']]
print(update_summary)

# Print the date range of the filtered data
print("\nDate Range of Filtered Data:")
print(f"Start Date: {nms_data['Month'].min()}")
print(f"End Date: {nms_data['Month'].max()}")

No Man's Sky Data:
       Month  Avg Players  Peak Players  Update Category              Title  \
0 2024-06-01      8135.78         20046       0      NaN          No Update   
1 2024-05-01      5123.80         14956       1   Update  Adrift Update 4.7   
2 2024-04-01      8735.20         18923       0   Update          No Update   
3 2024-03-01      7438.40         18819       0   Update          No Update   
4 2024-02-01      8006.92         23461       1   Update   Omega Update 4.5   

   Update_Group       Update_Title  Patch_Count  Review_Count  \
0           NaN                NaN          0.0         934.0   
1          30.0  Adrift Update 4.7          1.0         915.0   
2          30.0  Adrift Update 4.7          0.0        2177.0   
3          30.0  Adrift Update 4.7          0.0        1185.0   
4          28.0   Omega Update 4.5          2.0        1137.0   

   Positive_Reviews  Negative_Reviews  Positive_Ratio  Avg_Review_Length  \
0             852.0              82.0  

In [7]:
#Setting variable for the df I want from pickle 
ark_update_data = ark_updates['ark_monthly']

def prepare_ark_data(ark_update_data, ark_reviews, ark_players):
    # Prepare update data
    ark_update_data['Month'] = pd.to_datetime(ark_update_data['Start_Date']).dt.to_period('M')
    
    # Group updates by month
    monthly_updates = ark_update_data.groupby('Month').agg({
        'Major_Version': 'last',
        'Minor_Versions': 'sum',
        'Total_Changes': 'sum',
        'Cumulative_Changes': 'last',
        'Days_Since_Last_Update': 'last',
        'Days_In_Period': 'sum',
        'Update_Frequency': 'mean'
    })

    # Prepare review data
    if 'Date' in ark_reviews.columns:
        ark_reviews['Month'] = ark_reviews['Date'].dt.to_period('M')
    elif 'timestamp_created' in ark_reviews.columns:
        ark_reviews['Month'] = pd.to_datetime(ark_reviews['timestamp_created']).dt.to_period('M')
    elif isinstance(ark_reviews.index, pd.DatetimeIndex):
        ark_reviews['Month'] = ark_reviews.index.to_period('M')
    elif 'Month' in ark_reviews.columns:
        if not isinstance(ark_reviews['Month'].dtype, pd.PeriodDtype):
            ark_reviews['Month'] = pd.to_datetime(ark_reviews['Month']).dt.to_period('M')
    else:
        raise ValueError("Cannot find date information in ark_reviews DataFrame")
    
    # Aggregate review data by month
    monthly_reviews = ark_reviews.groupby('Month').agg({
        'review_id': 'count',
        'voted_up': 'sum',
        'review_length': 'mean',
        'weighted_vote_score': 'mean'
    }).rename(columns={
        'review_id': 'Review_Count',
        'voted_up': 'Positive_Reviews',
        'review_length': 'Avg_Review_Length',
        'weighted_vote_score': 'Avg_Weighted_Vote_Score'
    })
    
    # Calculate negative reviews and positive ratio
    monthly_reviews['Negative_Reviews'] = monthly_reviews['Review_Count'] - monthly_reviews['Positive_Reviews']
    monthly_reviews['Positive_Ratio'] = (monthly_reviews['Positive_Reviews'] / monthly_reviews['Review_Count'] * 100).round(2)

    # Prepare player data
    if 'Date' in ark_players.columns:
        ark_players['Month'] = ark_players['Date'].dt.to_period('M')
    elif isinstance(ark_players.index, pd.DatetimeIndex):
        ark_players['Month'] = ark_players.index.to_period('M')
    elif 'Month' in ark_players.columns:
        if not isinstance(ark_players['Month'].dtype, pd.PeriodDtype):
            ark_players['Month'] = pd.to_datetime(ark_players['Month']).dt.to_period('M')
    else:
        raise ValueError("Cannot find date information in ark_players DataFrame")
    
    ark_players = ark_players.set_index('Month')

    # Merge all data
    merged = ark_players.join(monthly_updates, how='left').join(monthly_reviews, how='left')
    
    # Fill missing values
    merged['Major_Version'] = merged['Major_Version'].ffill()
    merged = merged.fillna({
        'Minor_Versions': 0,
        'Total_Changes': 0,
        'Cumulative_Changes': merged['Cumulative_Changes'].ffill(),
        'Days_Since_Last_Update': merged['Days_Since_Last_Update'].ffill(),
        'Days_In_Period': 30,  # Assume 30 days if missing
        'Update_Frequency': 0,
        'Review_Count': 0,
        'Positive_Reviews': 0,
        'Negative_Reviews': 0,
        'Positive_Ratio': 0,
        'Avg_Review_Length': 0,
        'Avg_Weighted_Vote_Score': 0
    })
    
    # Add 'Update' column (1 if there was an update that month, 0 otherwise)
    merged['Update'] = (merged['Total_Changes'] > 0).astype(int)
    
    # Reset index to make 'Month' a column
    merged = merged.reset_index()
    
    # Convert 'Month' back to datetime for consistency
    merged['Month'] = merged['Month'].dt.to_timestamp()
    
    # Filter data for the specified date range
    start_date = pd.Timestamp('2019-01-01')
    end_date = pd.Timestamp('2024-06-30')
    merged = merged[(merged['Month'] >= start_date) & (merged['Month'] <= end_date)]
    
    # Reorder columns
    columns = ['Month', 'Avg Players', 'Peak Players', 'Update', 'Major_Version', 'Minor_Versions', 
               'Total_Changes', 'Cumulative_Changes', 'Days_Since_Last_Update', 'Days_In_Period', 
               'Update_Frequency', 'Review_Count', 'Positive_Reviews', 'Negative_Reviews', 
               'Positive_Ratio', 'Avg_Review_Length', 'Avg_Weighted_Vote_Score']
    merged = merged.reindex(columns=columns)
    
    return merged

# Usage
ark_data = prepare_ark_data(ark_update_data, ark_reviews, ark_players)

# Print the first few rows and last few rows to verify
print("ARK Data:")
print(ark_data.head())
print("\n")
print(ark_data.tail())

# Print a summary of updates with review data
print("\nUpdate Summary with Review Data:")
update_summary = ark_data[ark_data['Update'] == 1][['Month', 'Major_Version', 'Minor_Versions', 
                                                    'Total_Changes', 'Cumulative_Changes', 
                                                    'Days_Since_Last_Update', 'Update_Frequency', 
                                                    'Review_Count', 'Positive_Reviews', 'Negative_Reviews', 
                                                    'Positive_Ratio', 'Avg_Review_Length', 'Avg_Weighted_Vote_Score']]
print(update_summary)

# Print the date range of the filtered data
print("\nDate Range of Filtered Data:")
print(f"Start Date: {ark_data['Month'].min()}")
print(f"End Date: {ark_data['Month'].max()}")

ARK Data:
       Month  Avg Players  Peak Players  Update  Major_Version  \
0 2024-06-01     23631.25         39597       0            NaN   
1 2024-05-01     21754.11         38243       0            NaN   
2 2024-04-01     22072.68         38737       0            NaN   
3 2024-03-01     21597.90         36713       0            NaN   
4 2024-02-01     21550.53         37022       0            NaN   

   Minor_Versions  Total_Changes  Cumulative_Changes  Days_Since_Last_Update  \
0             0.0            0.0                 NaN                     NaN   
1             0.0            0.0                 NaN                     NaN   
2             0.0            0.0                 NaN                     NaN   
3             0.0            0.0                 NaN                     NaN   
4             0.0            0.0                 NaN                     NaN   

   Days_In_Period  Update_Frequency  Review_Count  Positive_Reviews  \
0            30.0               0.0      

In [10]:
def prepare_isle_data(isle_merged, isle_reviews):
    # Prepare player data
    if 'Month' not in isle_merged.columns:
        raise ValueError("'Month' column not found in isle_merged DataFrame")
    
    if not isinstance(isle_merged['Month'].dtype, pd.PeriodDtype):
        isle_merged['Month'] = pd.to_datetime(isle_merged['Month']).dt.to_period('M')
    player_data = isle_merged.set_index('Month')
    
    # Prepare review data
    if 'Month' not in isle_reviews.columns:
        raise ValueError("'Month' column not found in isle_reviews DataFrame")
    
    if not isinstance(isle_reviews['Month'].dtype, pd.PeriodDtype):
        isle_reviews['Month'] = pd.to_datetime(isle_reviews['Month']).dt.to_period('M')
    
    # Aggregate review data by month
    monthly_reviews = isle_reviews.groupby('Month').agg({
        'review_id': 'count',
        'voted_up': 'sum',
        'review_length': 'mean',
        'weighted_vote_score': 'mean'
    }).rename(columns={
        'review_id': 'Review_Count',
        'voted_up': 'Positive_Reviews',
        'review_length': 'Avg_Review_Length',
        'weighted_vote_score': 'Avg_Weighted_Vote_Score'
    })
    
    # Calculate negative reviews and positive ratio
    monthly_reviews['Negative_Reviews'] = monthly_reviews['Review_Count'] - monthly_reviews['Positive_Reviews']
    monthly_reviews['Positive_Ratio'] = (monthly_reviews['Positive_Reviews'] / monthly_reviews['Review_Count'] * 100).round(2)
    
    # Merge player and review data
    merged = player_data.join(monthly_reviews, how='left')
    
    # Fill missing values
    fill_columns = ['Review_Count', 'Positive_Reviews', 'Negative_Reviews', 'Positive_Ratio', 
                    'Avg_Review_Length', 'Avg_Weighted_Vote_Score']
    merged[fill_columns] = merged[fill_columns].fillna(0)
    
    # Create 'Update' column (1 if there was an update that month, 0 otherwise)
    if 'Update Type' not in merged.columns:
        raise ValueError("'Update Type' column not found in merged DataFrame")
    
    merged['Update'] = np.where(merged['Update Type'].notnull(), 1, 0)
    
    # Reset index to make 'Month' a column
    merged = merged.reset_index()
    
    # Convert 'Month' to datetime for consistency
    merged['Month'] = merged['Month'].dt.to_timestamp()
    
    # Filter data for the specified date range (adjust as needed)
    start_date = pd.Timestamp('2019-01-01')
    end_date = pd.Timestamp('2024-06-30')
    merged = merged[(merged['Month'] >= start_date) & (merged['Month'] <= end_date)]
    
    # Reorder columns
    columns = ['Month', 'Avg Players', 'Peak Players', 'Update', 'Title', 'Category', 'Update Type',
               'Review_Count', 'Positive_Reviews', 'Negative_Reviews', 'Positive_Ratio',
               'Avg_Review_Length', 'Avg_Weighted_Vote_Score']
    
    # Check if all required columns are present
    missing_columns = set(columns) - set(merged.columns)
    if missing_columns:
        raise ValueError(f"Missing columns in merged DataFrame: {', '.join(missing_columns)}")
    
    merged = merged.reindex(columns=columns)
    
    return merged

# Usage
isle_data = prepare_isle_data(isle_merged, isle_reviews)

# Print the first few rows and last few rows to verify
print("Isle Data:")
print(isle_data.head())
print("\n")
print(isle_data.tail())

# Print a summary of updates with review data
print("\nUpdate Summary with Review Data:")
update_summary = isle_data[isle_data['Update'] == 1][['Month', 'Title', 'Category', 'Update Type',
                                                      'Review_Count', 'Positive_Reviews', 'Negative_Reviews',
                                                      'Positive_Ratio', 'Avg_Review_Length', 'Avg_Weighted_Vote_Score']]
print(update_summary)

# Print the date range of the filtered data
print("\nDate Range of Filtered Data:")
print(f"Start Date: {isle_data['Month'].min()}")
print(f"End Date: {isle_data['Month'].max()}")

ValueError: 'Month' column not found in isle_reviews DataFrame