Space for intro and the likes


In [1]:
# Install darts library
!pip install darts

Collecting darts
  Downloading darts-0.30.0-py3-none-any.whl.metadata (52 kB)
     ---------------------------------------- 0.0/52.2 kB ? eta -:--:--
     -------------- ----------------------- 20.5/52.2 kB 330.3 kB/s eta 0:00:01
     -------------------------------------- 52.2/52.2 kB 677.4 kB/s eta 0:00:00
Collecting holidays>=0.11.1 (from darts)
  Downloading holidays-0.54-py3-none-any.whl.metadata (23 kB)
Collecting nfoursid>=1.0.0 (from darts)
  Downloading nfoursid-1.0.1-py3-none-any.whl.metadata (1.9 kB)
Collecting pmdarima>=1.8.0 (from darts)
  Downloading pmdarima-2.0.4-cp310-cp310-win_amd64.whl.metadata (8.0 kB)
Collecting pyod>=0.9.5 (from darts)
  Downloading pyod-2.0.1.tar.gz (163 kB)
     ---------------------------------------- 0.0/163.8 kB ? eta -:--:--
     ------------------- ------------------- 81.9/163.8 kB 2.3 MB/s eta 0:00:01
     -------------------------------------- 163.8/163.8 kB 2.5 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadat

In [15]:
import pandas as pd
from pathlib import Path
from darts import TimeSeries
import numpy as np
import pickle

# Define the path to the pickles directory
PICKLES_PATH = Path('pickles')

def load_pickle(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# Load all pickled data
ark_players = load_pickle(PICKLES_PATH / 'ark_players.pkl')
ark_reviews = load_pickle(PICKLES_PATH / 'ark_reviews.pkl')
ark_updates = load_pickle(PICKLES_PATH / 'ark_updates_analysis.pkl')

isle_merged = load_pickle(PICKLES_PATH / 'isle_merged.pkl')
isle_reviews = load_pickle(PICKLES_PATH / 'isle_reviews.pkl')

nms_players = load_pickle(PICKLES_PATH / 'noman_players.pkl')
nms_reviews = load_pickle(PICKLES_PATH / 'nms_reviews.pkl')
nms_updates = load_pickle(PICKLES_PATH / 'nms_updates_analysis.pkl')

In [25]:
#Checking the data that got brought in before merging to one for DARTS
def explore_data(data, name, level=0):
    indent = "  " * level
    print(f"\n{indent}{'='*50}")
    print(f"{indent}{name} Data Exploration")
    print(f"{indent}{'='*50}")
    
    if isinstance(data, pd.DataFrame):
        print(f"{indent}Type: pandas DataFrame")
        print(f"{indent}Shape: {data.shape}")
        print(f"{indent}Columns:")
        for col in data.columns:
            print(f"{indent}- {col}")
        print(f"{indent}Data Types:")
        print(data.dtypes)
        print(f"{indent}First few rows:")
        print(data.head())
        print(f"{indent}Descriptive Statistics:")
        print(data.describe(include='all'))
    
    elif isinstance(data, dict):
        print(f"{indent}Type: Dictionary")
        print(f"{indent}Number of keys: {len(data)}")
        print(f"{indent}Keys:")
        for key, value in data.items():
            print(f"{indent}- {key}")
            if isinstance(value, (pd.DataFrame, dict, list, np.ndarray)):
                explore_data(value, f"{name} - {key}", level+1)
            else:
                print(f"{indent}  Value: {value}")
    
    elif isinstance(data, list):
        print(f"{indent}Type: List")
        print(f"{indent}Length: {len(data)}")
        if len(data) > 0:
            print(f"{indent}First element type: {type(data[0])}")
            if len(data) > 5:
                print(f"{indent}First 5 elements: {data[:5]}")
            else:
                print(f"{indent}All elements: {data}")
    
    elif isinstance(data, np.ndarray):
        print(f"{indent}Type: NumPy Array")
        print(f"{indent}Shape: {data.shape}")
        print(f"{indent}Data Type: {data.dtype}")
        if data.ndim == 1 and len(data) <= 10:
            print(f"{indent}Values: {data}")
        elif data.ndim == 2 and data.shape[0] <= 5 and data.shape[1] <= 5:
            print(f"{indent}Values:\n{data}")
        else:
            print(f"{indent}Array is too large to display fully.")
    
    else:
        print(f"{indent}Type: {type(data)}")
        print(f"{indent}Value: {data}")
    
    print(f"{indent}{'='*50}\n")

# Explore each dataset before merging
print("Exploring datasets before merging:")

print("\nARK Datasets:")
explore_data(ark_players, "ARK Players")
explore_data(ark_reviews, "ARK Reviews")
explore_data(ark_updates, "ARK Updates")

print("\nThe Isle Datasets:")
explore_data(isle_merged, "The Isle Merged")
explore_data(isle_reviews, "The Isle Reviews")

print("\nNo Man's Sky Datasets:")
explore_data(nms_players, "No Man's Sky Players")
explore_data(nms_reviews, "No Man's Sky Reviews")
explore_data(nms_updates, "No Man's Sky Updates")

print("\nExploration of individual datasets complete.")

Exploring datasets before merging:

ARK Datasets:

ARK Players Data Exploration
Type: pandas DataFrame
Shape: (110, 4)
Columns:
- Month
- Avg Players
- Peak Players
- Date
Data Types:
Month                period[M]
Avg Players            float64
Peak Players             int64
Date            datetime64[ns]
dtype: object
First few rows:
     Month  Avg Players  Peak Players       Date
0  2024-06     23631.25         39597 2024-06-01
1  2024-05     21754.11         38243 2024-05-01
2  2024-04     22072.68         38737 2024-04-01
3  2024-03     21597.90         36713 2024-03-01
4  2024-02     21550.53         37022 2024-02-01
Descriptive Statistics:
          Month   Avg Players   Peak Players                           Date
count       110    110.000000     110.000000                            110
unique      110           NaN            NaN                            NaN
top     2024-06           NaN            NaN                            NaN
freq          1           NaN           

In [45]:
#Merge No Man Sky data into one df to be able to convert to DARTS
def prepare_nms_data(players, updates, reviews):
    # Prepare player data
    players['Date'] = pd.to_datetime(players['Date'])
    players['Month'] = players['Date'].dt.to_period('M')
    players = players.set_index('Month')

    # Prepare update data
    updates['Date'] = pd.to_datetime(updates['Date'])
    updates['Month'] = updates['Date'].dt.to_period('M')
    
    # Group updates by month
    monthly_updates = updates.groupby('Month').agg({
        'Category': 'first',
        'Title': lambda x: ' | '.join(x),
        'Update_Group': 'first',
        'Update_Title': 'first',
        'Patch_Count': 'sum'
    })

    # Prepare review data
    reviews['Date'] = pd.to_datetime(reviews['timestamp_created'])
    reviews['Month'] = reviews['Date'].dt.to_period('M')
    
    # Aggregate review data by month
    monthly_reviews = reviews.groupby('Month').agg({
        'review_id': 'count',
        'voted_up': lambda x: x.sum(),
        'review_length': 'mean',
        'weighted_vote_score': 'mean'
    }).rename(columns={
        'review_id': 'Review_Count',
        'voted_up': 'Positive_Reviews',
        'review_length': 'Avg_Review_Length',
        'weighted_vote_score': 'Avg_Weighted_Vote_Score'
    })
    
    # Calculate negative reviews and positive ratio
    monthly_reviews['Negative_Reviews'] = monthly_reviews['Review_Count'] - monthly_reviews['Positive_Reviews']
    monthly_reviews['Positive_Ratio'] = monthly_reviews['Positive_Reviews'] / monthly_reviews['Review_Count'] * 100

    # Merge all data
    merged = players.join(monthly_updates, how='left').join(monthly_reviews, how='left')

    # Fill missing values
    merged['Category'] = merged['Category'].ffill()
    merged['Title'] = merged['Title'].fillna('No Update')
    merged['Update_Group'] = merged['Update_Group'].ffill()
    merged['Update_Title'] = merged['Update_Title'].ffill()
    merged['Patch_Count'] = merged['Patch_Count'].fillna(0)
    merged['Review_Count'] = merged['Review_Count'].fillna(0)
    merged['Positive_Reviews'] = merged['Positive_Reviews'].fillna(0)
    merged['Negative_Reviews'] = merged['Negative_Reviews'].fillna(0)
    merged['Positive_Ratio'] = merged['Positive_Ratio'].fillna(0)
    merged['Avg_Review_Length'] = merged['Avg_Review_Length'].fillna(0)
    merged['Avg_Weighted_Vote_Score'] = merged['Avg_Weighted_Vote_Score'].fillna(0)

    # Add 'Update' column (1 if there was an update that month, 0 otherwise)
    merged['Update'] = (merged['Patch_Count'] > 0).astype(int)

    # Reset index to make 'Month' a column
    merged = merged.reset_index()

    # Convert 'Month' back to datetime for consistency
    merged['Month'] = merged['Month'].dt.to_timestamp()

    # Filter data for the specified date range
    start_date = pd.Timestamp('2019-01-01')
    end_date = pd.Timestamp('2024-06-30')
    merged = merged[(merged['Month'] >= start_date) & (merged['Month'] <= end_date)]

    # Reorder columns
    columns = ['Month', 'Avg Players', 'Peak Players', 'Update', 'Category', 'Title', 
               'Update_Group', 'Update_Title', 'Patch_Count', 'Review_Count', 
               'Positive_Reviews', 'Negative_Reviews', 'Positive_Ratio',
               'Avg_Review_Length', 'Avg_Weighted_Vote_Score']
    merged = merged.reindex(columns=columns)

    return merged

# Usage
nms_data = prepare_nms_data(players, updates, reviews)

# Print the first few rows and last few rows to verify
print("No Man's Sky Data:")
print(nms_data.head())
print("\n")
print(nms_data.tail())

# Print a summary of updates with review data
print("\nUpdate Summary with Review Data:")
update_summary = nms_data[nms_data['Update'] == 1][['Month', 'Title', 'Patch_Count', 'Review_Count', 
                                                    'Positive_Reviews', 'Negative_Reviews', 'Positive_Ratio', 
                                                    'Avg_Review_Length', 'Avg_Weighted_Vote_Score']]
print(update_summary)

# Print the date range of the filtered data
print("\nDate Range of Filtered Data:")
print(f"Start Date: {nms_data['Month'].min()}")
print(f"End Date: {nms_data['Month'].max()}")

NameError: name 'players' is not defined

In [40]:
#Setting variable for the df I want from pickle 
ark_update_data = ark_updates['ark_monthly']

def prepare_ark_data(ark_update_data, ark_reviews, ark_players):
    # Prepare update data
    ark_update_data['Month'] = pd.to_datetime(ark_update_data['Start_Date']).dt.to_period('M')
    
    # Group updates by month
    monthly_updates = ark_update_data.groupby('Month').agg({
        'Major_Version': 'last',
        'Minor_Versions': 'sum',
        'Total_Changes': 'sum',
        'Cumulative_Changes': 'last',
        'Days_Since_Last_Update': 'last',
        'Days_In_Period': 'sum',
        'Update_Frequency': 'mean'
    })

    # Prepare review data
    ark_reviews['Month'] = ark_reviews['Date'].dt.to_period('M')
    
    # Aggregate review data by month
    monthly_reviews = ark_reviews.groupby('Month').agg({
        'review_id': 'count',
        'voted_up': lambda x: x.sum(),
        'review_length': 'mean',
        'weighted_vote_score': 'mean'
    }).rename(columns={
        'review_id': 'Review_Count',
        'voted_up': 'Positive_Reviews',
        'review_length': 'Avg_Review_Length',
        'weighted_vote_score': 'Avg_Weighted_Vote_Score'
    })
    
    # Calculate negative reviews and positive ratio
    monthly_reviews['Negative_Reviews'] = monthly_reviews['Review_Count'] - monthly_reviews['Positive_Reviews']
    monthly_reviews['Positive_Ratio'] = monthly_reviews['Positive_Reviews'] / monthly_reviews['Review_Count'] * 100

    # Prepare player data
    ark_players['Month'] = ark_players['Date'].dt.to_period('M')
    ark_players = ark_players.set_index('Month')

    # Merge all data
    merged = ark_players.join(monthly_updates, how='left').join(monthly_reviews, how='left')
    
    # Fill missing values
    merged['Major_Version'] = merged['Major_Version'].ffill()
    merged['Minor_Versions'] = merged['Minor_Versions'].fillna(0)
    merged['Total_Changes'] = merged['Total_Changes'].fillna(0)
    merged['Cumulative_Changes'] = merged['Cumulative_Changes'].ffill()
    merged['Days_Since_Last_Update'] = merged['Days_Since_Last_Update'].ffill()
    merged['Days_In_Period'] = merged['Days_In_Period'].fillna(30)  # Assume 30 days if missing
    merged['Update_Frequency'] = merged['Update_Frequency'].fillna(0)
    merged['Review_Count'] = merged['Review_Count'].fillna(0)
    merged['Positive_Reviews'] = merged['Positive_Reviews'].fillna(0)
    merged['Negative_Reviews'] = merged['Negative_Reviews'].fillna(0)
    merged['Positive_Ratio'] = merged['Positive_Ratio'].fillna(0)
    merged['Avg_Review_Length'] = merged['Avg_Review_Length'].fillna(0)
    merged['Avg_Weighted_Vote_Score'] = merged['Avg_Weighted_Vote_Score'].fillna(0)
    
    # Add 'Update' column (1 if there was an update that month, 0 otherwise)
    merged['Update'] = (merged['Total_Changes'] > 0).astype(int)
    
    # Reset index to make 'Month' a column
    merged = merged.reset_index()
    
    # Convert 'Month' back to datetime for consistency
    merged['Month'] = merged['Month'].dt.to_timestamp()
    
    # Filter data for the specified date range
    start_date = pd.Timestamp('2019-01-01')
    end_date = pd.Timestamp('2024-06-30')
    merged = merged[(merged['Month'] >= start_date) & (merged['Month'] <= end_date)]
    
    # Reorder columns
    columns = ['Month', 'Avg Players', 'Peak Players', 'Update', 'Major_Version', 'Minor_Versions', 
               'Total_Changes', 'Cumulative_Changes', 'Days_Since_Last_Update', 'Days_In_Period', 
               'Update_Frequency', 'Review_Count', 'Positive_Reviews', 'Negative_Reviews', 
               'Positive_Ratio', 'Avg_Review_Length', 'Avg_Weighted_Vote_Score']
    merged = merged.reindex(columns=columns)
    
    return merged

# Usage
ark_data = prepare_ark_data(ark_update_data, ark_reviews, ark_players)

# Print the first few rows and last few rows to verify
print("ARK Data:")
print(ark_data.head())
print("\n")
print(ark_data.tail())

# Print a summary of updates with review data
print("\nUpdate Summary with Review Data:")
update_summary = ark_data[ark_data['Update'] == 1][['Month', 'Major_Version', 'Minor_Versions', 
                                                    'Total_Changes', 'Cumulative_Changes', 
                                                    'Days_Since_Last_Update', 'Update_Frequency', 
                                                    'Review_Count', 'Positive_Reviews', 'Negative_Reviews', 
                                                    'Positive_Ratio', 'Avg_Review_Length', 'Avg_Weighted_Vote_Score']]
print(update_summary)

# Print the date range of the filtered data
print("\nDate Range of Filtered Data:")
print(f"Start Date: {ark_data['Month'].min()}")
print(f"End Date: {ark_data['Month'].max()}")

ARK Data:
       Month  Avg Players  Peak Players  Update  Major_Version  \
0 2024-06-01     23631.25         39597       0            NaN   
1 2024-05-01     21754.11         38243       0            NaN   
2 2024-04-01     22072.68         38737       0            NaN   
3 2024-03-01     21597.90         36713       0            NaN   
4 2024-02-01     21550.53         37022       0            NaN   

   Minor_Versions  Total_Changes  Cumulative_Changes  Days_Since_Last_Update  \
0             0.0            0.0                 NaN                     NaN   
1             0.0            0.0                 NaN                     NaN   
2             0.0            0.0                 NaN                     NaN   
3             0.0            0.0                 NaN                     NaN   
4             0.0            0.0                 NaN                     NaN   

   Days_In_Period  Update_Frequency  Review_Count  Positive_Reviews  \
0            30.0               0.0      

In [44]:
def prepare_isle_data(isle_merged, isle_reviews):
    # Prepare player data
    if not isinstance(isle_merged['Month'].dtype, pd.PeriodDtype):
        isle_merged['Month'] = pd.to_datetime(isle_merged['Month']).dt.to_period('M')
    player_data = isle_merged.set_index('Month')

    # Prepare review data
    isle_reviews['Month'] = pd.to_datetime(isle_reviews['Date']).dt.to_period('M')
    
    # Aggregate review data by month
    monthly_reviews = isle_reviews.groupby('Month').agg({
        'review_id': 'count',
        'voted_up': lambda x: x.sum(),
        'review_length': 'mean',
        'weighted_vote_score': 'mean'
    }).rename(columns={
        'review_id': 'Review_Count',
        'voted_up': 'Positive_Reviews',
        'review_length': 'Avg_Review_Length',
        'weighted_vote_score': 'Avg_Weighted_Vote_Score'
    })
    
    # Calculate negative reviews and positive ratio
    monthly_reviews['Negative_Reviews'] = monthly_reviews['Review_Count'] - monthly_reviews['Positive_Reviews']
    monthly_reviews['Positive_Ratio'] = monthly_reviews['Positive_Reviews'] / monthly_reviews['Review_Count'] * 100

    # Merge player and review data
    merged = player_data.join(monthly_reviews, how='left')
    
    # Fill missing values
    merged['Review_Count'] = merged['Review_Count'].fillna(0)
    merged['Positive_Reviews'] = merged['Positive_Reviews'].fillna(0)
    merged['Negative_Reviews'] = merged['Negative_Reviews'].fillna(0)
    merged['Positive_Ratio'] = merged['Positive_Ratio'].fillna(0)
    merged['Avg_Review_Length'] = merged['Avg_Review_Length'].fillna(0)
    merged['Avg_Weighted_Vote_Score'] = merged['Avg_Weighted_Vote_Score'].fillna(0)
    
    # Create 'Update' column (1 if there was an update that month, 0 otherwise)
    merged['Update'] = np.where(merged['Update Type'].notnull(), 1, 0)
    
    # Reset index to make 'Month' a column
    merged = merged.reset_index()
    
    # Convert 'Month' to datetime for consistency
    merged['Month'] = merged['Month'].dt.to_timestamp()
    
    # Filter data for the specified date range (adjust as needed)
    start_date = pd.Timestamp('2019-01-01')
    end_date = pd.Timestamp('2024-06-30')
    merged = merged[(merged['Month'] >= start_date) & (merged['Month'] <= end_date)]
    
    # Reorder columns
    columns = ['Month', 'Avg Players', 'Peak Players', 'Update', 'Title', 'Category', 'Update Type',
               'Review_Count', 'Positive_Reviews', 'Negative_Reviews', 'Positive_Ratio',
               'Avg_Review_Length', 'Avg_Weighted_Vote_Score']
    merged = merged.reindex(columns=columns)
    
    return merged

# Usage
isle_data = prepare_isle_data(isle_merged, isle_reviews)

# Print the first few rows and last few rows to verify
print("Isle Data:")
print(isle_data.head())
print("\n")
print(isle_data.tail())

# Print a summary of updates with review data
print("\nUpdate Summary with Review Data:")
update_summary = isle_data[isle_data['Update'] == 1][['Month', 'Title', 'Category', 'Update Type',
                                                      'Review_Count', 'Positive_Reviews', 'Negative_Reviews',
                                                      'Positive_Ratio', 'Avg_Review_Length', 'Avg_Weighted_Vote_Score']]
print(update_summary)

# Print the date range of the filtered data
print("\nDate Range of Filtered Data:")
print(f"Start Date: {isle_data['Month'].min()}")
print(f"End Date: {isle_data['Month'].max()}")

Isle Data:
       Month  Avg Players  Peak Players  Update  \
0 2024-06-01      4673.34          7292       1   
1 2024-06-01      4673.34          7292       1   
2 2024-05-01      4363.91          7366       1   
3 2024-05-01      4363.91          7366       1   
4 2024-03-01      4138.62          6527       1   

                             Title Category Update Type  Review_Count  \
0                      DevBlog #49  DevBlog       Other         610.0   
1                      DevBlog #48  DevBlog       Other         610.0   
2  Patch 0.14.28.61 now available!    Patch       Patch         558.0   
3                      DevBlog #47  DevBlog       Other         558.0   
4                      DevBlog #46  DevBlog       Other         848.0   

   Positive_Reviews  Negative_Reviews  Positive_Ratio  Avg_Review_Length  \
0             425.0             185.0       69.672131         199.354785   
1             425.0             185.0       69.672131         199.354785   
2             3