# Project FriendlyFire

## Data Acquisition

### Script to Scrape Steam Data

This script scrapes data from the Steam platform using the `Steam Web API`. It performs the following tasks:

1. **Initial Setup**: Imports necessary libraries and sets up initial parameters such as API key, directories for saving data, and files for logging errors and storing results.
2. **Function Definitions**: Defines various functions to handle API calls, log errors, load existing IDs, append to files, calculate friendship duration, and process player data.
3. **Gather Steam IDs**: Collects Steam IDs up to a specified depth by traversing the friends' network starting from a given Steam ID.
4. **Process Players in Batches**: Processes the collected Steam IDs in batches using multithreading to retrieve player summaries, owned games, and friends' data.
5. **Save Results**: Saves the collected data in JSON format and logs any errors encountered during the process.

In [None]:
import time
from steam.webapi import WebAPI
import os
import json
import csv
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from collections import deque

#### INITIAL PARAMETERS ####

# Steam Web API key; must be filled in before running (left blank here).
API_KEY = ""
# Shared API client used by every function below.
api = WebAPI(key=API_KEY)
# All outputs and bookkeeping files live under this directory, which is
# created on first run.
data_dir = "Steam_Scraped_Data"
os.makedirs(data_dir, exist_ok=True)

# Error log written by log_error() and the result files.
# NOTE(review): output_csv is not referenced by any code visible in this
# script -- confirm whether it is still needed.
error_log_file = os.path.join(data_dir, "error_log.txt")
output_csv = os.path.join(data_dir, "all_steam_data.csv")
output_json = os.path.join(data_dir, "all_steam_data.json")

# Bookkeeping files with one Steam ID per line; they are loaded at start-up
# so that IDs recorded by earlier runs are not processed again.
successful_file = os.path.join(data_dir, "successful_ids.txt")
error_401_file = os.path.join(data_dir, "error_401_ids.txt")
retry_file = os.path.join(data_dir, "retry_ids.txt")

# Root of the friend-network traversal and how many hops to follow from it.
starting_steam_id = "76561197979408421"  # Kongzoola
max_depth = 3
# Worker threads used by process_players_in_batches().
max_threads = 6
# NOTE(review): this module-level batch_size is not read by any function
# visible here (get_player_summaries_in_batches uses its own local value).
batch_size = 10
# Call counter and quota window maintained by make_api_call().
api_call_count = 0
api_limit = 100000  # Daily API call limit
reset_time = datetime.now() + timedelta(days=1)

#### FUNCTIONS ####

def log_error(message):
    """Append *message*, prefixed with an ISO-8601 timestamp, to the error log."""
    entry = f"{datetime.now().isoformat()} - {message}\n"
    with open(error_log_file, 'a') as log_handle:
        log_handle.write(entry)

def load_existing_ids(file_path):
    """Load Steam IDs recorded by an earlier run.

    Args:
        file_path: Path of a text file holding one Steam ID per line.

    Returns:
        A set with the whitespace-stripped, non-empty lines of the file, or
        an empty set when the file does not exist.
    """
    if not os.path.exists(file_path):
        return set()
    with open(file_path, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        # Blank lines (e.g. a stray trailing newline) must not turn into an
        # empty-string "ID" in the result.
        return {steam_id for steam_id in stripped_lines if steam_id}

def append_to_file(file_path, steam_id):
    """Append *steam_id* as a new line at the end of the file at *file_path*."""
    record = f"{steam_id}\n"
    with open(file_path, mode='a') as handle:
        handle.write(record)

def make_api_call(func, max_retries=3, delay=5, **kwargs):
    """Invoke an API method with quota tracking and back-off on HTTP 429.

    Args:
        func: Callable API method; it is invoked as ``func(**kwargs)``.
        max_retries: Maximum number of attempts made while the call keeps
            failing with a rate-limit error.
        delay: Base back-off in seconds; it doubles after every
            rate-limited attempt.
        **kwargs: Forwarded unchanged to ``func``.

    Returns:
        Whatever ``func`` returns, or ``None`` when the call failed.
        Exceptions raised by ``func`` are written to the error log and are
        never propagated to the caller.
    """
    # NOTE(review): this counter is updated without synchronisation although
    # the function is reached from ThreadPoolExecutor workers; the count is
    # therefore approximate.
    global api_call_count, reset_time
    api_call_count += 1

    if api_call_count > api_limit:
        seconds_left = (reset_time - datetime.now()).total_seconds()
        if seconds_left > 0:
            print("API limit reached. Pausing until reset...")
            time.sleep(seconds_left)
        # Open a fresh quota window in both cases. Leaving a stale
        # reset_time behind after the pause would let the next overflow
        # through without any waiting.
        api_call_count = 0
        reset_time = datetime.now() + timedelta(days=1)

    for attempt in range(max_retries):
        try:
            return func(**kwargs)
        except Exception as e:
            # Only rate-limit errors (detected by "429" in the message) are
            # retried. Anything else is logged once and reported as a
            # failure right away, without the misleading
            # "Max retries exceeded" entry.
            if "429" not in str(e):
                log_error(f"API call failed: {e}")
                return None
            time.sleep(delay * (2 ** attempt))

    log_error("Max retries exceeded for API call.")
    return None

def calculate_friendship_duration(friend_since):
    """Describe a friendship that started at the given Unix timestamp.

    Args:
        friend_since: Seconds since the epoch at which the friendship began.

    Returns:
        A dict with ``friend_since`` (local date formatted ``YYYY-MM-DD``)
        and ``friendship_duration_days`` (whole days elapsed until now).
    """
    started_on = datetime.fromtimestamp(friend_since)
    elapsed = datetime.now() - started_on
    return dict(
        friend_since=started_on.strftime("%Y-%m-%d"),
        friendship_duration_days=elapsed.days,
    )

def get_player_summaries_in_batches(steam_ids):
    """Fetch player summaries for *steam_ids*, requesting up to 100 IDs per call.

    Args:
        steam_ids: Sequence of Steam ID strings.

    Returns:
        A list with the ``players`` entries of every response that came
        back. Batches whose request failed contribute nothing.
    """
    ids_per_request = 100
    collected = []
    offsets = range(0, len(steam_ids), ids_per_request)
    for batch_number, offset in enumerate(offsets, start=1):
        id_chunk = steam_ids[offset:offset + ids_per_request]
        try:
            response = make_api_call(
                api.ISteamUser.GetPlayerSummaries,
                steamids=",".join(id_chunk),
            )
            if not response:
                continue
            players = response.get("response", {}).get("players", [])
            collected.extend(players)
        except Exception as e:
            log_error(f"Error in player summary batch {batch_number}: {e}")
    return collected

def collect_player_data(steam_id):
    """Collect the profile summary, owned games and friend list of one player.

    Args:
        steam_id: Steam ID string of the player to fetch.

    Returns:
        ``None`` when the ID is already in ``successful_ids``; otherwise a
        dict ``{"steam_id": ..., "data": {...}}`` whose ``data`` part holds
        whichever of ``player_info``, ``owned_games`` and ``friends`` could
        be retrieved. The dict is returned even when an error interrupted
        the collection.

    Side effects:
        Appends the ID to the successful, 401 or retry file (and to the
        matching in-memory set) and logs any exception.
    """
    global successful_ids, error_401_ids, retry_ids

    # Skip already successful IDs
    if steam_id in successful_ids:
        return None

    collected = {}
    player_data = {"steam_id": steam_id, "data": collected}

    # NOTE(review): make_api_call logs and swallows API exceptions, so the
    # except branch below is only reached by errors raised in this
    # function's own code (for example an unexpected response shape). An ID
    # is therefore recorded as successful even when every request failed.
    try:
        summaries = get_player_summaries_in_batches([steam_id])
        if summaries:
            collected["player_info"] = summaries[0]

        games_response = make_api_call(
            api.IPlayerService.GetOwnedGames,
            steamid=steam_id,
            include_appinfo=True,
            include_played_free_games=True,
            appids_filter=[],
        )
        if games_response:
            collected["owned_games"] = games_response

        friends_response = make_api_call(api.ISteamUser.GetFriendList, steamid=steam_id)
        if friends_response and 'friendslist' in friends_response:
            friend_records = []
            for entry in friends_response['friendslist']['friends']:
                friend_id = entry['steamid']
                since = entry.get('friend_since')
                # Friends without a start timestamp are left out.
                if not since:
                    continue
                record = {"friend_id": friend_id}
                record.update(calculate_friendship_duration(since))
                friend_records.append(record)
            collected["friends"] = friend_records

        append_to_file(successful_file, steam_id)
        successful_ids.add(steam_id)
    except Exception as e:
        if "401" in str(e):
            target_file, target_ids = error_401_file, error_401_ids
        else:
            target_file, target_ids = retry_file, retry_ids
        append_to_file(target_file, steam_id)
        target_ids.add(steam_id)
        log_error(f"Error processing Steam ID {steam_id}: {e}")

    return player_data

def gather_steam_ids(steam_id, max_depth):
    """Collect Steam IDs reachable through friend lists, breadth first.

    Args:
        steam_id: Steam ID string the traversal starts from.
        max_depth: Number of friend-list hops to follow. IDs found at this
            depth are returned, but their own friend lists are not fetched.

    Returns:
        The set of every Steam ID discovered, including *steam_id* itself.
    """
    seen = {steam_id}
    frontier = deque()
    frontier.append((steam_id, 0))  # entries are (steam_id, depth)

    while frontier:
        current_id, depth = frontier.popleft()
        if depth < max_depth:
            try:
                response = make_api_call(api.ISteamUser.GetFriendList, steamid=current_id)
                if not response or 'friendslist' not in response:
                    continue
                next_depth = depth + 1
                for entry in response['friendslist']['friends']:
                    friend_id = entry['steamid']
                    if friend_id in seen:
                        continue
                    seen.add(friend_id)
                    frontier.append((friend_id, next_depth))
            except Exception as e:
                log_error(f"Error retrieving friends for {current_id}: {e}")

    return seen

def process_players_in_batches(steam_ids):
    """Fetch data for every ID on a thread pool and save the results as JSON.

    Despite the name, all IDs are submitted to the executor at once; at
    most ``max_threads`` of them are processed concurrently.

    Args:
        steam_ids: Sized collection of Steam ID strings to process.

    Side effects:
        Writes ``{"players": [...]}`` to ``output_json``. Players stored by
        an earlier run are kept: IDs that already succeeded are skipped on
        later runs, so overwriting the file with only the new results would
        permanently lose their data.
    """
    player_batch = []
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        futures = {executor.submit(collect_player_data, steam_id): steam_id for steam_id in steam_ids}

        with tqdm(total=len(steam_ids), desc="Processing players", unit="player", dynamic_ncols=True) as pbar:
            for future in as_completed(futures):
                steam_id = futures[future]
                try:
                    result = future.result()
                except Exception as e:
                    log_error(f"Error processing Steam ID {steam_id}: {e}")
                else:
                    # collect_player_data returns None for skipped IDs.
                    if result:
                        player_batch.append(result)
                pbar.update(1)

    # Load what an earlier run stored, if anything. An unreadable file is
    # logged and then replaced by the fresh results.
    previous_players = []
    if os.path.exists(output_json):
        try:
            with open(output_json, 'r') as f:
                loaded = json.load(f).get("players", [])
            if isinstance(loaded, list):
                previous_players = loaded
        except (OSError, ValueError, AttributeError) as e:
            log_error(f"Could not read existing results from {output_json}: {e}")

    # A freshly fetched entry supersedes a stored entry for the same player.
    fresh_ids = {player["steam_id"] for player in player_batch}
    merged_players = [
        player for player in previous_players
        if not (isinstance(player, dict) and player.get("steam_id") in fresh_ids)
    ]
    merged_players.extend(player_batch)

    with open(output_json, 'w') as f:
        json.dump({"players": merged_players}, f, indent=4)

#### MAIN SCRIPT ####

# Restore the bookkeeping of earlier runs so finished IDs are not fetched again.
successful_ids = load_existing_ids(successful_file)
error_401_ids = load_existing_ids(error_401_file)
retry_ids = load_existing_ids(retry_file)

# Walk the friend network from the starting ID, then keep only the IDs that
# appear in none of the bookkeeping sets.
# NOTE(review): IDs listed in retry_ids are excluded here as well, so they
# are never retried by this script -- confirm whether that is intended.
all_steam_ids = gather_steam_ids(starting_steam_id, max_depth)
new_steam_ids = list(all_steam_ids - successful_ids - error_401_ids - retry_ids)

# Fetch the data for the remaining IDs and write the JSON output.
process_players_in_batches(new_steam_ids)

### Steam Data Processing Script in R

#### Extracting Relevant Data from BYU Steam Dataset

This script processes Steam data from the acquired BYU dataset using R. It performs the following tasks:

1. **Load Necessary Libraries**: Loads required libraries such as `data.table` and `tcltk`.
2. **File Path Selection**: Provides functions to select input and output file paths using a graphical interface.
3. **Define Variables**: Sets definable variables such as chunk size and output filename.
4. **Verify File Paths**: Checks if the input file exists and verifies the selected columns.
5. **Process File in Chunks**: Reads the input file in chunks to save memory, processes each chunk, and aggregates the game genre data into a single comma-separated "Genres" column.
6. **Write Output File**: Writes the processed data to an output file in TSV format.

In [None]:
# Load necessary libraries
library(data.table)
library(tcltk)

# Ask the user for a path through a Tk dialog.
#
# Arguments:
#   type  "input"  -> choose a single existing CSV file
#         "output" -> choose an output directory
#
# Returns the selected path as a character string. Stops with an error when
# the dialog is cancelled or when `type` is neither "input" nor "output".
get_file_path <- function(type = "input") {
  if (type == "input") {
    # multi = FALSE keeps the selection to a single file; with several files
    # the emptiness check below would receive a vector instead of one value.
    input_file <- tk_choose.files(
      caption = "Select an input file",
      multi = FALSE,
      filters = matrix(c("CSV Files", "*.csv"), ncol = 2)
    )
    if (length(input_file) == 0 || !nzchar(input_file[1])) {
      stop("No input file selected. Exiting...")
    }
    return(input_file)
  } else if (type == "output") {
    output_dir <- tclvalue(tkchooseDirectory())
    if (output_dir == "") {
      stop("No output directory selected. Exiting...")
    }
    return(output_dir)
  }
  # Fail loudly instead of silently returning NULL for a mistyped argument.
  stop("Unknown type '", type, "': expected \"input\" or \"output\".")
}

# Definable variables
# chunk_size is used below as the `nrows` argument of every fread() call.
chunk_size <- 1e6  # Number of rows to read at a time
output_filename <- "steam_game_subset_2.tsv"

# Input/Output Variables
# Both paths are chosen interactively; the output file is created inside the
# selected directory under output_filename.
input_file <- get_file_path("input")
output_dir <- get_file_path("output")
output_file <- file.path(output_dir, output_filename)

# Print the resolved paths so they can be checked before processing starts
cat("Input file path: ", input_file, "\n")
cat("Output file path: ", output_file, "\n")

# Check if the input file exists; abort before any reading is attempted
if (!file.exists(input_file)) {
  stop("Input file does not exist: ", input_file)
}

# Columns copied unchanged into the output (names without quotation marks)
columns_to_keep <- c(
  "steamid", "personaname", "appid", "Title", "user_loccountrycode",
  "number_of_friends", "number_of_groups"
)

# Genre indicator columns (1 = the game has the genre, 0 = it does not);
# they are collapsed into the single "Genres" output column
genre_columns <- c(
  "Game_Genre_Action", "Game_Genre_Free_to_Play", "Game_Genre_Strategy", "Game_Genre_Adventure",
  "Game_Genre_Indie", "Game_Genre_RPG", "Game_Genre_Animation_Modeling",
  "Game_Genre_Video_Production", "Game_Genre_Casual", "Game_Genre_Simulation",
  "Game_Genre_Racing", "Game_Genre_Massively_Multiplayer", "Game_Genre_Sports",
  "Game_Genre_Early_Access", "Game_Genre_Photo_Editing", "Game_Genre_Utilities",
  "Game_Genre_Design_Illustration", "Game_Genre_Education",
  "Game_Genre_Software_Training", "Game_Genre_Web_Publishing",
  "Game_Genre_Audio_Production", "Game_Genre_Accounting"
)

# Read only the header row (nrows = 0) to learn the column names
header <- fread(input_file, sep = ";", nrows = 0)
col_names <- gsub('"', "", names(header))  # Remove any quotation marks
setnames(header, col_names)  # Normalize column names

# Verify that every column needed later exists. The genre columns are
# checked too, so a missing one is reported here with a clear message
# instead of failing inside the chunk loop.
required_columns <- c(columns_to_keep, genre_columns)
missing_columns <- setdiff(required_columns, col_names)
if (length(missing_columns) > 0) {
  stop("The following columns are missing in the input file: ", paste(missing_columns, collapse = ", "))
}

# Define the output columns
output_columns <- c(columns_to_keep, "Genres")

# Process the file in chunks
chunk_start <- 1   # Lines to skip before the first chunk (the header line)
chunk_count <- 0

repeat {
  # Read the next chunk. Once `skip` has moved past the last line, fread()
  # signals an error instead of returning zero rows (the message is expected
  # to start with "skip=" -- TODO confirm for the installed data.table
  # version). After at least one chunk has been written that error marks the
  # end of the input; every other error is re-raised unchanged.
  chunk <- tryCatch(
    fread(
      input = input_file,
      sep = ";",
      header = FALSE,
      skip = chunk_start,        # Skip rows already processed
      nrows = chunk_size,        # Read up to chunk_size rows
      col.names = col_names      # Use consistent column names
    ),
    error = function(e) {
      if (chunk_count > 0 && grepl("skip=", conditionMessage(e), fixed = TRUE)) {
        return(NULL)
      }
      stop(e)
    }
  )

  # Stop when nothing is left to read
  if (is.null(chunk) || nrow(chunk) == 0) break
  rows_read <- nrow(chunk)

  # Build the "Genres" column from the genre indicators that are set to 1
  chunk[, Genres := apply(chunk[, ..genre_columns], 1, function(row) {
    # which() ignores NA indicators, so they cannot end up as "NA" genres
    genres <- names(row)[which(row == 1)]
    if (length(genres) > 0) {
      # Remove "Game_Genre_" from each genre name
      cleaned_genres <- gsub("Game_Genre_", "", genres)
      return(paste(cleaned_genres, collapse = ", "))
    } else {
      return(NA_character_)  # No genres selected
    }
  })]

  # Keep only the output columns, in output order
  chunk <- chunk[, c(columns_to_keep, "Genres"), with = FALSE]

  # The first chunk creates the file with a header; later chunks append
  if (chunk_count == 0) {
    fwrite(chunk, file = output_file, sep = "\t", col.names = TRUE)
  } else {
    fwrite(chunk, file = output_file, append = TRUE, sep = "\t", col.names = FALSE)
  }

  # Update the start of the next chunk
  chunk_start <- chunk_start + chunk_size
  chunk_count <- chunk_count + 1
  print(chunk_start)

  # A short chunk means the end of the file was reached; stopping here
  # avoids one more read that would start past the last line.
  if (rows_read < chunk_size) break
}

#### Extracting SteamID, User Location and Number of Friends

This script processes the generated TSV datafile.

1. **Load Necessary Libraries**: Loads `data.table`.
2. **Define Variables**: Sets definable variables for input and output filenames, columns to extract, and filter parameters.
3. **Function to Process File in Chunks**: Reads the input file in chunks, processes each chunk, and removes duplicates.
4. **Write Output File**: Writes the selected and deduplicated data to the output file in TSV format.

In [None]:
library(data.table)

### VARIABLES ###
# NOTE(review): absolute, machine-specific path. The file name also differs
# from the "steam_game_subset_2.tsv" default of the extraction script above
# -- confirm that this is the intended input.
data_input_file <- "E:/OneDrive - University of Utah/U of U/Graduate School/COMP5690/Project/Steam_data_sources/Steam_Games_BYU/steam_game_subset.tsv"
# Relative path, so the output is written to the current working directory.
data_output_file <- "steamid_loc_friends.tsv"
# Columns copied to the output; all of them must be present in the input header.
selected_columns <- c("steamid", "personaname", "user_loccountrycode", "number_of_friends")
unique_id_column <- "steamid"  # Column to identify unique entries
# Number of lines read per batch; passed to process_large_tsv() below.
batch_size <- 1e6

### FUNCTIONS ###
# Stream a large tab-separated file and write one row per unique ID.
#
# Arguments:
#   input_file        path of the TSV file to read (first line = header)
#   output_file       path of the TSV file to create
#   selected_columns  names of the columns to copy; written in header order
#   id_column         column that identifies an entry; must be one of
#                     selected_columns
#   batch_size        number of lines read per batch
#
# Rows are dropped when their field count differs from the header, when any
# selected value is empty or NA, or when their ID has already been written.
# Fields are split on tabs only; quoting is not interpreted.
# Returns nothing useful; the result is the file written to output_file.
process_large_tsv <- function(input_file, output_file, selected_columns, id_column, batch_size = 10000) {
  # Open the input file; on.exit closes it even when a check below stops
  con <- file(input_file, open = "r")
  on.exit(close(con), add = TRUE)

  # Read and validate the header
  header_line <- readLines(con, n = 1)
  if (length(header_line) == 0) {
    stop("Error: Input file is empty: ", input_file)
  }
  header <- strsplit(header_line, "\t", fixed = TRUE)[[1]]

  if (!all(selected_columns %in% header)) {
    stop("Error: Some selected columns not found in the header. Missing columns: ",
         paste(setdiff(selected_columns, header), collapse = ", "))
  }

  if (!(id_column %in% header)) {
    stop("Error: The id_column '", id_column, "' is not found in the header.")
  }

  # De-duplication works on the selected columns, so the ID must be among them
  if (!(id_column %in% selected_columns)) {
    stop("Error: The id_column '", id_column, "' must be one of the selected columns.")
  }

  col_indices <- which(header %in% selected_columns)
  n_fields <- length(header)

  # Write the header to the output file
  fwrite(
    data.table(t(header[col_indices])),
    file = output_file,
    sep = "\t",
    col.names = FALSE,
    quote = FALSE
  )

  # IDs already written, kept as a plain vector so the lookup below always
  # sees them regardless of column naming
  seen_ids <- character(0)

  # Process the file in batches
  while (length(lines <- readLines(con, n = batch_size)) > 0) {
    cat("Processing batch of size:", length(lines), "\n")

    # strsplit() drops a trailing empty field; appending one separator to
    # every line keeps an empty last column as "" so field counts are exact
    fields <- strsplit(paste0(lines, "\t"), "\t", fixed = TRUE)

    # Drop rows whose field count differs from the header
    well_formed <- lengths(fields) == n_fields
    if (!all(well_formed)) {
      cat("Warning: Row length mismatch detected. Skipping", sum(!well_formed), "malformed rows.\n")
      fields <- fields[well_formed]
    }
    if (length(fields) == 0) {
      cat("No new unique entries found in this batch.\n")
      next
    }

    batch_dt <- as.data.table(matrix(unlist(fields), ncol = n_fields, byrow = TRUE))
    setnames(batch_dt, header)

    # Select relevant columns
    batch_dt <- batch_dt[, ..col_indices]

    # Remove rows with empty or NA values in any selected column
    complete <- Reduce(`&`, lapply(batch_dt, function(col) !is.na(col) & col != ""))
    batch_dt <- batch_dt[complete]

    # Keep the first row per ID within the batch, then drop IDs written by
    # earlier batches
    batch_dt <- unique(batch_dt, by = id_column)
    batch_dt <- batch_dt[!(batch_dt[[id_column]] %in% seen_ids)]

    if (nrow(batch_dt) > 0) {
      seen_ids <- c(seen_ids, batch_dt[[id_column]])

      # Write the processed batch to the output file
      fwrite(batch_dt, output_file, sep = "\t", append = TRUE, col.names = FALSE)
      cat("Wrote", nrow(batch_dt), "unique rows to the output file.\n")
    } else {
      cat("No new unique entries found in this batch.\n")
    }
  }

  cat("Processing complete. Output saved to:", output_file, "\n")
}

### MAIN ###
# Run the extraction with the settings defined in the VARIABLES section.
process_large_tsv(data_input_file, data_output_file, selected_columns, unique_id_column, batch_size)