# Eliminating Invalid Inputs/Outliers + Making Dataset

In [None]:
# Dependencies
!pip3 install pandas
!pip3 install numpy
!pip3 install matplotlib

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json as json
import typing
import tqdm

In [6]:
import requests
from dotenv import load_dotenv
import os

# Let python-dotenv populate the process environment, then read the two
# credentials used for the Twitch API requests below. Each value is None
# when the corresponding variable is not set.
load_dotenv()
OAUTH, CLIENT = (os.environ.get(key) for key in ('OAUTH', 'CLIENT'))

def get_game_from_id(id: str) -> list[str]:
    """
    Look up a Twitch channel and return its game and broadcaster name.

    Sends one request to the Helix ``channels`` endpoint, authenticated with
    the module-level ``OAUTH`` token and ``CLIENT`` id.

    Args:
        id: Broadcaster (streamer) id, as a string. The name shadows the
            builtin ``id`` but is kept because it is part of the signature.

    Returns:
        ``[game_id, streamer_name, game_name]``. Note the order: the
        broadcaster name sits between the two game fields.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        KeyError, IndexError: If the response body has no ``data`` entry
            for this id.
        TypeError: If ``OAUTH`` is unset (``None``).
    """
    response = requests.get(
        "https://api.twitch.tv/helix/channels",
        # Let requests build (and URL-encode) the query string.
        params={'broadcaster_id': id},
        headers={
            'Authorization': 'Bearer ' + OAUTH,
            'Client-Id': CLIENT,
        },
        # Without a timeout a single stalled connection blocks the caller forever.
        timeout=10,
    )
    # Fail loudly on 4xx/5xx instead of with an unrelated KeyError further down.
    response.raise_for_status()

    # Decode the body once and reuse the first (only) channel record.
    channel = response.json()['data'][0]
    return [channel['game_id'], channel['broadcaster_name'], channel['game_name']]

In [None]:
# Let us make a list of all streamer ids by using target.csv
target = pd.read_csv('musae_ENGB_target.csv')
streamer_ids = target['id'].tolist()

# One record per streamer whose lookup succeeded. Records are gathered in a plain
# list and converted to a DataFrame once at the end: DataFrame.append was removed
# in pandas 2.0, and it copied the entire frame on every call.
rows = []
failed_ids = []

# loop through all streamer ids and get game id and game name, using tqdm to show progress
for streamer_id in tqdm.tqdm(streamer_ids):
    streamer_id = str(streamer_id)
    try:
        game_id, streamer_name, game_name = get_game_from_id(streamer_id)
    except (requests.RequestException, KeyError, IndexError, ValueError):
        # Best effort: skip streamers that cannot be resolved, but keep track of
        # them. A bare `except:` would also trap KeyboardInterrupt, making the
        # loop impossible to stop from the notebook.
        failed_ids.append(streamer_id)
        continue
    rows.append({
        'streamer_id': streamer_id,
        'streamer_name': streamer_name,
        'game_id': game_id,
        'game_name': game_name,
    })

# Explicit columns keep the header and column order stable even when no lookup succeeded.
df = pd.DataFrame(rows, columns=['streamer_id', 'streamer_name', 'game_id', 'game_name'])
print(f"resolved {len(rows)} of {len(streamer_ids)} streamers; {len(failed_ids)} lookups failed")

# save df to csv
df.to_csv('streamer_features.csv', index=False)

In [7]:
# Reload the scraped features and discard every streamer whose game_id is
# missing, then write the cleaned table back to the same file.
df = pd.read_csv('streamer_features.csv').dropna(subset=['game_id'])
df.to_csv('streamer_features.csv', index=False)

In [10]:
# Count the missing values in each column (they are removed in the next cell)
df.isna().sum()

streamer_id        0
streamer_name      0
game_id            0
game_name        128
dtype: int64

In [12]:
# Keep only the rows that actually have a game_name
df = df[df['game_name'].notna()]

In [15]:
# Show the per-column null counts once more before saving. The result is printed
# explicitly because a bare expression is only echoed when it is the last
# statement of a cell; here it was being computed and thrown away.
print(df.isnull().sum())
df.to_csv('streamer_features.csv', index=False)

In [25]:
# With the streamer features cleaned, shrink the graph files to match them.
# NOTE: both CSV files below are overwritten in place.

# Step 1: keep only the target rows whose id still appears in the cleaned
# feature table (df), and write the result back.
target = pd.read_csv('musae_ENGB_target.csv')
target = target.loc[target['id'].isin(df['streamer_id'])]
target.to_csv('musae_ENGB_target.csv', index=False)

# Step 2: edges refer to nodes by the decoy id (new_id), so reload the
# freshly written target file and keep an edge only when both of its
# endpoints are still present there.
edges = pd.read_csv('musae_ENGB_edges.csv')
target = pd.read_csv('musae_ENGB_target.csv')
edges = edges[edges['from'].isin(target['new_id']) & edges['to'].isin(target['new_id'])]

# Step 3: discard any edge with a blank endpoint in either column.
edges = edges.dropna(subset=['from', 'to'])

edges.to_csv('musae_ENGB_edges.csv', index=False)