In [2]:
# Dependencies
import requests
import json
import pandas as pd
import query
import time

# Show every column when a DataFrame is displayed instead of eliding the middle of wide frames.
pd.set_option('display.max_columns', None)

In [3]:
def getResponse(variables):
    """Request one page from the API and return it as a pandas DataFrame.

    Parameters
    ----------
    variables : dict
        GraphQL variables posted together with ``query.query`` to ``query.url``.

    Returns
    -------
    tuple
        ``(page_flag, df)`` where ``page_flag`` is the ``hasNextPage`` value
        found under ``data.Page.pageInfo`` and ``df`` holds the flattened
        ``data.Page.media`` records. When the request fails, the body is not
        valid JSON, or the payload does not have the expected structure, the
        problem is printed and ``(False, <empty DataFrame>)`` is returned so
        that a paging loop stops instead of crashing.
    """
    try:
        response = requests.post(query.url, json={'query': query.query, 'variables': variables})
        # requests.post() does not raise on 4xx/5xx by itself; without this call
        # the error handler below would never see an HTTP failure.
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        # RequestException also covers connection errors and timeouts, which
        # carry no response object.
        failed_response = getattr(err, 'response', None)
        if failed_response is not None:
            print(failed_response.status_code)
            print(failed_response.text)
        else:
            print(err)
        # Returning here avoids touching an unbound `response` further down.
        return False, pd.DataFrame()

    try:
        # Load the response body into a JSON object.
        data = json.loads(response.content)

        # hasNextPage tells the caller whether another page should be requested.
        page_flag = data['data']['Page']['pageInfo']['hasNextPage']

        # Flatten the list of media records (nested fields become dotted columns).
        df = pd.json_normalize(data, record_path=['data', 'Page', 'media'])
    except (TypeError, KeyError, ValueError) as err:
        # TypeError: a level of the payload is null (e.g. "data": null);
        # KeyError: an expected key is missing (e.g. an error-only payload);
        # ValueError: the body is not valid JSON.
        print(f"Unexpected API response: {err!r}")
        return False, pd.DataFrame()

    return page_flag, df

In [4]:
def flipping_pages(variables, flag = True):
    """Keep requesting pages for as long as the API reports a next page.

    Parameters
    ----------
    variables : dict
        Query variables handed to ``getResponse``; must contain a numeric
        ``'page'`` entry, which is used as the first page to fetch. The dict is
        copied, so the caller's object is left untouched and calling this
        function again with the same dict starts from the same page.
    flag : bool, default True
        Initial loop condition; pass ``False`` to skip fetching entirely.

    Returns
    -------
    pandas.DataFrame
        All fetched pages stacked in request order. Each page keeps its own
        row index (no re-indexing), so index labels repeat across pages. An
        empty DataFrame is returned when nothing was fetched.
    """
    start = time.time()
    # Work on a copy: advancing the page counter on the caller's dict would
    # make a second call silently resume after the last page.
    page_variables = dict(variables)
    pages = []
    while flag:
        flag, data = getResponse(page_variables)
        # Skip empty frames (e.g. a failed request) and concatenate once at the
        # end rather than re-copying the accumulated frame on every iteration.
        if not data.empty:
            pages.append(data)
        print(f"Page {page_variables['page']} has been registered into the dataframe.")
        page_variables['page'] += 1
    end = time.time()
    print(f"Data mining the API completed after: {end-start} .")
    return pd.concat(pages) if pages else pd.DataFrame()

In [5]:
# Fetch the pages using the parameters defined in the local `query` module and
# collect them into a single DataFrame.
y =  flipping_pages(query.variables)

Page 1 has been registered into the dataframe.
Page 2 has been registered into the dataframe.
Page 3 has been registered into the dataframe.
Page 4 has been registered into the dataframe.
Page 5 has been registered into the dataframe.
Page 6 has been registered into the dataframe.
Page 7 has been registered into the dataframe.
Page 8 has been registered into the dataframe.
Page 9 has been registered into the dataframe.
Page 10 has been registered into the dataframe.
Page 11 has been registered into the dataframe.
Page 12 has been registered into the dataframe.
Page 13 has been registered into the dataframe.
Page 14 has been registered into the dataframe.
Page 15 has been registered into the dataframe.
Page 16 has been registered into the dataframe.
Page 17 has been registered into the dataframe.
Page 18 has been registered into the dataframe.
Page 19 has been registered into the dataframe.
Page 20 has been registered into the dataframe.
Page 21 has been registered into the dataframe.
P

In [8]:
# Column names, non-null counts and dtypes of the collected frame.
y.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4500 entries, 0 to 49
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               4500 non-null   int64  
 1   status           4500 non-null   object 
 2   genres           4500 non-null   object 
 3   description      4500 non-null   object 
 4   averageScore     2202 non-null   float64
 5   popularity       4500 non-null   int64  
 6   title.english    2069 non-null   object 
 7   title.native     4008 non-null   object 
 8   title.romaji     4500 non-null   object 
 9   startDate.year   4017 non-null   float64
 10  startDate.month  2769 non-null   float64
 11  startDate.day    2579 non-null   float64
 12  endDate.year     3011 non-null   float64
 13  endDate.month    2591 non-null   float64
 14  endDate.day      2413 non-null   float64
dtypes: float64(7), int64(2), object(6)
memory usage: 562.5+ KB


In [9]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
y.describe()

Unnamed: 0,id,averageScore,popularity,startDate.year,startDate.month,startDate.day,endDate.year,endDate.month,endDate.day
count,4500.0,2202.0,4500.0,4017.0,2769.0,2579.0,3011.0,2591.0,2413.0
mean,32625.284444,61.62307,1189.849778,2000.660443,6.688335,14.150834,2003.230488,6.588576,15.131372
std,1605.929247,8.767872,5524.577093,6.433595,3.481061,9.953506,7.237546,3.371435,9.745036
min,30001.0,36.0,5.0,1948.0,1.0,1.0,1948.0,1.0,1.0
25%,31224.75,55.0,63.0,1999.0,4.0,5.0,2000.0,4.0,6.0
50%,32541.5,60.0,146.0,2002.0,7.0,14.0,2004.0,7.0,15.0
75%,33952.25,67.0,478.25,2005.0,10.0,24.0,2007.0,9.0,25.0
max,35612.0,93.0,133338.0,2014.0,12.0,31.0,2022.0,12.0,31.0


In [11]:
# (rows, columns) of the collected frame.
y.shape

(4500, 15)

In [None]:
# Save the dataframe into csv (out_data.csv)