In [1]:
# Dependencies
import requests
import json
import pandas as pd
import query
import time

# Display option: None removes the cap on how many columns pandas shows,
# so wide frames are displayed without column truncation.
pd.set_option('display.max_columns', None)

In [2]:
def getResponse(variables, timeout=30):
    """Fetch one page of results from the API and flatten it into a dataframe.

    Posts ``query.query`` together with ``variables`` to ``query.url`` and
    normalises the ``data -> Page -> media`` records of the JSON reply.

    Parameters
    ----------
    variables : dict
        Query variables sent along with the query.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    tuple
        ``(page_flag, df)`` where ``page_flag`` is the reply's
        ``pageInfo.hasNextPage`` value and ``df`` holds the page's media
        records. On any failure (network error, HTTP error status, non-JSON
        body, unexpected payload shape) the problem is printed and
        ``(False, empty DataFrame)`` is returned so the caller stops paging.
    """
    try:
        response = requests.post(
            query.url,
            json={'query': query.query, 'variables': variables},
            timeout=timeout,
        )
        # requests.post does not raise on 4xx/5xx by itself; without this call
        # the HTTPError handler below could never run.
        response.raise_for_status()
        # load the response into a json object
        data = json.loads(response.content)
    except requests.exceptions.HTTPError as err:
        print(err.response.status_code)
        print(err.response.text)
        return False, pd.DataFrame()
    except (requests.exceptions.RequestException, ValueError) as err:
        # Connection problems / timeouts, or a body that is not valid JSON
        # (json.JSONDecodeError is a ValueError subclass).
        print(f"Request failed: {err!r}")
        return False, pd.DataFrame()

    try:
        # save the hasNextPage feature to determine further data extraction from API
        page_flag = data['data']['Page']['pageInfo']['hasNextPage']
        # convert json into a dataframe
        df = pd.json_normalize(data, record_path=['data', 'Page', 'media'])
    except (TypeError, KeyError) as err:
        # TypeError: an intermediate level is null / not a mapping;
        # KeyError: an expected key is missing from the payload.
        print(f"Unexpected response payload: {err!r}")
        return False, pd.DataFrame()

    return page_flag, df

In [3]:
def flipping_pages(variables, flag = True):
    """Keep requesting pages for as long as the API reports a next page.

    Parameters
    ----------
    variables : dict
        Query variables; must contain a numeric ``'page'`` entry giving the
        first page to fetch. The dict is copied, so the caller's object is
        left untouched and the call can be repeated with the same argument.
    flag : bool, optional
        Whether to fetch at all; when false no request is made and an empty
        dataframe is returned.

    Returns
    -------
    pandas.DataFrame
        All fetched pages stacked in request order. Each page keeps its own
        row index, so index labels repeat across pages.
    """
    start = time.time()
    # Work on a copy: incrementing 'page' on the caller's dict would make a
    # second call start from wherever the previous one stopped.
    variables = dict(variables)
    frames = []
    while flag:
        flag, page_df = getResponse(variables)
        # Empty frames (failed or empty page) contribute no rows; skip them.
        if not page_df.empty:
            frames.append(page_df)
        print(f"Page {variables['page']} has been registered into the dataframe.")
        variables['page'] += 1
    # Concatenate once instead of re-copying the growing frame on every page.
    df = pd.concat(frames) if frames else pd.DataFrame()
    end = time.time()
    print(f"Data mining the API completed after: {end-start} sec.")
    return df

In [4]:
# Fetch all pages from the API using the variables defined in the local
# `query` module; one progress line is printed per page.
data =  flipping_pages(query.variables)

Page 1 has been registered into the dataframe.
Page 2 has been registered into the dataframe.
Page 3 has been registered into the dataframe.
Page 4 has been registered into the dataframe.
Page 5 has been registered into the dataframe.
Page 6 has been registered into the dataframe.
Page 7 has been registered into the dataframe.
Page 8 has been registered into the dataframe.
Page 9 has been registered into the dataframe.
Page 10 has been registered into the dataframe.
Page 11 has been registered into the dataframe.
Page 12 has been registered into the dataframe.
Page 13 has been registered into the dataframe.
Page 14 has been registered into the dataframe.
Page 15 has been registered into the dataframe.
Page 16 has been registered into the dataframe.
Page 17 has been registered into the dataframe.
Page 18 has been registered into the dataframe.
Page 19 has been registered into the dataframe.
Page 20 has been registered into the dataframe.
Page 21 has been registered into the dataframe.
P

In [5]:
# Summarise columns, non-null counts and dtypes of the downloaded data
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4500 entries, 0 to 49
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               4500 non-null   int64  
 1   status           4500 non-null   object 
 2   genres           4500 non-null   object 
 3   description      4500 non-null   object 
 4   averageScore     2205 non-null   float64
 5   popularity       4500 non-null   int64  
 6   siteUrl          4500 non-null   object 
 7   title.english    2069 non-null   object 
 8   title.native     4008 non-null   object 
 9   title.romaji     4500 non-null   object 
 10  startDate.year   4017 non-null   float64
 11  startDate.month  2769 non-null   float64
 12  startDate.day    2579 non-null   float64
 13  endDate.year     3011 non-null   float64
 14  endDate.month    2591 non-null   float64
 15  endDate.day      2413 non-null   float64
dtypes: float64(7), int64(2), object(7)
memory usage: 597.7+ KB


In [6]:
# (rows, columns) of the downloaded data
data.shape

(4500, 16)

In [7]:
# Preview the first rows of the downloaded data
data.head()

Unnamed: 0,id,status,genres,description,averageScore,popularity,siteUrl,title.english,title.native,title.romaji,startDate.year,startDate.month,startDate.day,endDate.year,endDate.month,endDate.day
0,30001,FINISHED,"[Drama, Mystery, Psychological, Thriller]",Everyone faces uncertainty at some point in th...,90.0,59580,https://anilist.co/manga/30001,Monster,MONSTER,MONSTER,1994.0,12.0,5.0,2001.0,12.0,20.0
1,30002,RELEASING,"[Action, Adventure, Drama, Fantasy, Horror, Ps...","His name is Guts, the Black Swordsman, a feare...",93.0,133510,https://anilist.co/manga/30002,Berserk,ベルセルク,Berserk,1989.0,8.0,25.0,,,
2,30003,FINISHED,"[Drama, Mystery, Psychological, Sci-Fi, Thriller]","Humanity, having faced extinction at the end o...",88.0,55697,https://anilist.co/manga/30003,20th Century Boys,20世紀少年,20th Century Boys,1999.0,9.0,27.0,2006.0,4.0,24.0
3,30004,FINISHED,"[Sci-Fi, Slice of Life]",Set hundreds of years in the future after an e...,86.0,21509,https://anilist.co/manga/30004,Yokohama Kaidashi Kikou,ヨコハマ買い出し紀行,Yokohama Kaidashi Kikou,1994.0,4.0,26.0,2006.0,2.0,26.0
4,30007,RELEASING,"[Action, Comedy, Drama, Sports]",Makunouchi Ippo has been bullied his entire li...,86.0,18911,https://anilist.co/manga/30007,,はじめの一歩,Hajime no Ippo,1989.0,9.0,27.0,,,


In [8]:
# Save the dataframe into csv (out_data.csv).
# index = False leaves the row index out of the file: it is only the
# per-page position (0 to 49, repeated for every page), not a unique key.
data.to_csv('out_data.csv', index = False)