In [123]:
from jikanpy import Jikan
import json
import time
import pandas as pd

# Build the API client, pointing it explicitly at the Jikan v4 endpoint
jikan = Jikan(
    selected_base='https://api.jikan.moe/v4',
)

In [124]:
# The `jikan` client is created once in the setup cell with the explicit v4
# base URL. It is deliberately not re-created here: a bare `Jikan()` would
# silently replace that configured client.

# Accumulator for the extracted rows; fetch_anime_data() appends one tuple per anime
extracted_data = []

# Inclusive (start_year, end_year) windows to query; each window is fetched
# from Jan 1 of the start year through Dec 31 of the end year
year_ranges = [(1990, 1995), (1996, 2000), (2001, 2005), (2006, 2010), (2011, 2015), (2016, 2020), (2021, 2023)]

def _anime_to_row(anime):
    """Flatten one anime record (a dict from the search response) into a row tuple.

    The tuple layout is
    ``(mal_id, title_default, title_japanese, score, rank, popularity, season, year, genres)``,
    where ``genres`` is a comma-separated string of genre names ('' when there are none).
    """
    title_default = None
    title_japanese = None

    # `or []` also covers a key that is present but null, which `.get(key, [])` does not
    for title_entry in anime.get('titles') or []:
        title_type = title_entry.get('type')
        if title_type == 'Default':
            title_default = title_entry.get('title')
        elif title_type == 'Japanese':
            title_japanese = title_entry.get('title')

    genres = ', '.join(
        genre['name'] for genre in (anime.get('genres') or []) if genre.get('name')
    )

    return (
        anime.get('mal_id'),
        title_default,
        title_japanese,
        anime.get('score'),
        anime.get('rank'),
        anime.get('popularity'),
        anime.get('season'),
        anime.get('year'),
        genres,
    )


# Function to fetch and process anime data within a given year range
def fetch_anime_data(start_year, end_year, client=None, results=None, delay=1.2, max_retries=3):
    """Fetch every page of TV anime for a year window and append one row per anime.

    Parameters
    ----------
    start_year, end_year : int
        Inclusive window; the query runs from ``{start_year}-01-01`` to
        ``{end_year}-12-31``.
    client : object, optional
        Object exposing ``search(kind, query, page=..., parameters=...)``.
        Defaults to the notebook-level ``jikan`` client.
    results : list, optional
        List that receives the row tuples. Defaults to the notebook-level
        ``extracted_data`` list.
    delay : float, optional
        Seconds to wait between successive page requests (rate limiting).
    max_retries : int, optional
        How many consecutive failures of the same page are retried (with
        exponential backoff) before the window is abandoned.

    Returns
    -------
    None
        Rows are appended to ``results`` in place. The function never raises:
        once retries are exhausted it prints the error and stops, keeping the
        rows of all pages fetched so far.
    """
    if client is None:
        client = jikan
    if results is None:
        results = extracted_data

    page = 1
    failures = 0
    while True:
        try:
            # Fetch data from the API
            response = client.search('anime', '', page=page, parameters={
                'start_date': f'{start_year}-01-01',
                'end_date': f'{end_year}-12-31',
                'type': 'tv',
            })

            # Build the whole page locally first, so a failure halfway through
            # (or a retry) never leaves partial or duplicated rows in `results`
            page_rows = [_anime_to_row(anime) for anime in response['data']]
            has_next_page = response['pagination']['has_next_page']
        except Exception as e:
            failures += 1
            if failures > max_retries:
                print(f"An error occurred: {e}")
                break
            # A single transient error should not drop the rest of the window:
            # back off and try the same page again
            print(f"Request for page {page} failed ({e}); retry {failures}/{max_retries}...")
            time.sleep(delay * 2 ** failures)
            continue

        failures = 0
        results.extend(page_rows)

        # Check if there's another page
        if not has_next_page:
            break

        # Move to the next page
        page += 1

        # Respect the rate limit
        time.sleep(delay)

In [125]:
# Start from an empty buffer so that re-running this cell does not append a
# second copy of every row to the shared `extracted_data` list
extracted_data.clear()

# Fetch data for each defined year range
for start_year, end_year in year_ranges:
    print(f"Fetching data from {start_year} to {end_year}...")
    fetch_anime_data(start_year, end_year)

# Convert the list of tuples to a pandas DataFrame; the column order mirrors
# the tuple layout produced by fetch_anime_data()
df = pd.DataFrame(extracted_data, columns=[
    'mal_id', 'title_default', 'title_japanese', 'score', 'rank', 'popularity', 'season', 'year', 'genres'
])


Fetching data from 1990 to 1995...
Fetching data from 1996 to 2000...
Fetching data from 2001 to 2005...
Fetching data from 2006 to 2010...
Fetching data from 2011 to 2015...
Fetching data from 2016 to 2020...
Fetching data from 2021 to 2023...


In [126]:
# Display the assembled frame; the rendered output elides the middle rows
df

Unnamed: 0,mal_id,title_default,title_japanese,score,rank,popularity,season,year,genres
0,89,Kidou Senshi Victory Gundam,機動戦士Vガンダム,6.71,5630.0,4211,spring,1993.0,"Action, Drama, Sci-Fi"
1,96,Kidou Butouden G Gundam,機動武闘伝Gガンダム,7.58,1592.0,2649,spring,1994.0,"Action, Drama, Romance, Sci-Fi"
2,103,Akazukin Chacha,赤ずきんチャチャ,7.40,2265.0,5326,winter,1994.0,"Adventure, Comedy, Fantasy, Romance"
3,310,"Chiisana Obake: Acchi, Kocchi, Socchi",小さなおばけ アッチ・ソッチ・コッチ,6.04,9341.0,11236,spring,1991.0,"Comedy, Supernatural"
4,331,Mahoujin Guruguru,魔法陣グルグル,7.59,1556.0,6159,fall,1994.0,"Adventure, Comedy, Fantasy"
...,...,...,...,...,...,...,...,...,...
4715,57952,Jijia Yingxiong: Ji Dou Yongzhe 2nd Season,機甲英雄 機鬥勇者,,20874.0,23308,fall,2023.0,"Action, Sci-Fi"
4716,58016,Mabeobsonyeo Dee Dee 2,마법소녀 디디 2,,16634.0,25229,spring,2021.0,Fantasy
4717,58630,Oshiri Tantei 8th Season,おしりたんてい,,17589.0,23194,fall,2023.0,"Comedy, Fantasy, Mystery"
4718,59147,Tobot V: Ujusuhodae,또봇V 우주수호대,,14301.0,26969,spring,2021.0,"Action, Sci-Fi"


In [127]:
import os

# Resolve the kernel's working directory (the folder the notebook runs from).
# Asking the OS directly avoids depending on a hard-coded notebook filename.
script_dir = os.path.realpath(os.getcwd())

# The 'data' directory sits one level up, next to the notebook folder.
# Normalise the path so the '..' segment does not leak into the saved and
# printed locations.
data_dir = os.path.normpath(os.path.join(script_dir, os.pardir, 'data'))

# Ensure the data directory exists
os.makedirs(data_dir, exist_ok=True)

# Define the output paths: one Excel workbook and one CSV copy of the same frame
file_path = os.path.join(data_dir, 'anime_data.xlsx')
file_path2 = os.path.join(data_dir, 'anime_data.csv')

# Save the DataFrame to both files. 'utf-8-sig' writes a BOM at the start of
# the CSV, which helps spreadsheet applications detect the encoding of the
# non-Latin titles.
df.to_excel(file_path, index=False, engine='openpyxl')
df.to_csv(file_path2, index=False, encoding='utf-8-sig')

print(f"Data saved to {file_path} & {file_path2}.")


Data saved to C:\Users\spata\Documents\GitHub\animearc_gc\nb\..\data\anime_data.xlsx & C:\Users\spata\Documents\GitHub\animearc_gc\nb\..\data\anime_data.csv.


In [132]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line)
df.info()

# Summary statistics for the numeric columns
print(df.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4720 entries, 0 to 4719
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   mal_id          4720 non-null   int64  
 1   title_default   4720 non-null   object 
 2   title_japanese  4716 non-null   object 
 3   score           3847 non-null   float64
 4   rank            4712 non-null   float64
 5   popularity      4720 non-null   int64  
 6   season          4612 non-null   object 
 7   year            4612 non-null   float64
 8   genres          4720 non-null   object 
dtypes: float64(3), int64(2), object(4)
memory usage: 332.0+ KB
None
             mal_id        score          rank    popularity         year
count   4720.000000  3847.000000   4712.000000   4720.000000  4612.000000
mean   25126.251695     6.857536   7441.718591   7370.244703  2012.099957
std    18348.813522     0.827376   5708.100394   7258.873524     7.972368
min        1.000000     2.890000  

In [133]:
# Sanity check: earliest and latest 'year' present in the data, one per line
print(df['year'].min(), df['year'].max(), sep='\n')

1990.0
2023.0


In [134]:
# Split the comma-separated genre string into a list.
# The isinstance guard makes this cell safe to re-run: values that are already
# lists are left untouched, whereas `.str.split` on a list-valued column would
# turn every entry into NaN on the second run.
# Note: anime without any genre carry '' here, which splits to [''] and shows
# up as a blank genre after the explode below.
df['genres'] = df['genres'].map(
    lambda value: value.split(', ') if isinstance(value, str) else value
)

# Explode the list so each genre gets its own row (other columns are repeated)
df_expanded = df.explode('genres')

In [135]:
# Preview the first rows of the exploded frame: one row per (anime, genre) pair
df_expanded.head()

Unnamed: 0,mal_id,title_default,title_japanese,score,rank,popularity,season,year,genres
0,89,Kidou Senshi Victory Gundam,機動戦士Vガンダム,6.71,5630.0,4211,spring,1993.0,Action
0,89,Kidou Senshi Victory Gundam,機動戦士Vガンダム,6.71,5630.0,4211,spring,1993.0,Drama
0,89,Kidou Senshi Victory Gundam,機動戦士Vガンダム,6.71,5630.0,4211,spring,1993.0,Sci-Fi
1,96,Kidou Butouden G Gundam,機動武闘伝Gガンダム,7.58,1592.0,2649,spring,1994.0,Action
1,96,Kidou Butouden G Gundam,機動武闘伝Gガンダム,7.58,1592.0,2649,spring,1994.0,Drama


In [144]:
# Number of exploded rows per genre, plus each genre's share (in %) of all
# genre rows, rounded to one decimal
genre_count = (
    df_expanded
    .groupby(['genres'])
    .size()
    .reset_index(name='count')
)
total_genre_rows = genre_count['count'].sum()
genre_count['percentage'] = (genre_count['count'] / total_genre_rows * 100).round(1)
genre_count

Unnamed: 0,genres,count,percentage
0,,297,2.9
1,Action,1408,13.7
2,Adventure,888,8.7
3,Avant Garde,23,0.2
4,Award Winning,53,0.5
5,Boys Love,28,0.3
6,Comedy,1987,19.4
7,Drama,715,7.0
8,Ecchi,296,2.9
9,Erotica,8,0.1


In [145]:
# Rank genres from most to least frequent (original index labels are kept)
sort_genre_count = genre_count.sort_values('count', ascending=False)
sort_genre_count

Unnamed: 0,genres,count,percentage
6,Comedy,1987,19.4
1,Action,1408,13.7
10,Fantasy,1250,12.2
2,Adventure,888,8.7
16,Sci-Fi,825,8.0
7,Drama,715,7.0
15,Romance,713,6.9
19,Supernatural,436,4.2
17,Slice of Life,421,4.1
14,Mystery,314,3.1
