In [None]:
from jikanpy import Jikan
import json
import time
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Create the shared API client, pointed explicitly at the Jikan v4 endpoint
JIKAN_V4_BASE_URL = 'https://api.jikan.moe/v4'
jikan = Jikan(selected_base=JIKAN_V4_BASE_URL)

In [None]:
# NOTE: the `jikan` client is created once in the setup cell (with the v4 base
# URL). It is deliberately not re-created here, because a bare `Jikan()` would
# discard that explicit configuration.

# Accumulator for the extracted rows; fetch_anime_data() appends one tuple per anime
extracted_data = []

# Inclusive (start_year, end_year) ranges to fetch; contiguous and non-overlapping
year_ranges = [
    (1917, 1989),
    (1990, 1995),
    (1996, 2000),
    (2001, 2005),
    (2006, 2010),
    (2011, 2015),
    (2016, 2020),
    (2021, 2024),
]

# Helper: flatten one anime record from the API response into a row tuple
def _anime_to_row(anime):
    """Flatten one anime record into the tuple layout stored in the row list.

    The tuple order is (mal_id, title_default, title_japanese, score, rank,
    popularity, season, year, demographics, genres). Scalar fields that are
    absent become ``None``; genres/demographics that are absent or ``None``
    become an empty string.
    """
    title_default = None
    title_japanese = None
    # `or []` also covers a key that is present but set to None
    for title_entry in anime.get('titles') or []:
        title_type = title_entry.get('type')
        if title_type == 'Default':
            title_default = title_entry.get('title')
        elif title_type == 'Japanese':
            title_japanese = title_entry.get('title')

    # Comma-separated names; entries without a usable name are skipped
    genres = ', '.join(
        genre['name'] for genre in anime.get('genres') or [] if genre.get('name')
    )
    demographics = ', '.join(
        demographic['name']
        for demographic in anime.get('demographics') or []
        if demographic.get('name')
    )

    return (
        anime.get('mal_id'),
        title_default,
        title_japanese,
        anime.get('score'),
        anime.get('rank'),
        anime.get('popularity'),
        anime.get('season'),
        anime.get('year'),
        demographics,
        genres,
    )


# Function to fetch and process anime data within a given year range
def fetch_anime_data(start_year, end_year, *, client=None, sink=None,
                     delay=1.2, max_retries=3):
    """Fetch every page of TV anime for a year range and append rows to a list.

    Pages are requested one at a time via ``client.search('anime', '', ...)``
    with ``start_date``/``end_date``/``type`` parameters until the response
    reports no further page. The response is assumed to be a dict with a
    ``'data'`` list and a ``'pagination'`` dict, as read below.

    A failed request is retried (same page) with an increasing pause instead
    of abandoning the whole range on the first error; after ``max_retries``
    consecutive failures the range is given up and a message is printed.

    Args:
        start_year: First year of the range (used as ``<year>-01-01``).
        end_year: Last year of the range (used as ``<year>-12-31``).
        client: Object exposing ``search(...)``; defaults to the module-level
            ``jikan`` client.
        sink: List that receives the row tuples; defaults to the module-level
            ``extracted_data`` list.
        delay: Seconds to wait between page requests (also the base of the
            retry back-off).
        max_retries: Maximum consecutive retries for a single page.

    Returns:
        None. Rows are appended to ``sink`` in place.
    """
    api = jikan if client is None else client
    rows = extracted_data if sink is None else sink

    query = {
        'start_date': f'{start_year}-01-01',
        'end_date': f'{end_year}-12-31',
        'type': 'tv',
    }

    page = 1
    failures = 0
    while True:
        try:
            # Only the network call is guarded, so parsing problems are not
            # misreported as API errors. A copy of the query is passed in case
            # the client modifies its arguments.
            response = api.search('anime', '', page=page, parameters=dict(query))
        except Exception as e:
            failures += 1
            print(f"An error occurred: {e}")
            if failures > max_retries:
                print(
                    f"Giving up on {start_year}-{end_year} at page {page} "
                    f"after {max_retries} retries; data for this range is incomplete."
                )
                break
            # Back off progressively before retrying the same page
            time.sleep(delay * (2 ** failures))
            continue

        failures = 0

        # Process each anime in the current page's response
        for anime in response.get('data') or []:
            rows.append(_anime_to_row(anime))

        # Stop when the API reports no further page (or omits pagination info)
        if not (response.get('pagination') or {}).get('has_next_page'):
            break

        # Move to the next page
        page += 1

        # Respect the rate limit between consecutive page requests
        time.sleep(delay)

In [None]:
# Start from an empty list so that re-running this cell does not append the
# same rows to extracted_data a second time.
extracted_data.clear()

# Fetch data for each defined year range
for start_year, end_year in year_ranges:
    print(f"Fetching data from {start_year} to {end_year}...")
    fetch_anime_data(start_year, end_year)

# Convert the list of tuples to a pandas DataFrame
df = pd.DataFrame(extracted_data, columns=[
    'mal_id', 'title_default', 'title_japanese', 'score', 'rank', 'popularity', 'season', 'year','demographics', 'genres'
])

# The same anime may be returned on more than one page; keep the first
# occurrence of each mal_id and renumber the index.
df = df.drop_duplicates(subset='mal_id', ignore_index=True)

In [None]:
# Show the assembled DataFrame as this cell's output
df

In [None]:


# Resolve the notebook's file name against the current working directory and
# take its parent folder as the anchor for output paths
notebook_path = os.path.realpath('AnimeArc.ipynb')
script_dir = os.path.dirname(notebook_path)

# Output goes to a sibling 'data' directory one level up; create it if needed
data_dir = os.path.join(script_dir, '..', 'data')
os.makedirs(data_dir, exist_ok=True)

# Destination files: one Excel workbook and one CSV
file_path, file_path2 = (
    os.path.join(data_dir, file_name)
    for file_name in ('anime_data.xlsx', 'anime_data.csv')
)

# Write both copies; the CSV uses UTF-8 with a BOM ('utf-8-sig')
df.to_excel(file_path, index=False, engine='openpyxl')
df.to_csv(file_path2, index=False, encoding='utf-8-sig')

print(f"Data saved to {file_path} & {file_path2}.")


In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()

# Summary statistics for the numeric columns
print(df.describe())


In [None]:
# Count the missing values in each column and print the per-column totals
missing_data_report = df.isna().sum()
print(missing_data_report)

In [None]:
# Keep only the rows that have a score, then summarise the filtered frame
df_with_score = df.dropna(subset=['score'])
df_with_score.info()

In [None]:


# Plotting the histogram of scores
plt.figure(figsize=(10, 6))
sns.histplot(df_with_score['score'], bins=20, kde=True)
plt.title('Distribution of Anime Scores')
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.show()

# Filter out anime without year; .copy() gives an independent frame so the
# columns added below are not written onto a slice of df_with_score
df_with_score_year = df_with_score[df_with_score['year'].notna()].copy()

# Convert year to decade (e.g. 1997 -> 1990) and build a display label ("1990s")
df_with_score_year['decade'] = ((df_with_score_year['year'] // 10) * 10).astype(int)
df_with_score_year['dec_label'] = df_with_score_year['decade'].astype(str) + 's'

# Explicit chronological order for the x-axis; without it the boxes follow the
# order in which the decade labels happen to appear in the data
decade_order = [
    f'{decade}s' for decade in sorted(df_with_score_year['decade'].unique())
]

# Plotting the score distribution over the decades using a box plot
plt.figure(figsize=(14, 8))
sns.boxplot(x='dec_label', y='score', data=df_with_score_year, order=decade_order)
plt.title('Anime Score Distribution Over the Decades')
plt.xlabel('Decade')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.show()

### Analysis of the Distribution of Anime Scores

#### 1. **Score Distribution Histogram (First Graph):**
   - **Roughly Normal Distribution:** The histogram of anime scores appears to follow a roughly normal distribution, with the majority of scores falling between 6 and 8. This suggests that most anime receive moderate to good ratings.
   - **Peak Scores:** The most frequent scores fall between 6.5 and 7.5, indicating that the average anime tends to receive a decent score, neither too high nor too low.
   - **Tails on Both Sides:** The distribution tails off towards both lower and higher scores. This indicates the presence of both underperforming anime (scores below 6) and highly rated anime (scores above 8), but these are less common than titles that score in the middle range.

#### 2. **Anime Score Distribution Over the Decades (Second Graph):**
   - **1970s to 1980s:**
     - **Score Stability:** The median scores in these decades are relatively consistent, hovering around 6 to 7.
     - **Narrower Range of Scores:** The spread of scores in these decades is tighter, with fewer low-scoring anime. This could indicate that early anime productions were more consistent in quality.
   - **1990s:**
     - **Wider Distribution:** The scores start to show more variability, with a broader range from about 5 to 8.5. This suggests increasing diversity in anime quality during this period, with some productions starting to stand out more.
     - **Higher Upper Quartile:** The top 25% of anime from this decade scored noticeably higher, which might reflect the rise of iconic and influential series during this period.
   - **2000s to 2020s:**
     - **Increased Variability:** The score distribution continues to widen, especially in the lower scores. This could indicate a greater variety of anime content, including niche genres or lower-budget productions that don't always score well.
     - **Higher Median Scores in Recent Decades:** Despite the variability, the median scores in the 2000s and 2010s are higher than in previous decades. This could suggest an improvement in overall anime quality or a change in audience expectations and scoring behavior.
     - **More Outliers:** The recent decades show more outliers, particularly at the lower end, indicating a few very low-scoring anime. This could be due to increased anime production, leading to more variability in quality.

### Key Insights:
   - **Improvement Over Time:** The median score has increased slightly in recent decades, indicating potential improvements in anime quality or changes in scoring criteria.
   - **Diversity in Quality:** The increased spread of scores, especially in the 2000s and 2010s, shows that while there are highly rated anime, there is also a significant number of lower-rated ones, reflecting greater diversity in production.
   - **Consistency in Early Years:** The 1970s and 1980s show less variability in scores, possibly indicating more consistent production quality or a smaller number of releases.
   - **Impact of Iconic Anime:** The 1990s show a notable increase in higher scores, possibly due to the release of several influential anime that shaped the industry.


In [None]:
# Print the minimum and then the maximum of each ranking-related column
for column in ('rank', 'score', 'popularity'):
    print(df[column].min())
    print(df[column].max())

In [None]:
# Ten rows with the smallest 'popularity' values, shown as the cell output
pop_10 = df.nsmallest(n=10, columns='popularity')
pop_10

In [None]:
# Ten rows with the smallest 'rank' values, shown as the cell output
rank_10 = df.nsmallest(n=10, columns='rank')
rank_10

In [None]:
# Ten rows with the largest 'score' values, shown as the cell output
score_10 = df.nlargest(n=10, columns='score')
score_10