# Anime analysis

### Dataset: https://www.kaggle.com/datasets/dbdmobile/myanimelist-dataset

# Import libraries

In [None]:
from itertools import chain

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

## Popular Anime:

### What are the top 10 most popular anime based on user ratings?

In [None]:
# First check the column dtypes and a few values on a 20-row sample, to see
# whether they can be changed to save memory before loading the whole file.
user_ratings_df = pd.read_csv('users-score-2023.csv', nrows=20)

In [None]:
user_ratings_df.info()

In [None]:
# The dtypes chosen here are reused as the `dtype=` mapping for the full
# read_csv, so they must be wide enough for the WHOLE file, not only for this
# 20-row sample. `downcast='unsigned'` picks the narrowest type that fits the
# sample (uint8 when every sampled id is below 256); ids outside that range
# then cannot be represented when the full file is read (they overflow or
# raise, depending on the pandas version). Identifier columns therefore get
# an explicit 32-bit unsigned type.
# NOTE(review): assumes ids are non-negative and fit in 32 bits - confirm.
user_ratings_df['user_id'] = user_ratings_df['user_id'].astype('uint32')
user_ratings_df['anime_id'] = user_ratings_df['anime_id'].astype('uint32')
# NOTE(review): assumes ratings are small non-negative integers, so the
# narrowest unsigned type found on the sample also fits the full file - confirm.
user_ratings_df['rating'] = pd.to_numeric(user_ratings_df['rating'], downcast='unsigned')

# Repeated strings are stored once as categories.
user_ratings_df['Username'] = user_ratings_df['Username'].astype('category')
user_ratings_df['Anime Title'] = user_ratings_df['Anime Title'].astype('category')

In [None]:
def get_column_types(df):
    """Map every column name of `df` to the name of its dtype.

    The returned dict keeps the column order of `df` and has the shape
    expected by the `dtype` argument of `pd.read_csv`.
    """
    return {column: dtype.name for column, dtype in df.dtypes.items()}

In [None]:
column_types = get_column_types(user_ratings_df)

In [None]:
# Re-read the whole CSV with the dtypes chosen on the sample.
# Dataframe before process = 927.9+ MB | Dataframe after process = 242.7 MB
user_ratings_df = pd.read_csv('users-score-2023.csv',dtype=column_types)

In [None]:
# Drop the anime that do not have more than 200 user ratings.
min_count = 200
# Number of ratings of each row's anime, aligned with the rows of the frame.
ratings_per_anime = user_ratings_df.groupby('anime_id')['anime_id'].transform('count')
user_ratings_df = user_ratings_df[ratings_per_anime > min_count]

In [None]:
# Mean user rating of every remaining anime, as a plain {anime_id: mean} dict.
anime_ratings = user_ratings_df.groupby('anime_id')['rating'].mean().to_dict()

In [None]:
# Keep the ten best-rated titles, matching the "top 10" question this section answers.
top_n = 10
# (anime_id, mean rating) pairs, highest mean first.
top_anime_by_user_rating = sorted(anime_ratings.items(), key=lambda x: x[1], reverse=True)[:top_n]

In [None]:
# Swap each anime_id for its title (taken from the first rating row of that anime).
title_and_score = []
for id_val, score in top_anime_by_user_rating:
    titles = user_ratings_df.loc[user_ratings_df['anime_id'] == id_val, 'Anime Title']
    title_and_score.append((titles.values[0], score))
top_anime_by_user_rating = title_and_score

In [None]:
top_anime_by_user_rating = pd.DataFrame(top_anime_by_user_rating, columns=['Anime', 'Score'])

In [None]:
top_anime_by_user_rating

## Popular Anime: 
### Is there a correlation between the number of episodes and the ratings of an anime?

In [None]:
# 4.6+ MB | 405.4+ KB
columns = ['anime_id', 'Name', 'Score', 'Genres', 'Episodes', 'Aired', 'Premiered', 'Studios', 'Rating', 'Rank', 'Popularity', 'Favorites', 'Members']
anime_dataset_df = pd.read_csv('anime-dataset-2023.csv', usecols=columns)

In [None]:
anime_dataset_df.info()

In [None]:
# Author's note: based on the anime pages, 'Not yet aired' and 'R18+' titles
# are excluded from the ranking, so rows without a rank are dropped.
# `.copy()` makes the filtered frame independent of the original one, so the
# column assignments done on it afterwards do not raise SettingWithCopyWarning.
anime_dataset_df = anime_dataset_df[anime_dataset_df['Rank'] != 'UNKNOWN'].copy()

In [None]:
# Convert the numeric-looking columns to numbers; values that cannot be
# parsed become NaN (errors='coerce').
for column in ['Rank', 'Score', 'Members', 'anime_id', 'Favorites', 'Popularity']:
    anime_dataset_df[column] = pd.to_numeric(anime_dataset_df[column], downcast='unsigned', errors='coerce')

In [None]:
# Episodes/Score working copy: unparseable episode counts become NaN and
# those rows are removed before the column is downcast.
anime_dataset = anime_dataset_df[['anime_id', 'Episodes', 'Score']].copy()
anime_dataset['Episodes'] = pd.to_numeric(anime_dataset['Episodes'], errors='coerce')
anime_dataset = anime_dataset.dropna(subset=['Episodes']).reset_index(drop=True)
anime_dataset['Episodes'] = pd.to_numeric(anime_dataset['Episodes'], downcast='unsigned')

In [None]:
anime_dataset['Score'] = pd.to_numeric(anime_dataset['Score'], downcast='unsigned')

In [None]:
# Scatter of score against episode count; labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(anime_dataset['Episodes'], anime_dataset['Score'])
ax.set_title('Score vs Number of Episodes')
ax.set_xlabel('Episodes')
ax.set_ylabel('Score')
plt.show()

In [None]:
anime_dataset['Episodes'].corr(anime_dataset['Score'])

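### El valor cercano a 0 (0.063) sugiere una correlación inexistente, por lo que no hay una relación lineal entre la cantidad de episodios y las puntuaciones de los usuarios
### The value close to 0 (0.063) suggests no correlation, i.e. there is no linear relationship between the number of episodes and the user ratings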

## Genre Analysis:
### Which anime genres are most commonly produced?

In [None]:
anime_genres = anime_dataset_df[anime_dataset_df['Genres'] != 'UNKNOWN']['Genres'].copy()

In [None]:
# One list of genre labels per anime, flattened into a single list of
# whitespace-stripped labels and counted per label.
split_values = anime_genres.str.split(',')
flat_values = []
for genre_list in split_values:
    flat_values.extend(label.strip() for label in genre_list)
qty_genres = pd.Series(flat_values).value_counts().to_frame()

In [None]:
qty_genres.reset_index(inplace=True)

In [None]:
qty_genres.rename(columns={'index': 'Genres'}, inplace=True)

In [None]:
qty_genres

In [None]:
# One colour per bar, drawn from seaborn's 'husl' palette.
num_bars = len(qty_genres)
random_colors = sns.color_palette('husl', n_colors=num_bars)

# Bar chart of the 'count' column for every genre.
qty_genres.plot.bar(x='Genres', y='count', rot=90, figsize=(15,6), color=random_colors, legend=False)

## Genre Analysis:
### Are there certain genres that tend to receive higher ratings?

In [None]:
unique_genres = qty_genres['Genres'].tolist()

In [None]:
anime_genres_score = anime_dataset_df[anime_dataset_df['Genres'] != 'UNKNOWN'][['anime_id', 'Genres', 'Score']].copy()

In [None]:
for col in unique_genres:
    anime_genres_score[col] = 0.0

In [None]:
columns_order = list(anime_genres_score['Genres'])

In [None]:
# Turn every 'Genres' string into its list of genre labels.
columns_order = [genres_text.split(', ') for genres_text in columns_order]

In [None]:
anime_genres_score = anime_genres_score[anime_genres_score['Score'] != 'UNKNOWN'].copy()

In [None]:
# Scores that cannot be parsed become NaN; those rows are removed and the
# index is renumbered from 0 before the column is downcast.
anime_genres_score['Score'] = pd.to_numeric(anime_genres_score['Score'], errors='coerce')
anime_genres_score = anime_genres_score.dropna(subset=['Score']).reset_index(drop=True)

anime_genres_score['Score'] = pd.to_numeric(anime_genres_score['Score'], downcast='unsigned')

In [None]:
# Write each anime's score into the column of every genre it belongs to.
# The genre labels are read from the row itself: `columns_order` was built
# before the rows without a score were dropped and the index was reset, so
# looking it up with the new index could pair a row with another anime's genres.
for i, row in anime_genres_score.iterrows():
    row_genres = [label.strip() for label in row['Genres'].split(',')]
    anime_genres_score.loc[i, row_genres] = row['Score']

In [None]:
anime_genres_score_mean_list = []

In [None]:
# For each genre: (label, mean of its non-zero scores rounded to 4 decimals,
# number of rows with a non-zero score). 0.0 marks "not in this genre".
for col in unique_genres:
    genre_scores = anime_genres_score.loc[anime_genres_score[col] != 0.0, col]
    anime_genres_score_mean_list.append((col, round(genre_scores.mean(), 4), len(genre_scores)))

In [None]:
mean_rating_genres = pd.DataFrame(anime_genres_score_mean_list, columns=['Genres', 'mean_rating', 'qty_of_ratings'])

In [None]:
mean_rating_genres.sort_values(by='mean_rating', ascending=False)

In [None]:
# One rainbow colour per genre.
colors = plt.cm.rainbow(np.linspace(0, 1, len(mean_rating_genres['Genres'])))

plt.figure(figsize=(17, 10))
plt.scatter(mean_rating_genres['qty_of_ratings'], mean_rating_genres['mean_rating'], s=50, alpha=0.5, c=colors)

# Label every point with its genre, slightly above the marker.
points = zip(mean_rating_genres['Genres'], mean_rating_genres['qty_of_ratings'], mean_rating_genres['mean_rating'])
for genre, qty, rating in points:
    plt.annotate(genre, (qty, rating), textcoords="offset points", xytext=(0,5), ha='center')

plt.title('Mean Rating vs Quantity of Ratings by Genre')
plt.xlabel('Quantity of Ratings')
plt.ylabel('Mean Rating')
plt.grid(True)
plt.show()

## Genre Analysis:
### What are the most popular genres among different genders?

In [None]:
# 3.4+ GB | 1.3 GB - presumably the memory footprint before | after choosing
# the dtypes, as in the ratings section (NOTE(review): confirm).
# Only a 20-row sample is loaded here, to inspect the columns.
final_anime_dataset_df = pd.read_csv('final_animedataset.csv', nrows=20)

In [None]:
final_anime_dataset_df.info()

In [None]:
# The dtypes chosen here are reused as the `dtype=` mapping for the full
# read_csv, so they must fit the WHOLE file, not only this 20-row sample.
# `downcast='unsigned'` on the sample would pick the narrowest type that fits
# 20 rows, which the remaining rows can exceed. Id and count columns
# therefore get an explicit 32-bit unsigned type.
# NOTE(review): assumes these columns are non-negative integers without
# missing values that fit in 32 bits - confirm against the full file.
for column in ('user_id', 'anime_id', 'scored_by', 'popularity'):
    final_anime_dataset_df[column] = final_anime_dataset_df[column].astype('uint32')

# NOTE(review): assumes personal scores are small non-negative integers, so
# the narrowest unsigned type found on the sample fits the full file - confirm.
final_anime_dataset_df['my_score'] = pd.to_numeric(final_anime_dataset_df['my_score'], downcast='unsigned')
# Kept as a float: an integer type inferred from a sample that happens to
# contain only whole numbers could not hold fractional scores.
final_anime_dataset_df['score'] = final_anime_dataset_df['score'].astype('float64')

# Repeated strings are stored once as categories.
for column in ('username', 'gender', 'title', 'type', 'source', 'genre'):
    final_anime_dataset_df[column] = final_anime_dataset_df[column].astype('category')

In [None]:
column_types = get_column_types(final_anime_dataset_df)

In [None]:
final_anime_dataset_df = pd.read_csv('final_animedataset.csv', dtype=column_types)

In [None]:
final_anime_dataset_df['rank'] = pd.to_numeric(final_anime_dataset_df['rank'], downcast='unsigned')

In [None]:
# Collapse every gender other than 'Male' / 'Female' (missing values
# included) into a single 'Other' bucket. A vectorised mask replaces the
# per-row lambda; converting to `object` first allows assigning a label that
# is not yet one of the column's categories, and makes the handling of
# missing values explicit instead of dependent on how `apply` treats
# categorical data.
gender_labels = final_anime_dataset_df['gender'].astype('object')
gender_labels = gender_labels.where(gender_labels.isin(['Male', 'Female']), 'Other')
final_anime_dataset_df['gender'] = gender_labels.astype('category')

In [None]:
# {anime_id: [male, female, other]} - number of rows per gender for every
# anime; a gender absent from a group counts as 0.
anime_id_values = {}
for _, grouped in final_anime_dataset_df.groupby('anime_id'):
    gender_count = grouped['gender'].value_counts()
    id_ = grouped['anime_id'].values[0]
    anime_id_values[id_] = [gender_count.get(label, 0) for label in ('Male', 'Female', 'Other')]

In [None]:
# For every genre (in `unique_genres` order), the anime_ids whose column for
# that genre holds a non-zero score.
mask_genres_list = [
    anime_genres_score.loc[anime_genres_score[genre] != 0.0, 'anime_id'].values
    for genre in unique_genres
]

In [None]:
gender_genre = pd.DataFrame(0, index=unique_genres, columns=['Male', 'Female', 'Other'])

In [None]:
# Sum the per-anime [male, female, other] counts over all anime of a genre;
# anime_ids missing from `anime_id_values` are ignored.
for genre, anime_ids in zip(unique_genres, mask_genres_list):
    matching_values = [anime_id_values[anime_id] for anime_id in anime_ids if anime_id in anime_id_values]
    genre_values = pd.DataFrame(matching_values, columns=gender_genre.columns)
    gender_genre.loc[genre] = list(genre_values.sum())

In [None]:
# Keep only the first 19 rows for the chart (rows follow the order of `unique_genres`).
gender_genre = gender_genre.head(19)

In [None]:
ax = gender_genre.plot(kind='barh', figsize=(19, 10))

ax.set_title('Distribution of Anime Genres by Gender')
ax.set_xlabel('Count')
ax.set_ylabel('Genres')

ax.legend(title='Gender', bbox_to_anchor=(1, 1))

plt.show()

## User Preferences:
### What is the distribution of user ratings for the entire dataset?

In [None]:
# NOTE: `user_ratings_df` was filtered earlier in the notebook to the anime
# with more than `min_count` ratings, so the distribution built from it
# covers that subset rather than every row of the CSV.
anime_ratings_distribution = user_ratings_df[['user_id', 'rating']].copy()

In [None]:
anime_ratings_distribution= anime_ratings_distribution['rating'].value_counts().reset_index()

In [None]:
anime_ratings_distribution.sort_values(by='rating', ascending=True, inplace=True)

In [None]:
num_bars = len(anime_ratings_distribution)
random_colors = sns.color_palette('husl', n_colors=num_bars)
anime_ratings_distribution.plot.bar(x='rating', y='count', color=random_colors, legend=False, figsize=(15, 8))

## Seasonal Trends:
### Do certain genres become more popular during specific seasons?

In [None]:
seasonal_anime = anime_dataset_df[['anime_id', 'Aired', 'Premiered', 'Genres', 'Studios']].copy()

In [None]:
pd.set_option('future.no_silent_downcasting', True)

In [None]:
unknown_premiered_anime = seasonal_anime[seasonal_anime['Premiered'] == 'UNKNOWN'].copy()

In [None]:
# Keep only the first word of the part of 'Aired' that precedes the first comma.
unknown_premiered_anime['Aired'] = unknown_premiered_anime['Aired'].apply(
    lambda aired: aired.split(',')[0].split(' ')[0]
)

In [None]:
# Month abbreviation -> season label, used to fill in a missing 'Premiered'.
# The labels follow MyAnimeList's broadcast seasons (winter = Jan-Mar,
# spring = Apr-Jun, summer = Jul-Sep, fall = Oct-Dec) so that the derived
# values agree with the 'Premiered' values already present in the dataset.
seasons = {
    'Jan': 'winter',
    'Feb': 'winter',
    'Mar': 'winter',
    'Apr': 'spring',
    'May': 'spring',
    'Jun': 'spring',
    'Jul': 'summer',
    'Aug': 'summer',
    'Sep': 'summer',
    'Oct': 'fall',
    'Nov': 'fall',
    'Dec': 'fall'
}

In [None]:
unknown_premiered_anime['Premiered'] =  unknown_premiered_anime['Aired'].map(seasons).fillna(unknown_premiered_anime['Premiered'])

In [None]:
seasonal_anime.update(unknown_premiered_anime)

In [None]:
seasonal_anime['Premiered'] = seasonal_anime['Premiered'].apply(lambda x: x.split(' ')[0])

In [None]:
seasonal_anime = seasonal_anime[seasonal_anime['Premiered'] != 'UNKNOWN']

In [None]:
seasonal_genre_count = {}
for genre in unique_genres:
    seasonal_genre_count[genre] = 0

In [None]:
columns_ = []
seasonal_genres_values_list = []

In [None]:
# For every season, count how many titles carry each genre.
# `columns_` receives the season labels and `seasonal_genres_values_list` the
# matching list of counts (in `unique_genres` order).
for season, grouped in seasonal_anime.groupby('Premiered'):
    # Start every season from zero, keeping the same genre keys.
    seasonal_genre_count = {key: 0 for key in seasonal_genre_count}
    columns_.append(season)
    for genres_text in grouped['Genres']:
        # Split once per title instead of once per (title, genre) pair.
        row_genres = set(genres_text.split(', '))
        for genre in unique_genres:
            if genre in row_genres:
                seasonal_genre_count[genre] += 1
    seasonal_genres_values_list.append(list(seasonal_genre_count.values()))

In [None]:
seasonal_genres = pd.DataFrame(0, index=unique_genres, columns=columns_)

In [None]:
for i, season in enumerate(columns_):
    seasonal_genres[season] = seasonal_genres_values_list[i]

In [None]:
seasonal_genres = seasonal_genres.reset_index()

In [None]:
seasonal_genres_1 = seasonal_genres.iloc[:6]
seasonal_genres_2 = seasonal_genres.iloc[6:12]
seasonal_genres_3 = seasonal_genres.iloc[12:18]
seasonal_genres_4 = seasonal_genres.iloc[18:]

In [None]:
def plot_graphs_genres_season(df, limit_val=None):
    """Show a grouped bar chart of per-season counts for a set of genres.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per genre. The genre label is read from column 'index' and
        the counts from columns 'fall', 'spring', 'summer' and 'winter'.
    limit_val : float, optional
        Upper limit of the y axis. When omitted, the limit computed by
        matplotlib is kept.

    Returns
    -------
    None
        The figure is displayed with `plt.show()`.
    """
    genres = list(df['index'])

    season_names = {
        season: df[season].to_list()
        for season in ('fall', 'spring', 'summer', 'winter')
    }

    x = np.arange(len(genres))  # the label locations
    width = 0.22  # the width of the bars

    fig, _ax = plt.subplots(layout='constrained', figsize=(10,6))

    # One bar per season, shifted by one bar width per season within a group.
    for multiplier, (attribute, measurement) in enumerate(season_names.items()):
        offset = width * multiplier
        rects = _ax.bar(x + offset, measurement, width, label=attribute)
        _ax.bar_label(rects, padding=4, color='black')

    # Add some text for labels, title and custom x-axis tick labels, etc.
    _ax.set_ylabel('qty')
    _ax.set_xlabel('Genres')
    _ax.set_title('Genres by season')
    # Put the tick under the middle of each group: the bars of a group are
    # centred at x, x + width, ... so their midpoint is half-way to the last one.
    _ax.set_xticks(x + width * (len(season_names) - 1) / 2, genres)
    _ax.legend(loc='upper right', ncols=4)
    _ax.set_ylim(0, limit_val)

    plt.show()

In [None]:
plot_graphs_genres_season(seasonal_genres_1, 2100)

In [None]:
plot_graphs_genres_season(seasonal_genres_2, 610)

In [None]:
plot_graphs_genres_season(seasonal_genres_3, 160)

In [None]:
plot_graphs_genres_season(seasonal_genres_4, 50)

## Studios and Directors:
### Which studios or directors have the highest average ratings for their anime?

In [None]:
studios_ratings = anime_dataset_df[['Studios', 'Score', 'Favorites', 'Popularity', 'Rank']].copy()

In [None]:
# Scores that cannot be parsed become NaN; those rows are removed and the
# index is renumbered from 0.
studios_ratings['Score'] = pd.to_numeric(studios_ratings['Score'], downcast='unsigned', errors='coerce')
studios_ratings = studios_ratings.dropna(subset=['Score']).reset_index(drop=True)

In [None]:
studios_ratings = studios_ratings.assign(Studios=studios_ratings['Studios'].str.split(', ')).explode('Studios')

In [None]:
studios_ratings_dict = []

In [None]:
# One (studio, mean score, number of rows) tuple per studio.
for studio, grouped in studios_ratings.groupby('Studios'):
    studios_ratings_dict.append((studio, grouped['Score'].mean(), len(grouped)))

In [None]:
# Per-studio summary, best mean score first.
studios_mean_ratings = (
    pd.DataFrame(studios_ratings_dict, columns=['Studios', 'mean_ratings', 'qty'])
    .sort_values(by='mean_ratings', ascending=False)
)

In [None]:
studios_mean_ratings

## Studios and Directors:
### Is there a correlation between the involvement of a specific studio or director and the success of an anime?

In [None]:
# `explode` repeated every multi-studio anime once per studio while keeping
# the row's index label (the index was reset just before exploding), so one
# row per label is kept to count each anime only once.
success_metrics = studios_ratings.loc[
    ~studios_ratings.index.duplicated(),
    ['Score', 'Favorites', 'Popularity', 'Rank'],
]

# The four columns were already converted to numbers when the anime dataset
# was cleaned, so no 'UNKNOWN' filtering is needed; `corr` ignores missing
# values pair by pair.
correlation_matrix = success_metrics.corr()

sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix: Score, Favorites, Popularity and Rank')
plt.show()

## User Engagement:
### Are there specific anime that have high engagement but relatively lower ratings?

In [None]:
members_ratings = anime_dataset_df[['anime_id', 'Name', 'Score', 'Members']].copy()

In [None]:
members_ratings.sort_values(['Score', 'Members'],ascending = [True, False], inplace=True)

In [None]:
# Keep the titles with more than one million members, then the first 15 of
# them (the frame was sorted by ascending Score, then descending Members).
members_ratings = members_ratings[members_ratings['Members'] > 1000000].head(15)

In [None]:
members_ratings.reset_index(drop=True, inplace=True)

In [None]:
plt.figure(figsize=(15, 10))
plt.scatter(members_ratings['Score'], members_ratings['Members'], color='blue', alpha=0.7)

# Write every title next to its point.
labelled_points = zip(members_ratings['Score'], members_ratings['Members'], members_ratings['Name'])
for score, members, txt in labelled_points:
    plt.text(score, members, txt, fontsize=8, ha='right')

plt.title('Anime Scores vs Members')
plt.xlabel('Score')
plt.ylabel('Members')
plt.grid(True)
plt.show()

## Demographic Analysis:
### Is there a difference in preferences between male and female users?

In [None]:
# Keep only the rows of male and female users that have a genre.
# Every other gender was merged into 'Other' earlier in the notebook, so the
# two genders of interest are selected explicitly rather than excluding a
# 'Non-Binary' label that no longer occurs in the column.
gender_preference_df = final_anime_dataset_df.loc[
    final_anime_dataset_df['gender'].isin(['Male', 'Female'])
    & final_anime_dataset_df['genre'].notna(),
    ['gender', 'genre'],
].copy()

In [None]:
male_gender_preference = {}
female_gender_preference = {}

In [None]:
unique_genres = gender_preference_df['genre'].str.split(', ')

In [None]:
from itertools import chain
 
columns_order_genre_unique = set(chain(*unique_genres.tolist()))

In [None]:
columns_order_genre_unique = list(columns_order_genre_unique)

In [None]:
# Start both counters at 0 for every genre.
male_gender_preference.update(dict.fromkeys(columns_order_genre_unique, 0))
female_gender_preference.update(dict.fromkeys(columns_order_genre_unique, 0))

In [None]:
# Count, per gender, how many rows carry each genre.
# Rows are first counted per exact genre string, then every string is split
# into its individual labels. Labels are compared exactly, so a label that is
# contained in another one (e.g. 'Shounen' inside 'Shounen Ai') is not
# over-counted, as it would be with a substring / regex `str.contains` test.
for gender_label, preference in (('Male', male_gender_preference),
                                 ('Female', female_gender_preference)):
    # The gender filter is applied once per gender, not once per genre.
    gender_rows = gender_preference_df.loc[gender_preference_df['gender'] == gender_label, 'genre']
    totals = dict.fromkeys(columns_order_genre_unique, 0)
    for genre_text, row_count in gender_rows.value_counts().items():
        if row_count == 0:
            # Unused categories of a categorical column are listed with a zero count.
            continue
        # A set, so a row is counted once per genre even if a label repeats.
        for genre in set(genre_text.split(', ')):
            totals[genre] = totals.get(genre, 0) + int(row_count)
    # Assign rather than accumulate, so re-running the cell gives the same result.
    preference.update(totals)

In [None]:
genres_gender_values = []

In [None]:
# One (genre, male count, female count) tuple per genre.
genres_gender_values.extend(
    (key, male_gender_preference[key], female_gender_preference[key])
    for key in male_gender_preference
)

In [None]:
genres_gender = pd.DataFrame(genres_gender_values, columns=['Genres', 'Male', 'Female'])

In [None]:
genres_gender.sort_values(by=['Male', 'Female'], ascending=False, inplace=True)

In [None]:
genres_gender

In [None]:
ax = genres_gender.plot(x='Genres', y=['Male', 'Female'], kind='bar', figsize=(17, 10))
ax.set_ylabel('Values')
ax.set_title('Grouped Bar Chart by Genre')
plt.xticks(rotation=90, ha='right')
plt.show()