In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

############### Helper functions ###############

def count_unique_elements(df: pd.DataFrame, col: str) -> int:
    """Print and return the number of distinct comma-separated values in a column.

    Each non-null cell of ``df[col]`` is treated as a comma-separated list
    (e.g. ``"Action, Drama"``); the cell is split on commas, the pieces are
    stripped of surrounding whitespace, and the distinct pieces are counted
    across the whole column.

    Args:
        df: DataFrame that contains the column to inspect.
        col: Name of a string column in ``df``.

    Returns:
        The number of distinct values found in the column.
    """
    col_series = df[col].dropna()
    # Split each cell on commas and flatten to one value per row, so that
    # multi-valued cells contribute every one of their entries.
    all_values = col_series.str.split(',').explode().str.strip()
    unique_values = all_values.unique()
    unique_count = len(unique_values)
    print(f"\nNumber of unique {col}:", unique_count)
    print(f"Unique {col}:", unique_values)
    # The signature promises an int; the original fell off the end and
    # implicitly returned None.
    return unique_count

############### TASK 1 (Explore and describe the data) ###############

# --------- Analyze anime.csv ---------
# 1. Load the dataset
anime_df = pd.read_csv('data/anime.csv')

# 2. Basic checks
print("Number of columns in anime.csv: ", len(anime_df.columns))
print(anime_df.columns)
print("\nNumber of rows in anime.csv: ", len(anime_df))

# Check whether anime_id contains duplicates
anime_id_duplicates = anime_df['anime_id'].duplicated().sum()
print("\nNumber of duplicate anime_id in anime.csv: ", anime_id_duplicates)

# Check unique genres and unique types (both printed by the helper)
count_unique_elements(anime_df, 'genre')
count_unique_elements(anime_df, 'type')

# Rows that contain at least one missing value; a row with several missing
# cells is counted only once because the mask is per row.
missing_values = anime_df.isnull().any(axis=1)
missing_values_count = missing_values.sum()
print("\nNumber of rows with missing values in anime.csv: ", missing_values_count)
print("\nRows with missing values in anime.csv:\n", anime_df[missing_values].head(5))

# Columns ranked by how many missing values they hold (columns without any
# missing value are dropped from the report).
missing_values_count = anime_df.isnull().sum()
missing_values_count = missing_values_count[missing_values_count > 0]
missing_values_count = missing_values_count.sort_values(ascending=False)
print("\nColumns with missing values in anime.csv:\n", missing_values_count)

# Columns that use the literal string 'Unknown' as a value. This must run
# before the numeric conversions below, which turn 'Unknown' into NaN.
unknown_values_count = anime_df.isin(['Unknown']).sum()
unknown_values_count = unknown_values_count[unknown_values_count > 0]
unknown_values_count = unknown_values_count.sort_values(ascending=False)
print("\nColumns with 'Unknown' as value in anime.csv:\n", unknown_values_count)

# Anime with the most episodes, ignoring entries whose episode count is unknown.
# errors='coerce' replaces non-numeric entries such as 'Unknown' with NaN, so
# the unknowns have to be removed with dropna(); comparing the converted
# column against the string 'Unknown' would never exclude anything.
anime_df['episodes'] = pd.to_numeric(anime_df['episodes'], errors='coerce')
most_episodes = anime_df.dropna(subset=['episodes'])
most_episodes = most_episodes.sort_values(by='episodes', ascending=False).head(3)
print("\nTop 3 anime with most episodes:\n", most_episodes)

# Anime with the fewest episodes, ignoring unknown counts and single-episode
# entries (films).
least_episodes = anime_df.dropna(subset=['episodes'])
least_episodes = least_episodes[least_episodes['episodes'] != 1]
least_episodes = least_episodes.sort_values(by='episodes', ascending=True).head(3)
print("\nTop 3 anime with least episodes:\n", least_episodes)

# Anime with the most members
anime_df['members'] = pd.to_numeric(anime_df['members'], errors='coerce')
most_members = anime_df.sort_values(by='members', ascending=False).head(3)
print("\nTop 3 anime with most members:\n", most_members)

# Anime with the fewest members, ignoring entries that failed numeric
# conversion (NaN) and entries with zero members.
least_members = anime_df.dropna(subset=['members'])
least_members = least_members[least_members['members'] != 0]
least_members = least_members.sort_values(by='members', ascending=True).head(3)
print("\nTop 3 anime with least members:\n", least_members)

# Anime with the highest rating
anime_df['rating'] = pd.to_numeric(anime_df['rating'], errors='coerce')
highest_rating = anime_df.sort_values(by='rating', ascending=False).head(3)
print("\nTop 3 anime with highest rating:\n", highest_rating)

# Anime with the lowest rating, ignoring a rating of exactly 0. Rows with a
# missing rating are sorted last by sort_values, so they only appear if fewer
# than three rated rows exist.
lowest_rating = anime_df[anime_df['rating'] != 0]
lowest_rating = lowest_rating.sort_values(by='rating', ascending=True).head(3)
print("\nTop 3 anime with lowest rating:\n", lowest_rating)


# --------- Analyze rating.csv ---------
# 1. Read the ratings table from disk
rating_df = pd.read_csv('data/rating.csv')
# 2. Report the table's dimensions and column labels
print("Number of columns in rating.csv: ", rating_df.shape[1])
print(rating_df.columns)
print("\nNumber of rows in rating.csv: ", rating_df.shape[0])

Number of columns in anime.csv:  7
Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')

Number of rows in anime.csv:  12294

Number of duplicate anime_id in anime.csv:  0

Number of unique genre: 43
Unique genre: ['Drama' 'Romance' 'School' 'Supernatural' 'Action' 'Adventure' 'Fantasy'
 'Magic' 'Military' 'Shounen' 'Comedy' 'Historical' 'Parody' 'Samurai'
 'Sci-Fi' 'Thriller' 'Sports' 'Super Power' 'Space' 'Slice of Life'
 'Mecha' 'Music' 'Mystery' 'Seinen' 'Martial Arts' 'Vampire' 'Shoujo'
 'Horror' 'Police' 'Psychological' 'Demons' 'Ecchi' 'Josei' 'Shounen Ai'
 'Game' 'Dementia' 'Harem' 'Cars' 'Kids' 'Shoujo Ai' 'Hentai' 'Yaoi'
 'Yuri']

Number of unique type: 6
Unique type: ['Movie' 'TV' 'OVA' 'Special' 'Music' 'ONA']

Number of rows with missing values in anime.csv:  277

Rows with missing values in anime.csv:
       anime_id                                       name genre     type  \
2844     33242  IS: Infinite Stratos 2 - Infinite Weddi