In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Task 1
### Explore and describe the data

In [None]:
# Load the two raw tables. `dir_name` is reused by later sections to reload them.
dir_name = "05_ComicBook"
anime = pd.read_csv(f"{dir_name}/anime.csv")
rating = pd.read_csv(f"{dir_name}/rating.csv")

# NUMBER OF ANIME AND USERS, NAN VALUES
# Row count of the anime table and the number of distinct user ids in the ratings.
n_anime = len(anime.index)
n_user = rating["user_id"].nunique()
print(f"Number of anime: {n_anime}")
print(f"Number of users: {n_user}")

# Missing values per column, reported before any rows are discarded.
print(anime.isna().sum(), rating.isna().sum(), sep="\n")

# Keep only fully populated rows for the plots that follow.
anime, rating = anime.dropna(), rating.dropna()




# GENRES AND TYPE DISTRIBUTION

# The `genre` column holds a ", "-separated list per anime; split it and count
# every genre once for each anime it is attached to.
genres = anime["genre"].str.split(", ").explode().value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
genres.plot(kind="bar", color='skyblue', alpha=0.7, ax=ax)
ax.set_xlabel("Genre")
ax.set_ylabel("Count")
ax.set_title("Genres distribution")
plt.show()

fig, ax = plt.subplots(figsize=(12, 6))
anime["type"].value_counts().plot(kind="bar", color='skyblue', alpha=0.7, ax=ax)
ax.set_xlabel("Type")
ax.set_ylabel("Count")
ax.set_title("Type distribution")
plt.show()

# -1 is a placeholder rather than a score (the user-level statistics later in
# this cell leave it out as well), so it must not be averaged into the mean.
explicit_scores = rating.loc[rating["rating"] != -1, "rating"]
print("mean rating is: ", explicit_scores.mean())





# POPULARITY VS RATING MEAN

# Reload the raw tables (the copies above had incomplete rows dropped) and
# attach the anime metadata to every user rating. Both tables have a `rating`
# column, so after the merge `rating_x` is the individual user rating and
# `rating_y` is the rating stored in the anime table.
anime = pd.read_csv(f"{dir_name}/anime.csv")
rating = pd.read_csv(f"{dir_name}/rating.csv")
rating = rating.merge(anime, on="anime_id")

# Popularity = number of user rating rows per anime; keep anime with more than 1000.
rating_popularity = rating.groupby("anime_id")["rating_x"].count().reset_index(name="popularity")
rating = rating.merge(rating_popularity, on="anime_id")
rating = rating[rating["popularity"] > 1000]

# `rating` still has one row per (user, anime) pair. Plotting and fitting on it
# would repeat each anime once per rating, which overplots identical points and
# weights the trend line and the correlation by popularity. Collapse to one row
# per anime, and drop anime without a stored rating because NaN breaks polyfit.
# assumes anime_id is unique in anime.csv, so rating_y is constant per anime -- TODO confirm
popular_anime = (
    rating[["anime_id", "popularity", "rating_y"]]
    .drop_duplicates("anime_id")
    .dropna(subset=["rating_y"])
)

fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(popular_anime["popularity"], popular_anime["rating_y"], color='skyblue', alpha=0.7)
# Least-squares straight line through the per-anime points.
z = np.polyfit(popular_anime["popularity"], popular_anime["rating_y"], 1)
p = np.poly1d(z)
# Label the line itself: a bare list passed to legend() is matched to artists
# in drawing order, which would attach "Trend line" to the scatter points.
ax.plot(popular_anime["popularity"], p(popular_anime["popularity"]), "r--", label="Trend line")
ax.set_xlabel("Popularity")
ax.set_ylabel("Rating mean")
ax.set_title("Popularity vs Rating mean")
ax.legend()
plt.show()

corr = popular_anime["popularity"].corr(popular_anime["rating_y"])
print(f"Correlation between popularity and rating mean: {corr}")




# USER RATING MEAN DISTRIBUTION AND STD
# -1 entries are not counted in the rating mean and std.

# Start again from the raw ratings. The previous section leaves `rating`
# merged with the anime table and restricted to anime with more than 1000
# ratings, which would silently bias every per-user statistic computed here.
rating = pd.read_csv(f"{dir_name}/rating.csv")
rating = rating[rating["rating"] != -1]

# Per-user mean and standard deviation of the remaining ratings.
user_scores = rating.groupby("user_id")["rating"]
rating_mean = user_scores.mean()
rating_std = user_scores.std()

# Centre and spread of the distribution of per-user means.
overall_mean = rating_mean.mean()
spread = rating_mean.std()

fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(rating_mean, bins=50, color='skyblue')

# Labels are attached to the lines directly so the legend does not depend on
# the order in which artists were drawn. Only one of the two green lines is
# labelled so the legend shows a single "Mean +- Std" entry.
ax.axvline(overall_mean, color="red", linestyle="dashed", linewidth=1, label="Mean")
ax.axvline(overall_mean + spread, color="green", linestyle="dashed", linewidth=1, label="Mean +- Std")
ax.axvline(overall_mean - spread, color="green", linestyle="dashed", linewidth=1)

ax.set_xlabel("Rating mean")
ax.set_ylabel("Count")
ax.set_title("Rating mean distribution")
ax.legend()

plt.show()

print("mean rating is: ", overall_mean)




# Task 2
### Pre-process the data 

In [None]:
# Work from fresh copies of the raw tables.
anime = pd.read_csv(f"{dir_name}/anime.csv")
rating = pd.read_csv(f"{dir_name}/rating.csv")


# STEP 1: fill missing anime ratings using the users' ratings
# When users have rated an anime whose own rating is missing, their mean can stand in for it.

rating = rating.dropna(subset=['rating'])

# Diagnostics: which anime lack a rating, and how many user rating rows exist
# for each of them. These frames are not used by the fill below; they are kept
# because later cells may reference them. The counts include -1 entries.
anime_with_nan_rating = anime[anime['rating'].isna()]
rating_for_nan_anime = rating[rating['anime_id'].isin(anime_with_nan_rating['anime_id'])]
user_rating_counts = rating_for_nan_anime.groupby('anime_id')['user_id'].count().reset_index(name='user_rating_count')
anime_with_user_ratings = user_rating_counts[user_rating_counts['user_rating_count'] > 0]

# Mean user rating per anime. -1 is a placeholder rather than a score (Task 1
# likewise leaves it out of its user statistics), so it is excluded here;
# otherwise the filled-in ratings would be dragged towards -1.
explicit_ratings = rating[rating['rating'] != -1]
user_avg_ratings = explicit_ratings.groupby('anime_id')['rating'].mean().reset_index(name='user_rating_mean')

# Left merge keeps every anime; combine_first only fills the ratings that are
# missing and leaves existing ones untouched.
anime = anime.merge(user_avg_ratings, on='anime_id', how='left')
anime['rating'] = anime['rating'].combine_first(anime['user_rating_mean'])
anime = anime.drop(columns='user_rating_mean')

# STEP 2: remove missing ratings
# Any rating still missing belongs to an anime with no usable user rating,
# so the row carries no rating information and is dropped.
anime = anime.dropna(subset=['rating'])


# STEP 3: remove rows with any other missing value
anime = anime.dropna()





