## Install Dependencies

In [None]:
# pip install pandas

In [None]:
# pip install fastparquet

## Data Visualization

In [None]:
# pip install pyarrow

In [None]:
# pip install requests


In [None]:
# import pandas as pd

# animes_df = pd.read_csv('dataset/anime-dataset-2023.csv')
# ratings_df = pd.read_csv('dataset/users-score-2023.csv')
# animes_df.to_parquet('dataset/animes.parquet')
# ratings_df.to_parquet('dataset/user_ratings.parquet')

In [None]:
# import pandas as pd

# animes_df = pd.read_parquet('dataset/animes.parquet')
# ratings_df = pd.read_parquet('dataset/user_ratings.parquet')

In [None]:
import pandas as pd

# Load the per-user ratings from the parquet file.
ratings_df = pd.read_parquet('dataset/user_ratings.parquet')

# Load both anime CSVs; animes_df2 is only used further down as a source of
# replacement 'Genres' values and is deleted afterwards.
animes_df = pd.read_csv("dataset/anime-dataset-2023.csv")
animes_df2 = pd.read_csv("dataset/anime-filtered.csv")

In [None]:
animes_df.shape

In [None]:
animes_df2.shape

In [None]:
# Prefer the 'Genres' value from the filtered dataset when one exists and
# fall back to the value already in animes_df otherwise.
# Keep a single row per anime_id in the lookup so the left merge cannot
# multiply rows of animes_df if the filtered dataset repeats an id.
df2_genres = animes_df2[['anime_id', 'Genres']].drop_duplicates(subset='anime_id')
animes_df = pd.merge(animes_df, df2_genres, on='anime_id', how='left', suffixes=('', '_df2'))
animes_df['Genres'] = animes_df['Genres_df2'].combine_first(animes_df['Genres'])
# The helper column is no longer needed once the values are combined.
animes_df = animes_df.drop(columns=['Genres_df2'])

In [None]:
import gc

# Drop the references to the temporary frames and run a collection pass so
# their memory can be reclaimed before continuing.
del df2_genres, animes_df2
gc.collect()

In [None]:
animes_df.info()

In [None]:
ratings_df.info()

In [None]:
animes_df.head()

In [None]:
# Columns kept for the rest of the notebook; every other column is dropped.
kept_columns = [
    'anime_id', 'Name', 'Score', 'Genres', 'Synopsis', 'Type', 'Aired',
    'Premiered', 'Producers', 'Licensors', 'Studios', 'Rating', 'Members',
    'Image URL',
]
animes_df = animes_df[kept_columns]
animes_df

In [None]:
animes_df.info()

In [None]:
ratings_df.head()

## Data Cleaning

#### Anime dataset

In [None]:
# Check for empty values
animes_df.isna().sum()

In [None]:
# Check for duplicates
animes_df.duplicated().sum()

In [None]:
# Per-column count of cells that hold the placeholder string "UNKNOWN".
is_unknown_cell = animes_df.eq("UNKNOWN")
unknown_counts = is_unknown_cell.sum()
print(unknown_counts)


In [None]:
animes_df.shape

In [None]:
# Collect every distinct genre label from the comma-separated 'Genres' column.
genre_labels = set()
for genre_list in animes_df["Genres"].dropna():
    genre_labels.update(label.strip() for label in genre_list.split(","))

unique_genres = sorted(genre_labels)

print(unique_genres)
print(f"Total unique genres: {len(unique_genres)}")


In [None]:
# Remove titles whose 'Genres' mention any excluded label (case-insensitive);
# rows with a missing 'Genres' value are kept.
excluded_genre_pattern = 'Yaoi|Erotica|Boys Love|Girls Love|Hentai|UNKNOWN'
has_excluded_genre = animes_df['Genres'].str.contains(excluded_genre_pattern, na=False, case=False)
animes_df = animes_df[~has_excluded_genre]

In [None]:
# Recompute the distinct genre labels from the current animes_df.
unique_genres = sorted({
    label.strip()
    for genre_list in animes_df["Genres"].dropna()
    for label in genre_list.split(",")
})

print(unique_genres)
print(f"Total unique genres: {len(unique_genres)}")

In [None]:
animes_df.shape

In [None]:
animes_df

In [None]:
# Keep only titles with more than 100 members.
has_enough_members = animes_df['Members'] > 100
animes_df = animes_df[has_enough_members]

In [None]:
animes_df.shape

In [None]:
# Drop titles whose 'Score' contains the "UNKNOWN" placeholder.
score_is_unknown = animes_df['Score'].str.contains('UNKNOWN')
animes_df = animes_df[~score_is_unknown]

In [None]:
animes_df.shape

In [None]:
animes_df

In [None]:
# Count the remaining "UNKNOWN" placeholders in each column.
unknown_mask = animes_df.eq("UNKNOWN")
unknown_counts = unknown_mask.sum()
print(unknown_counts)


In [None]:
animes_df[animes_df['Rating'] == 'UNKNOWN']

#### User ratings

In [None]:
ratings_df = ratings_df.rename(columns={"Anime Title": "anime_name"})

In [None]:
ratings_df.shape

In [None]:
ratings_df['user_id'].nunique()

In [None]:
ratings_df.isna().sum()

In [None]:
test = ratings_df[ratings_df['Username'].isna()]
test

In [None]:
test[test['user_id'] == 20930]

In [None]:
# Set the Username for every row of user_id 20930, the user inspected above
# among the rows with a missing Username.
# NOTE(review): the value 'KJYit' is hard-coded; its source is not shown here.
ratings_df.loc[ratings_df['user_id'] == 20930, 'Username'] = 'KJYit'


In [None]:
ratings_df[ratings_df['user_id'] == 20930]

In [None]:
ratings_df.isna().sum()

In [None]:
ratings_df.duplicated().sum()

In [None]:
ratings_df.shape

#### Keep only rows whose anime_id exists in both the anime and ratings DataFrames

In [None]:
animes_df.shape

In [None]:
# Keep only anime that appear at least once in ratings_df.
has_ratings = animes_df['anime_id'].isin(ratings_df['anime_id'])
animes_df = animes_df[has_ratings]

In [None]:
animes_df.shape

In [None]:
ratings_df.shape

In [None]:
# Keep only ratings whose anime_id is present in animes_df.
refers_to_known_anime = ratings_df['anime_id'].isin(animes_df['anime_id'])
ratings_df = ratings_df[refers_to_known_anime]

In [None]:
ratings_df.shape

In [None]:
ratings_df[ratings_df['anime_id'] == 20]

In [None]:
animes_df[animes_df['anime_id'] == 20]

In [None]:
# Compare, for every anime_id in animes_df, the first 'Name' recorded in
# animes_df with the first 'anime_name' recorded in ratings_df.
#
# Both lookups are built in a single pass instead of re-filtering the two
# frames once per id, so no per-id progress output is needed.

# First title per anime_id, in order of first appearance in animes_df.
anime_names = animes_df.drop_duplicates(subset='anime_id').set_index('anime_id')['Name']

# First title per anime_id as it appears in the ratings.
rating_names = (
    ratings_df[['anime_id', 'anime_name']]
    .drop_duplicates(subset='anime_id')
    .set_index('anime_id')['anime_name']
)

# Align the ratings titles to the anime ids. An id without any rating becomes
# NaN here and is reported as a mismatch instead of raising an IndexError.
rating_names = rating_names.reindex(anime_names.index)

# Element-wise equality; a missing value on either side never matches.
is_match = anime_names.eq(rating_names)

anime_ids = anime_names.index.tolist()
counterTrue = int(is_match.sum())
counterFalse = int((~is_match).sum())
mismatch_ids = is_match.index[~is_match].tolist()  # ids whose titles differ

print("Matches (True):", counterTrue)
print("Mismatches (False):", counterFalse)
print("Mismatch IDs:", mismatch_ids)


In [None]:
test1 = str(animes_df[animes_df['anime_id'] == 53367].Name)

In [None]:
test2 = str(ratings_df[ratings_df['anime_id'] == 53367].head(1).anime_name)

In [None]:
if test1 == test2:
    print("Yes")
else:
    print("No")

print(test1)
print(test2)

### Check if images are available

In [None]:
import requests

# Define the function to check the URL
def check_image_url(url):
    """Return True when ``url`` answers a HEAD request with an image.

    Args:
        url: Address to probe.

    Returns:
        True if the final response has status code 200 and a Content-Type
        header containing "image" (case-insensitive); False otherwise,
        including when the request raises ``requests.RequestException``.
    """
    try:
        # HEAD avoids downloading the image body. requests does not follow
        # redirects for HEAD unless asked to, so enable it explicitly;
        # otherwise a valid image behind a 301/302 is reported as bad.
        response = requests.head(url, timeout=5, allow_redirects=True)
    except requests.RequestException:
        return False

    content_type = response.headers.get('Content-Type', '')
    return response.status_code == 200 and 'image' in content_type.lower()

In [None]:
# Check the image URL of every row in order, printing the result as we go.
# Iterating the needed columns directly avoids building a Series per row the
# way iterrows() does.
results = []
for idx, name, url in zip(animes_df.index, animes_df['Name'], animes_df['Image URL']):
    result = check_image_url(url)
    results.append(result)
    print(f"Row {idx}: Name: {name}, URL: {url}, Image Good: {result}")

# animes_df is a filtered selection of an earlier frame at this point, so
# build a new frame with the extra column instead of assigning into the
# selection (avoids pandas' SettingWithCopyWarning).
animes_df = animes_df.assign(**{'Image Good': results})

# Filter to get rows with bad images (where 'Image Good' is False) and print them
bad_images_sample = animes_df[~animes_df['Image Good']]
print("\nRows with bad image URLs:")
print(bad_images_sample[['Name', 'Image URL']])

In [None]:
# Replacement image URLs, keyed by the exact 'Name' value of the title.
replacement_image_urls = {
    'Air': 'https://cdn.myanimelist.net/images/anime/1825/146531.jpg',
    'Samurai Champloo': 'https://cdn.myanimelist.net/images/anime/1370/135212.jpg',
    'Pokemon': 'https://cdn.myanimelist.net/images/anime/1787/140239.jpg',
    'Dragon Ball Z': 'https://cdn.myanimelist.net/images/anime/1277/142022.jpg',
    'Shin Chou Kyou Ryo: Condor Hero': 'https://cdn.myanimelist.net/images/anime/1698/139204.jpg',
    'Kimagure Orange☆Road': 'https://cdn.myanimelist.net/images/anime/1960/142715.jpg',
    'Macross Plus Movie Edition': 'https://cdn.myanimelist.net/images/anime/1487/133799.jpg',
    'Gall Force: The Revolution': 'https://cdn.myanimelist.net/images/anime/1078/95285.jpg',
    'Jibaku-kun': 'https://cdn.myanimelist.net/images/anime/1630/143498.jpg',
    # NOTE(review): same URL as 'Samurai Champloo' above - possibly a
    # copy-paste slip; confirm the intended image.
    'One Piece: Jango no Dance Carnival': 'https://cdn.myanimelist.net/images/anime/1370/135212.jpg',
    'Super Kuma-san': 'https://cdn.myanimelist.net/images/anime/1264/146204.jpg',
    'Dorami-chan: Wow, The Kid Gang of Bandits': 'https://cdn.myanimelist.net/images/anime/1921/146378.jpg',
    'Doraemon Movie 05: Nobita no Makai Daibouken': 'https://cdn.myanimelist.net/images/anime/2/72410.jpg',
    'Yondemasu yo, Azazel-san. (TV)': 'https://cdn.myanimelist.net/images/anime/4/75284.jpg',
    'Brave 10': 'https://cdn.myanimelist.net/images/anime/1704/143834.jpg',
    'Chibi☆Devi!': 'https://cdn.myanimelist.net/images/anime/6/35927.jpg',
    'Recorder to Randoseru Do♪': 'https://cdn.myanimelist.net/images/anime/13/33003.jpg',
    'Pokemon Best Wishes! Season 2': 'https://cdn.myanimelist.net/images/anime/1904/140254.jpg',
    'Kamisama no Inai Nichiyoubi': 'https://cdn.myanimelist.net/images/anime/2/52127.jpg',
    'Pokemon Best Wishes! Season 2: Episode N': 'https://cdn.myanimelist.net/images/anime/1922/140256.jpg',
    'SoniAni: Super Sonico The Animation': 'https://cdn.myanimelist.net/images/anime/3/56987.jpg',
    'Sanzoku no Musume Ronja': 'https://cdn.myanimelist.net/images/anime/11/68125.jpg',
    'The iDOLM@STER Cinderella Girls': 'https://cdn.myanimelist.net/images/anime/1345/144832.jpg',
    'Yoru no Yatterman': 'https://cdn.myanimelist.net/images/anime/11/71773.jpg',
    'Motion Lumine': 'https://cdn.myanimelist.net/images/anime/9/72634.jpg',
    'Makura no Danshi': 'https://cdn.myanimelist.net/images/anime/11/75142.jpg',
    'ClassicaLoid': 'https://cdn.myanimelist.net/images/anime/2/88678.jpg',
    'Sushi Police': 'https://cdn.myanimelist.net/images/anime/13/76236.jpg',
    'Active Raid: Kidou Kyoushuushitsu Dai Hachi Gakari': 'https://cdn.myanimelist.net/images/anime/3/77977.jpg',
    'Bubuki Buranki': 'https://cdn.myanimelist.net/images/anime/7/79758.jpg',
    'Norn9: Norn+Nonet - Unmei no Megami': 'https://cdn.myanimelist.net/images/anime/8/78965.jpg',
    'Magic-Kyun! Renaissance': 'https://cdn.myanimelist.net/images/anime/7/80828.jpg',
    'Atom: The Beginning': 'https://cdn.myanimelist.net/images/anime/6/86607.jpg',
    'Soul Buster': 'https://cdn.myanimelist.net/images/anime/3/82256.jpg',
    'Meiji Tokyo Renka': 'https://cdn.myanimelist.net/images/anime/1280/145923.jpg',
    'Room Mate': 'https://cdn.myanimelist.net/images/anime/10/85196.jpg',
    'Clione no Akari': 'https://cdn.myanimelist.net/images/anime/3/84706.jpg',
    'Sengoku Night Blood': 'https://cdn.myanimelist.net/images/anime/4/88329.jpg',
    'Shoujo☆Kageki Revue Starlight': 'https://cdn.myanimelist.net/images/anime/1165/93552.jpg',
    'Sora to Umi no Aida': 'https://cdn.myanimelist.net/images/anime/1297/97156.jpg',
    'RobiHachi': 'https://cdn.myanimelist.net/images/anime/1555/100447.jpg',
    'SD Gundam World: Sangoku Souketsuden': 'https://cdn.myanimelist.net/images/anime/1491/102275.jpg',
    'Lion meets HachiClo': 'https://cdn.myanimelist.net/images/anime/1565/97675.jpg',
    'Beastars': 'https://cdn.myanimelist.net/images/anime/1713/145599.jpg',
    'Keishichou Tokumubu Tokushu Kyouakuhan Taisakushitsu Dainanaka: Tokunana OVA': 'https://cdn.myanimelist.net/images/anime/1156/146362.jpg',
    'Xue Ying Ling Zhu 2nd Season': 'https://cdn.myanimelist.net/images/anime/1270/138692.jpg',
    'Heikousen': 'https://cdn.myanimelist.net/images/anime/1146/142141.jpg',
    'Tunshi Xingkong 2nd Season': 'https://cdn.myanimelist.net/images/anime/1558/117008.jpg',
    'Voy@ger': 'https://cdn.myanimelist.net/images/anime/1280/142023.jpg',
    'Skip to Loafer': 'https://cdn.myanimelist.net/images/anime/1518/138730.jpg',
    'Forever Rain': 'https://cdn.myanimelist.net/images/anime/1539/137988.jpg',
    'Muv-Luv Alternative 2nd Season': 'https://cdn.myanimelist.net/images/anime/1253/127326.jpg',
    'Shen Mu': 'https://cdn.myanimelist.net/images/anime/1965/138663.jpg',
    'Romantic Killer': 'https://cdn.myanimelist.net/images/anime/1764/142001.jpg',
    'Tom to Jerry (2022)': 'https://cdn.myanimelist.net/images/anime/1653/146834.jpg',
}

# Apply the overrides in the listed order; every row whose 'Name' equals the
# key receives the new URL.
for title, image_url in replacement_image_urls.items():
    animes_df.loc[animes_df['Name'] == title, 'Image URL'] = image_url

In [None]:
bad_images_sample[['Name', 'Image URL']]

In [None]:
bad_images_sample.shape

In [None]:
# Re-check the image URL of the rows named 'Air'. The variable name still
# mentions Samurai Champloo, but the filter below selects 'Air'.
samurai_url_status = animes_df[animes_df['Name'] == 'Air']['Image URL'].apply(check_image_url)
print(samurai_url_status)


### After checking 8 of the mismatched IDs, it can be concluded that they are actually the same shows; they were flagged as different only because of symbols in the titles, resulting in false negatives

In [None]:
# Persist the cleaned frames as parquet files.
# NOTE(review): animes_df still carries the 'Image Good' helper column added
# by the image check; confirm it is wanted in the exported file.
animes_df.to_parquet('dataset/cleaned_animes.parquet')
ratings_df.to_parquet('dataset/cleaned_user_ratings.parquet')