In [4]:
import pandas as pd
from collections import Counter

def map_chunk(df_chunk, genres_col="Genres"):
    counter = Counter()
    col = df_chunk[genres_col].fillna("")
    for genre_raw in col:
        if not genre_raw:
            continue
        parts = str(genre_raw).replace(";", ",").split(",")
        for genre in (part.strip() for part in parts):
            if genre:
                counter[genre] += 1
    return counter

def shuffle(mapped_counters):
    total = Counter()
    for counter in mapped_counters:
        total.update(counter)
    return total

def reduce(shuffled_counter):
    return shuffled_counter.most_common()

def mapReduce(file_path, chunksize=200_000, genres_col="Genres", anime_col="anime_id"):
    mapped = []
    seen_global = set()  

    for chunk in pd.read_csv(
        file_path,
        usecols=[genres_col, anime_col],
        dtype={genres_col: "string", anime_col: "string"},  
        chunksize=chunksize,
        low_memory=True,
        engine="c"
    ):
        chunk = chunk.dropna(subset=[anime_col])
        mask_new = ~chunk[anime_col].isin(seen_global)
        chunk_new = chunk.loc[mask_new]

        if not chunk_new.empty:
            mapped.append(map_chunk(chunk_new, genres_col))
            seen_global.update(chunk_new[anime_col].unique())

    shuffled = shuffle(mapped)
    result = reduce(shuffled)
    return result

def main():
    file_path = r"C:\Users\Rodion\Documents\BigDataPy\final_anime_ratings.csv"

    result = mapReduce(file_path)

    print("ТОП-10 жанров по количеству уникальных аниме:")
    for genre, count in result[:10]:
        print(f"{genre}: {count}")

    if result:
        top_genre, top_count = result[0]
        print(f"\nСамый частый жанр: {top_genre} — {top_count} уникальных тайтлов")

if __name__ == "__main__":
    main()

ТОП-10 жанров по количеству уникальных аниме:
Comedy: 88954
Action: 68298
Fantasy: 46079
Drama: 44535
Romance: 42569
Sci-Fi: 38442
Shounen: 38381
School: 37752
Adventure: 37223
Supernatural: 34326

Самый частый жанр: Comedy — 88954 уникальных тайтлов
