In [1]:
import pandas as pd

# Specify the path to your movie dataset CSV file
file_path = '16k_Movies.csv'

# Read the CSV file into a DataFrame
movies = pd.read_csv(file_path)

# Clean up any leading/trailing whitespace in string columns
movies.columns = movies.columns.str.strip()  # Remove any whitespace from the column names
movies = movies.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Check if 'Rating' and 'Genres' columns exist and are correctly named
if 'Rating' not in movies.columns or 'Genres' not in movies.columns:
    print("Error: Required columns 'Rating' or 'Genres' not found in the dataset. Please check the column names.")
else:
    # Convert the 'Rating' column to numeric, handling any errors by coercing invalid values to NaN
    movies['Rating'] = pd.to_numeric(movies['Rating'], errors='coerce')

    # Split the 'Genres' column into individual genres
    movies['Genres'] = movies['Genres'].apply(lambda x: x.split('|') if pd.notna(x) else [])

    # Explode the 'Genres' column to have one genre per row per movie
    movies_exploded = movies.explode('Genres')

    # Group by 'Genres' and calculate the average rating per genre
    genre_data = movies_exploded.groupby('Genres').agg(
        average_rating=pd.NamedAgg(column='Rating', aggfunc='mean')
    ).reset_index()

    # Find the genre with the highest average rating
    highest_avg_rating_genre = genre_data.loc[genre_data['average_rating'].idxmax()]

    # Output the results
    print("Average rating by genre:")
    print(genre_data)
    print("\nGenre with the highest average rating:")
    print(highest_avg_rating_genre['Genres'], "with an average rating of:", highest_avg_rating_genre['average_rating'])


Average rating by genre:
                                                 Genres  average_rating
0                                                Action        5.825000
1                                      Action,Adventure        6.854545
2     Action,Adventure,Biography,Crime,Drama,History...        6.400000
3                      Action,Adventure,Biography,Drama             NaN
4              Action,Adventure,Biography,Drama,History        6.800000
...                                                 ...             ...
1658                                      Unknown,Drama        6.800000
1659               Unknown,Drama,Family,Fantasy,Musical             NaN
1660                                                War             NaN
1661                           War,Drama,Action,Romance        4.400000
1662                                            Western             NaN

[1663 rows x 2 columns]

Genre with the highest average rating:
Adventure,Drama,Mystery,Romance,Thriller with 