# Task B: Visualisation of Income, Rating, and Category

The top 99 highest-rated IMDB movies are visualised in this demonstration. We display how the rating of the movies varies depending upon the income of the movie, how many movies from this list were released in each year, and also allow the user to filter the movies based on their categories.

## Instructions

1. All the categories are enabled by default. You can remove those you are not interested in and the plots will reflect your choice. Use the sidebar on the left for this.
2. There is also a slider that changes the height of the charts.
3. Hovering over the points will show related information on tooltips.
4. The scatter plot allows rectangular selection and the bar chart allows interval selection. The brushes selectively highlight on both plots. The selection area can be moved around by dragging. The size can be changed by scrolling on the selected area.
5. Click outside the selection area to deselect and reset the plot.

In [1]:
%%capture
# Code comments are not visible to the end-user
# !pip install altair
# !pip install mercury
# !pip install pandas

In [2]:
# Import the necessary packages
import altair as alt  # For interactive and custom visuals
import pandas as pd   # For working with the dataset
import mercury as mr  # For additional input and filtering capabilities

In [3]:
# We load in the data set from the CSV file (the path is relative to the
# working directory). Later cells read the columns 'gross_total', 'index',
# 'movie_name', 'category' and 'imdb_rating', and drop 'votes',
# 'year_of_release', 'run_time' and 'genre', so the file is expected to
# provide all of them.
movies = pd.read_csv('data/imdb100/movies.csv')
# movies.dtypes

In [4]:
def clean_income(income):
    """
    Strip the currency markers from a raw gross-income value.

    Raw values look like '$134.97M'. The leading '$' and the trailing 'M'
    are removed so that the remainder ('134.97') can later be cast to float.

    Parameters
    ----------
    income : str or float or None
        Raw gross-income value. Any float (which is how a missing cell shows
        up here) or None is treated as "no income recorded".

    Returns
    -------
    str
        The numeric part of the amount as a string, or '0.0' when the value
        is missing.
    """
    # Missing values are not strings and cannot be sliced; report them as zero
    if income is None or isinstance(income, float):
        return '0.0'

    amount = income.strip()
    # Remove each marker only when it is actually present, so a value that
    # lacks one does not silently lose its first or last digit
    if amount.startswith('$'):
        amount = amount[1:]
    if amount.endswith('M'):
        amount = amount[:-1]
    return amount

In [5]:
# Build the 'Income (MUSD)' column by running every raw gross value through
# clean_income, keeping the rows in their existing order
movies['Income (MUSD)'] = [clean_income(raw) for raw in movies['gross_total']]
# movies['Income (MUSD)'] = movies['Income (MUSD)'].astype(float)
# movies.dtypes

In [6]:
# Tidy the frame in a single chain: drop, rename, then fix the dtypes
movies = (
    movies
    # Discard the columns that are not used by the visualisation
    .drop(columns=['votes', 'gross_total', 'year_of_release', 'run_time', 'genre'])
    # Give the remaining columns cleaner, display-friendly names
    .rename(columns={
        'index': 'Rank',
        'movie_name': 'Movie Name',
        'category': 'Category',
        'imdb_rating': 'Rating',
    })
    # Pin explicit dtypes so that type inference downstream is unambiguous
    .astype({
        'Rank': 'int',
        'Movie Name': 'string',
        'Category': 'string',
        'Income (MUSD)': 'float',
        'Rating': 'float',
    })
)
# movies.dtypes

In [7]:
# Extract all individual categories present in the Category column values.
# The list is sorted so that it (and everything derived from it, such as the
# widget choices and the indicator columns) has the same order on every run;
# the iteration order of a set of strings is not stable across interpreter
# sessions. Non-string entries (missing values) are skipped because they
# cannot be ordered against strings and do not name a real category.
categories = sorted(c for c in set(movies['Category']) if isinstance(c, str))
# categories

In [8]:
def apply_category(text, category):
    """
    Report whether a row's category value equals the given category.

    `text` is the value taken from the row, `category` is the category being
    tested; the result of comparing the two for equality is returned.
    """
    is_match = (category == text)
    return is_match

# Add one indicator column per category: the column named after a category
# holds, for every row, the result of comparing that row's Category to it
for category in categories:
    movies[category] = movies['Category'].apply(
        lambda text: apply_category(text, category=category)
    )

# movies.head(20)

In [9]:
# Mercury application settings for this notebook.
# NOTE(review): the exact effect of each flag is defined by Mercury's App
# API - confirm against the installed version.
app = mr.App(
    description="A visualisation with interactive plots showing the relationship between income, rating, and category.",
    # Layout: full-screen view with the sidebar enabled
    full_screen=True,
    show_sidebar=True,
    # Code and prompts are switched off
    show_code=False,
    show_prompt=False,
    # Runtime flags
    continuous_update=True,
    static_notebook=False,
    allow_download=False,
)

In [10]:
# Chart sizing: the width is fixed, the height comes from a slider widget
chart_width = 750
chart_height = mr.Slider(
    label="Chart height",
    value=450,
    min=450,
    max=700,
    step=20,
)

mercury.Slider

In [11]:
# Multi-select widget used to filter by category; every category is offered
# as a choice and all of them start out selected
selected_categories = mr.MultiSelect(
    label="Select category(s)",
    choices=list(categories),
    value=list(categories),
)

mercury.MultiSelect

In [12]:
# selected_categories.value

In [13]:
# Flag the movies that belong to at least one of the selected categories.
# Start from an explicit all-False mask aligned with the frame's index
# (instead of a rank threshold that no row is merely expected to exceed),
# then OR in the indicator column of every selected category.
indices = pd.Series(False, index=movies.index, dtype=bool)
for g in selected_categories.value:
    indices |= movies[g]
# Every row defaults to not highlighted; only the masked rows are switched on
movies['Highlight'] = False
movies.loc[indices, 'Highlight'] = True
# movies.head(20)

In [14]:
# Keep a filtered copy holding only the rows whose 'Highlight' flag is set,
# i.e. the movies that matched the selected categories
filtered_movies = movies.loc[movies['Highlight']]
# filtered_movies

In [15]:
# Build the interactive charts: a scatter plot of rating vs. income with a
# mean-income rule layered on top and, when at least one category is
# selected, a second chart of mean income per rating underneath.

# The information we want to display on tooltips
tooltip_cols = ['Movie Name', 'Rank', 'Category', 'Rating','Income (MUSD)']

# The selection mechanism on the graphs: an interval brush restricted to the
# x encoding
brush = alt.selection_interval(encodings=['x'])
# single = alt.selection_single()

# The graph opacity condition for the brush selection: 0.9 where the brush
# condition holds, 0.1 elsewhere
# opacity = alt.condition(brush | single, alt.value(0.9), alt.value(0.1))
opacity = alt.condition(brush, alt.value(0.9), alt.value(0.1))

# The chart for scatter plot of the ratings of the movies and their income.
# It is built from the full `movies` frame, so non-highlighted movies are
# drawn too (in grey, see the colour condition below).
points = alt.Chart(movies).mark_point(filled=True).encode(
    x=alt.X('Rating',
        scale=alt.Scale(zero=False, padding=8),
    ),
    y=alt.Y('Income (MUSD)',
        scale=alt.Scale(zero=False, padding=25),
        
    ),
    tooltip=tooltip_cols,
    # Rows whose 'Highlight' flag is set are coloured by rank; the rest are grey
    color=alt.condition(
        alt.datum['Highlight'],
        alt.Color('Rank:Q',
            scale=alt.Scale(scheme='yellowgreenblue')
        ),
        
        alt.value('grey')
    ),
    size='Rating',
    opacity=opacity
).add_selection(
    brush,
).properties(
    width=chart_width,
    height=chart_height.value
)

# A rule placed at the mean income; the data is filtered by the brush, so the
# mean is taken over the brushed movies
line = alt.Chart(movies).mark_rule(color='firebrick').encode(
#     x=alt.X('Rating',
#         scale=alt.Scale(zero=False, padding=8),
#     ),
    y=alt.Y('mean(Income (MUSD)):Q'),
    size=alt.SizeValue(3)
).transform_filter(
    brush,
).properties(
    width=chart_width,
    height=chart_height.value
)



# The second chart (mean income against rating, over the filtered movies) is
# only built when at least one category is selected; `average` is left
# undefined otherwise, which the guard on `final` below accounts for
if selected_categories.value:
    average = alt.Chart(filtered_movies).mark_line().encode(
        alt.X('Rating',
            scale=alt.Scale(padding=8),
        ),
        alt.Y('mean(Income (MUSD)):Q',
#             scale=alt.Scale(padding=25)
        ),
#         tooltip=tooltip_cols,
        color=alt.value('purple'),   
        opacity=opacity,
    ).add_selection(
        brush,
    ).properties(
        width=chart_width,
        height=chart_height.value
    )

# Layer the rule over the scatter plot (`+`), then vertically stack the
# result with the second chart (`&`) when that chart exists
top = (points + line)
final = (top & average) if selected_categories.value else top
# Last expression of the cell: the configured chart is the cell's output
final.configure_axis(
    labelFontSize=10,
    titleFontSize=20
)


In [16]:
# Show the filtered movies as a table indexed by rank. When no category is
# selected the table is not built and the cell evaluates to None instead.
if selected_categories.value:
    display_movies = filtered_movies.set_index('Rank')
(
    None
    if not selected_categories.value
    else display_movies[['Movie Name', 'Category', 'Rating','Income (MUSD)']]
)

Unnamed: 0_level_0,Movie Name,Category,Rating,Income (MUSD)
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,The Godfather,R,9.2,134.97
2,The Silence of the Lambs,R,8.6,130.74
3,Star Wars: Episode V - The Empire Strikes Back,PG,8.7,290.48
4,The Shawshank Redemption,R,9.3,28.34
5,The Shining,R,8.4,44.02
...,...,...,...,...
95,The Usual Suspects,R,8.5,23.34
96,Cool Hand Luke,GP,8.1,16.22
97,Eternal Sunshine of the Spotless Mind,R,8.3,34.40
98,City Lights,G,8.5,0.02


Made with ❤️ by Group 18.

February 2023