# What is the correlation between the gender of the cast and the popularity of a movie?

To find a correlation between the gender of the actors and popularity, we first need to define what a 'popular' movie is.


In [None]:
# imports 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import json
import seaborn as sns; sns.set(color_codes=True)

import holoviews as hv
hv.extension('matplotlib')

In [None]:
%%html
<!-- Pin every table to the left edge of its container: left margin 0, forced with !important. -->
<style>
  table {margin-left: 0 !important;}
</style>

In [None]:
# Read the raw movie table from disk.
df_movies = pd.read_csv('movie.csv')

# Each genres cell is a "|"-separated string; turn it into a list of genre names.
genre_lists = df_movies['genres'].str.split(pat="|")
df_movies['genres'] = genre_lists

# Distinct genre names across all movies (one row per list element, then de-duplicated).
movie_genres = genre_lists.explode().unique()

## What is popularity

There are a couple of fields that can indicate popularity:
- `movie_facebook_likes`
- `num_critic_for_reviews`
- `gross`
- `imdb_score`

In [None]:
# Size of the table before and after discarding every row that has a missing value.
print("Total shape", df_movies.shape)
print("After dropping NAs", df_movies.dropna().shape)

# For each candidate popularity field: size of the subset with a strictly positive value.
for field in ("movie_facebook_likes", "num_critic_for_reviews", "gross"):
    print(f"{field} > 0", df_movies[df_movies[field] > 0].shape)

## Cast and gender
Our current dataset does not contain data about the gender of the cast. We will join it with another dataset from the same source: The Movie Database. First we need to strip the trailing non-ASCII characters from the movie titles so that the titles of both datasets can be matched.




In [None]:
# Fixing trailing characters so the titles can be matched against the credits dataset.
# `regex=True` is passed explicitly: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which silently turns this cleanup into a no-op.
df_movies['movie_title'] = (
    df_movies['movie_title']
    .str.replace('[^\x00-\x7F]', '', regex=True)  # drop every non-ASCII character
    .str.strip()  # drop any plain whitespace left at either end of the title
)

We inner join the two datasets on the movie title, which keeps only the movies that appear in both.

In [None]:
# Credits table; its `title` column is renamed so both frames share the same key name.
df_credits = pd.read_csv('tmdb_5000_credits.csv').rename(columns={'title': 'movie_title'})

# Inner join: only titles present in both tables survive.
movie_with_cast = df_movies.merge(df_credits, on="movie_title", how="inner")
movie_with_cast.shape

The credits dataset adds a column called `cast`. This column contains an array of objects; each object represents an actor/actress.
The gender of each cast member is stored in the `gender` field of the object. The three possible values are:

|Value   | Gender  |
|---|---|
| 0  | Unknown  |
| 1  | Female  |
| 2  | Male  |

Ideally we have one value that represents the share of males and females within the cast of a movie. The first step toward this value is counting, for each movie, how many cast members have each of the three gender values.

In [None]:
# `cast` is a nested JSON field; this helper tallies how many cast members carry each gender code.
def actor_to_gender(cast):
    """Count the cast members of one movie per gender code.

    Parameters
    ----------
    cast : str
        JSON array of cast-member objects. Each object's ``gender`` entry is
        read as the code 0 (unknown), 1 (female) or 2 (male).

    Returns
    -------
    list of int
        ``[unknown, female, male]`` counts. A member whose ``gender`` entry is
        missing, or is not one of the three codes, is counted as unknown.

    Raises
    ------
    json.JSONDecodeError
        If ``cast`` is not valid JSON.
    """
    ratio = [0, 0, 0]
    for actor in json.loads(cast):
        gender = actor.get('gender')
        # Guard the index: a negative code would wrap around and be counted as
        # male, a code above 2 would raise IndexError, and a missing/null code
        # would raise KeyError/TypeError. All of these are tallied as unknown.
        if not (isinstance(gender, int) and 0 <= gender < len(ratio)):
            gender = 0
        ratio[gender] += 1
    return ratio

# Per-movie [unknown, female, male] tallies computed from the raw JSON `cast` column.
movie_with_cast['gender_ratio'] = movie_with_cast['cast'].map(actor_to_gender)

# Spread the three tallies over their own columns; the list position is the gender code.
for code, column in enumerate(('unknown_actors', 'female_actors', 'male_actors')):
    movie_with_cast[column] = movie_with_cast['gender_ratio'].map(
        lambda counts, code=code: counts[code]
    )

# Cast members whose gender is known, i.e. everything except code 0.
movie_with_cast['total_known_actors'] = (
    movie_with_cast['female_actors'] + movie_with_cast['male_actors']
)
movie_with_cast.head(3)

In [None]:
# Per-movie head counts, one series per gender code.
male = movie_with_cast.male_actors
female = movie_with_cast.female_actors
unknown = movie_with_cast.unknown_actors

# Same binning for all three histograms: bin edges at every integer from 0 to 49.
count_bins = range(0, 50, 1)

# One figure per series, each shown on its own.
for title, counts in (
    ("Frequency male", male),
    ("Frequency female", female),
    ("Frequency unknown", unknown),
):
    plt.hist(counts, bins=count_bins)
    plt.title(title)
    plt.show()

In [None]:
# Minimum number of known-gender cast members a movie needs to be kept in `filtered`.
MIN_KNOWN_ACTORS = 20

# Share of male actors among the cast members whose gender is known.
# A movie with no known-gender cast member gets NaN (0 / 0) and is removed by dropna() below.
movie_with_cast['ratio'] = movie_with_cast.male_actors / (movie_with_cast.male_actors + movie_with_cast.female_actors)

# Nullable integer dtype: a plain astype(int) raises as soon as one title_year is
# missing, and this cast runs before the rows with missing values are dropped.
movie_with_cast['title_year'] = movie_with_cast['title_year'].astype('Int64')

# Rows without a missing value in any column.
complete_rows = movie_with_cast.dropna()

# Summary before the cast-size filter. Only the last expression of a cell is rendered,
# so this intermediate table has to be shown explicitly (`display` is the IPython builtin).
display(
    complete_rows[['ratio', 'gender_ratio', 'imdb_score', 'gross', 'movie_facebook_likes', 'num_critic_for_reviews', 'male_actors' , 'female_actors', 'total_known_actors']].describe()
)

# Keep only the movies with enough known-gender cast members.
filtered = complete_rows[complete_rows.total_known_actors >= MIN_KNOWN_ACTORS]

filtered.describe()

In [None]:
# Scatter of imdb_score against the male ratio with seaborn's regression fit drawn on top;
# the trailing semicolon keeps the returned Axes repr out of the cell output.
sns.regplot(
    data=filtered,
    x="ratio",
    y="imdb_score",
);


In [None]:
from sklearn.metrics import r2_score

# Least-squares line: imdb_score ~ slope * ratio + intercept.
slope, intercept = np.polyfit(filtered["ratio"], filtered["imdb_score"], deg=1)
predicted_score = slope * filtered["ratio"] + intercept

# r2_score expects (y_true, y_pred). Passing the raw ratio as y_pred would score the
# ratio itself as if it were a predicted imdb_score, which is meaningless; the
# predictions of the fitted line are what has to be compared with the real scores.
r2_score(filtered["imdb_score"], predicted_score)

In [None]:
def load_genre(genre, **kwargs):
    """Build the ratio-vs-score scatter for the movies tagged with ``genre``.

    Parameters
    ----------
    genre
        Value looked up in each movie's ``genres`` entry of the module-level
        ``filtered`` frame.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    hv.Scatter
        Scatter built from the ``ratio`` and ``imdb_score`` columns of the
        matching rows.
    """
    # Column-wise membership test instead of a row-wise DataFrame.apply: only the
    # genres column is touched, and the explicit bool cast keeps the mask a valid
    # boolean indexer even when `filtered` has no rows.
    has_genre = filtered['genres'].map(lambda tags: genre in tags).astype(bool)
    genre_filtered = filtered[has_genre]
    return hv.Scatter(genre_filtered[['ratio', 'imdb_score']])

# Every genre found in the movie table becomes a selectable value of the `Genre` dimension.
genres = movie_genres

# load_genre is called with the selected Genre value to build the scatter.
genre_map = hv.DynamicMap(load_genre, kdims='Genre')
dmap = genre_map.redim.values(Genre=genres)
dmap

