## What is the correlation between the gender of actors and the popularity of a movie?

In order to find a correlation between the gender of actors and popularity, we first need to define what a 'popular' movie is. This can be done using multiple metrics.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the movie metadata from the local CSV file into a DataFrame.
df_movies = pd.read_csv('movie.csv')

In [None]:
# Interim cleaning strategy: discard every row that has a missing value in any column.
df_movies = df_movies.dropna(axis='index', how='any')
# Show how many rows and columns are left after the drop.
df_movies.shape


In [None]:
# Remove every non-ASCII character from the titles (originally added to fix trailing characters).
# regex=True is required: since pandas 2.0, str.replace treats the pattern as a
# literal string by default, which would turn this cleanup into a silent no-op.
# NOTE(review): this also strips non-ASCII characters inside a title (e.g. accented
# letters) - confirm that is acceptable for the title-based join further down.
df_movies['movie_title'] = df_movies['movie_title'].str.replace(r'[^\x00-\x7F]', '', regex=True)

In [None]:
# Load the credits dataset, which is joined with the movie dataset to obtain the actors' genders.
# Its 'title' column is renamed to 'movie_title' so both frames share the join key.
# NOTE(review): load_tmdb_credits is not defined or imported in the cells visible here -
# confirm it is available before this cell runs on a fresh kernel.
df_credits = (
    load_tmdb_credits('tmdb_5000_credits.csv')
    .rename(columns={'title': 'movie_title'})
)


Both datasets come from the same source, The Movie Database, so we'll join them on the movie title.

In [None]:
# Inner-join movies and credits on the shared 'movie_title' column;
# only titles present in both datasets are kept.
movie_with_cast = df_movies.merge(df_credits, how="inner", on="movie_title")

In [None]:
# cast is a nested field: an iterable of records that each carry 'name' and 'gender' entries.
def actor_to_gender(cast, name):
    """Return the 'gender' entry of the first cast member whose 'name' equals `name`.

    Falls back to 0 when no cast member has that name.
    """
    matching_genders = (member['gender'] for member in cast if name == member['name'])
    return next(matching_genders, 0)
 
# For each of the three actor-name columns, look the actor up in the movie's cast
# and store the resulting gender code in the matching *_gender column.
gender_column_for_name_column = (
    ('actor_1_name', 'actor_1_gender'),
    ('actor_2_name', 'actor_2_gender'),
    ('actor_3_name', 'actor_3_gender'),
)
for name_column, gender_column in gender_column_for_name_column:
    movie_with_cast[gender_column] = movie_with_cast.apply(
        lambda movie: actor_to_gender(movie.cast, movie[name_column]),
        axis=1,
    )


In [None]:
# Gender codes as used in the credits data: 0 = unknown (also the fallback that
# actor_to_gender returns when an actor is not found), 1 = female, 2 = male.
# NOTE(review): code meanings assumed from the TMDB convention - confirm against the TMDB API docs.
gender_labels = {0: 'Unknown', 1: 'Female', 2: 'Male'}

# value_counts() orders slices by frequency, so a fixed positional label list
# mislabels slices whenever the order is not Male > Female > Unknown, and has the
# wrong length when a code is absent. Derive each label from the slice's own code.
for gender_column in ('actor_1_gender', 'actor_2_gender', 'actor_3_gender'):
    gender_counts = movie_with_cast[gender_column].value_counts()
    slice_labels = [gender_labels.get(code, str(code)) for code in gender_counts.index]
    fig, ax = plt.subplots()
    gender_counts.plot(kind='pie', labels=slice_labels, ax=ax)
    # Title each chart so the three figures can be told apart.
    ax.set_title(gender_column)
    plt.show()