In [None]:
import matplotlib.pyplot as plt
import geopandas as gpd
import numpy as np
import pandas as pd
import geodatasets
from matplotlib import colormaps
from geopandas import GeoDataFrame
import folium
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes

# Country polygons; the NAME column is the key the movie tables are matched against.
gdf = gpd.read_file('./ne_110m_admin_0_countries/ne_110m_admin_0_countries.shp')

# Rows of the shapefile whose NAME is exactly 'Italy'.
italy = gdf.loc[gdf['NAME'].eq('Italy')]

# Cleaned dataset tables, each indexed by its `id` column.
movies_df = pd.read_csv('../../dataset/cleaned/movies.csv', index_col='id')
countries_df = pd.read_csv('../../dataset/cleaned/countries.csv', index_col='id')
languages_df = pd.read_csv('../../dataset/cleaned/languages.csv', index_col='id')
genres_df = pd.read_csv('../../dataset/cleaned/genres.csv', index_col='id')
actors_df = pd.read_csv('../../dataset/cleaned/actors.csv', index_col='id')
crew_df = pd.read_csv('../../dataset/cleaned/crew.csv', index_col='id')



In [None]:
# Display the loaded shapefile table to inspect the available columns
# (later cells use NAME, SUBREGION, CONTINENT, POP_EST, GDP_MD, LABEL_X and LABEL_Y).
gdf

Replace country names in the dataset so that they match the `NAME` column of the shapefile exactly.

In [None]:
# Dataset spelling -> spelling used in the shapefile's NAME column.
# Names that already match are not listed, since mapping a name to itself does nothing.
COUNTRY_NAME_FIXES = {
    "UK": "United Kingdom",
    "USA": "United States of America",
    "Russian Federation": "Russia",
    "Democratic Republic of Congo": "Dem. Rep. Congo",
    "Bolivarian Republic of Venezuela": "Venezuela",
    "South Sudan": "S. Sudan",
    "Central African Republic": "Central African Rep.",
    "Ivory Coast": "Côte d'Ivoire",
    "United Republic of Tanzania": "Tanzania",
    "Bosnia and Herzegovina": "Bosnia and Herz.",
    "Republic of Moldova": "Moldova",
    "Syrian Arab Republic": "Syria",
    "Lao People's Democratic Republic": "Laos",
    "Dominican Republic": "Dominican Rep.",
}

# Single pass over the frame instead of one in-place pass per country.
# Whole-cell values equal to a key are replaced in every column, as before.
countries_df = countries_df.replace(COUNTRY_NAME_FIXES)
countries_df

In [None]:
# Number of rows of countries_df per country name, as a two-column frame
# (`country`, `movie_counts`).
movies_by_country = (
    countries_df
    .groupby('country')
    .size()
    .reset_index(name='movie_counts')
)

In [None]:
# Inner join on the country name: only countries present in both the
# shapefile and the counts table are kept.
movies_by_country_map = pd.merge(
    left=gdf,
    right=movies_by_country,
    how='inner',
    left_on='NAME',
    right_on='country',
)

In [None]:
# Countries ordered from the highest to the lowest count.
movies_by_country.sort_values(by='movie_counts', ascending=False)

Movies produced by country
###### It is possible to see that almost every country in the world produced fewer than 17000 movies. The highest movie counts come from countries with a more developed movie production culture, like the USA, France, the UK and India.

In [None]:
# World choropleth coloured by the per-country movie count.
world_ax = movies_by_country_map.plot(
    column='movie_counts',
    cmap='tab20',
    legend=True,
    legend_kwds={'label': 'Movie Counts'},
    edgecolor='black',
    linewidth=1,
    figsize=(20, 10),
)
world_ax.set_title('Movie By Country')
plt.show()

Focus on Europe

In [None]:
# Subregion labels (SUBREGION column) that together make up Europe.
EUROPEAN_SUBREGIONS = [
    'Southern Europe',
    'Northern Europe',
    'Eastern Europe',
    'Western Europe',
]
europe = movies_by_country_map[movies_by_country_map['SUBREGION'].isin(EUROPEAN_SUBREGIONS)]

# The figure size is set here; `figsize` is not passed to `.plot` because it
# is ignored whenever an explicit `ax` is given.
fig, ax = plt.subplots(figsize=(10, 10))

# Fix the lon/lat window before plotting so the view stays cropped to this box.
ax.set_xlim(-30, 50)
ax.set_ylim(25, 75)

europe.plot(
    column='movie_counts',
    ax=ax,
    cmap='tab20',
    legend=True,
    legend_kwds={'label': 'Movie Counts'},
    edgecolor='black',
    linewidth=1,
)
ax.set_title('Movie in Europe')
plt.show()

Focus on North America

In [None]:
# Countries whose CONTINENT is 'North America'.
america = movies_by_country_map[movies_by_country_map['CONTINENT'] == 'North America']

# The figure size is set here; `figsize` is not passed to `.plot` because it
# is ignored whenever an explicit `ax` is given.
fig, ax = plt.subplots(figsize=(10, 10))

# Fix the lon/lat window before plotting so the view stays cropped to this box.
ax.set_xlim(-150, -50)
ax.set_ylim(20, 75)

america.plot(
    column='movie_counts',
    ax=ax,
    cmap='tab20',
    legend=True,
    legend_kwds={'label': 'Movie Counts'},
    edgecolor='black',
    linewidth=1,
)
ax.set_title('Movie in North America')
plt.show()

In [None]:
# Interactive version of the world choropleth (hover tooltips disabled).
movies_by_country_map.explore(
    column='movie_counts',
    cmap='turbo',
    legend=True,
    legend_kwds={'label': 'Movie Counts'},
    edgecolor='black',
    linewidth=1,
    tooltip=False,
)

In [None]:
# Attach genres and countries to every movie (index-on-index joins), then
# count the rows for each (country, genre) pair.
movies_genres_country = (
    movies_df
    .join(genres_df)
    .join(countries_df)
    .groupby(['country', 'genre'])
    .size()
    .reset_index(name='movie_counts')
)
movies_genres_country

In [None]:
# Attach every (country, genre) count to its country row, which carries the
# LABEL_X / LABEL_Y coordinates used to place the markers.
movies_genres_country_map = pd.merge(gdf, movies_genres_country, left_on='NAME', right_on='country', how='inner')
movies_genres_country_map = movies_genres_country_map[~movies_genres_country_map.duplicated(subset=['country','genre'])]

m = folium.Map(location=[0, 0], zoom_start=4)

# One fixed colour per genre, so the same genre looks the same in every
# country. Colouring by row position, as before, made the colour meaningless.
# The palette has six entries, so genres beyond the sixth reuse a colour;
# extend the list if every genre must be distinguishable.
colors=['red','green','blue','gray','purple','brown']
genre_names = sorted(movies_genres_country_map['genre'].unique())
genre_colors = {
    genre: colors[position % len(colors)]
    for position, genre in enumerate(genre_names)
}

# One circle per (country, genre), centred on the country's label point.
# The radius grows linearly with the movie count.
for _, row in movies_genres_country_map.iterrows():
    folium.CircleMarker(
        location=[row['LABEL_Y'], row['LABEL_X']],
        radius=row['movie_counts']/150,
        popup=f"{row['genre']}",
        fill=False,
        color=genre_colors[row['genre']],
    ).add_to(m)

m

### Number of movies in relation with estimated population

###### It is possible to see that some countries have underperformed in movie production over the years. This could be caused by a number of reasons, such as a poor economy. This trend is mainly localized in Africa, India, China, the Middle East and South East Asia (high number of movies = low ratio).

In [None]:
# Estimated population divided by the movie count: a low value means many
# movies relative to the population.
movies_by_country_map['movie_population'] = (
    movies_by_country_map['POP_EST'].div(movies_by_country_map['movie_counts'])
)

population_ax = movies_by_country_map.plot(
    column='movie_population',
    cmap='tab20',
    legend=True,
    legend_kwds={'label': 'movie_population'},
    edgecolor='black',
    linewidth=1,
    figsize=(20, 10),
)
population_ax.set_title('Movie By Country in relation with estimated population')
plt.show()

### Number of movies in relation with gdp (2019)

###### It is possible to see that movie production does not depend on GDP in every situation. We can see, for example, that China produced fewer movies than Italy despite the enormous difference in GDP. African countries have a low ratio, which means that they produced a good number of movies in relation to their GDP. Movie production in the most developed countries in terms of GDP performs as expected.

In [None]:
# Restrict to movies whose `date` is 2019, the year named in the section heading.
movies_2019 = movies_df[movies_df['date'] == 2019].copy()

# Join the 2019 subset with the countries table. Joining the full `movies_df`
# here would silently count movies from every year.
movies_2019_country = movies_2019.join(countries_df)

movies_2019_by_country = movies_2019_country.groupby('country').size().reset_index(name='movie_counts')

movies_2019_by_country_map = pd.merge(gdf, movies_2019_by_country, left_on='NAME', right_on='country', how='inner')

# Shapefile GDP_MD column divided by the 2019 movie count
# (presumably GDP in millions of USD; confirm against the shapefile docs).
# A high number of movies gives a low ratio.
movies_2019_by_country_map['movie_gdp'] = movies_2019_by_country_map['GDP_MD']/movies_2019_by_country_map['movie_counts']
movies_2019_by_country_map.plot(column='movie_gdp',legend=True, cmap='tab20', edgecolor='black', linewidth=1, legend_kwds={'label':'GDP_MD per 2019 movie'}, figsize=(20,10))
plt.title('Movie By Country in relation with Gross domestic product')
plt.show()
#(high number of movies = low ratio)

### avg number of actors in a movie produced in a certain country

###### The map shows that the countries with a stronger movie production culture hired a greater number of actors for their movies. As expected, the US, Canada, Western Europe, India and Australia perform above the average, while Africa, the Middle East and China perform below the average.

In [None]:
# Count cast rows per movie id directly from the actors table.
# Counting after a left join from `movies_df` would give every movie with no
# actor rows a single all-NaN row, so `size()` would report 1 actor for it.
actors_count = actors_df.groupby('id').size().reset_index(name='actors_counts')

# Inner merge: only movies with at least one listed actor are kept, so a
# missing cast is treated as unknown rather than as zero.
movies_actors = movies_df.merge(actors_count, on='id')

# One row per (movie, country); a movie with several countries contributes
# its count to each of them.
movies_actors_country = movies_actors.merge(countries_df,on='id')

movies_actors_country = movies_actors_country[['actors_counts','country']]

movies_actors_means = movies_actors_country.groupby('country').mean()

# Dedicated name so the earlier `movies_by_country_map` (with `movie_counts`)
# is not overwritten and the cells that use it can still be re-run.
actors_by_country_map = pd.merge(gdf, movies_actors_means, left_on='NAME', right_on='country', how='inner')

actors_by_country_map.plot(column='actors_counts',legend=True, cmap='RdBu', edgecolor='black', linewidth=1, legend_kwds={'label':'avg number of actors'}, figsize=(20,10))
plt.title('Average number of actors per film per country')
plt.show()

# Unweighted mean of the per-country averages (the "average" the text refers to).
movies_actors_means.mean()


### Avg number of crew that work for a movie for each country

###### The map shows that almost every country has an average of fewer than 10 crew members per movie. Some outlier countries have a really high number of crew members, but this could be due to a low number of movies. Most countries perform around the average, but some of them have a greater number, like the US, Canada, Italy, Australia and South East Asia.

In [None]:
# Count crew rows per movie id directly from the crew table.
# Counting after a left join from `movies_df` would give every movie with no
# crew rows a single all-NaN row, so `size()` would report 1 crew member for it.
crew_count = crew_df.groupby('id').size().reset_index(name='crew_counts')

# Inner merge: only movies with at least one listed crew member are kept, so
# a missing crew list is treated as unknown rather than as zero.
movies_crew = movies_df.merge(crew_count, on='id')

# One row per (movie, country); a movie with several countries contributes
# its count to each of them.
movies_crew_country = movies_crew.merge(countries_df,on='id')

movies_crew_country = movies_crew_country[['crew_counts','country']]

movies_crew_means = movies_crew_country.groupby('country').mean()

# NOTE(review): this rebinds `movies_by_country_map`, which earlier cells use
# for the movie-count map. The name is kept because cells outside this view
# may read it; consider a dedicated name such as `crew_by_country_map`.
movies_by_country_map = pd.merge(gdf, movies_crew_means, left_on='NAME', right_on='country', how='inner')

movies_by_country_map.plot(column='crew_counts',legend=True, cmap='seismic', edgecolor='black', linewidth=1, legend_kwds={'label':'avg number of crew'}, figsize=(20,10))
plt.title('Average number of crew per film per country')
plt.show()

# Unweighted mean of the per-country averages (the "average" the text refers to).
movies_crew_means.mean()