In [None]:
import numpy as np
import pandas as pd
# Let wide frames show up to 99 columns before pandas starts eliding them.
pd.options.display.max_columns = 99

In [None]:
# Load the raw title catalogue; the cleaning cells below build `movies` from it.
titles_path = 'titles.csv'
titles = pd.read_csv(titles_path)

# Strategy:
### Cleaning:
- throw out all irrelevant columns (see '[df name]Drop' lists)
- throw out movies that aren't action/adventure
- remove the 'min' in the duration column to make the value an integer
- merge ratings data into one data frame, allowing movies to have ratings from either one or both sources
- merge this to the list of movies, but this time throw out movies that don't have a rating
    - justification for throwing out ratingless movies: if they aren't rated, they're probably unremarkable

- split the 'cast' column, creating otherwise-identical rows for each movie, one for each of the actors listed in the cast
    - This will make it easy for us to use the 'group by' function

In [None]:
# Keep only the rows typed as 'Movie', discard the columns listed in moviesDrop,
# and remove movies with no cast information (they cannot be attributed to an actor).
# Every step returns a new frame, so nothing is mutated in place through a filtered
# slice of `titles` (which is what triggered SettingWithCopyWarning) and the cell can
# be re-run safely.
moviesDrop = ['show_id','rating','director','country','date_added','description','type']
movies = (
    titles[titles['type'] == 'Movie']
    .drop(labels=moviesDrop, axis=1)
    .dropna(subset=['cast'])
)

In [None]:
# Keep only movies whose genre list mentions 'Action & Adventure'.
# A literal (non-regex) substring test on the raw string selects the same rows as
# the previous split-then-stringify check, but a missing genre list now simply
# excludes the row (na=False) instead of raising AttributeError.
is_action_adventure = movies['listed_in'].str.contains('Action & Adventure', regex=False, na=False)
movies = movies[is_action_adventure]

In [None]:
# First ratings source: load it and discard every column named in ratingsDrop.
ratingsDrop = ['adult','popularity','belongs_to_collection','budget','revenue','genres','homepage','spoken_languages','id','imdb_id','original_language','original_title','overview','poster_path','production_companies','production_countries','release_date','runtime','status','tagline','video']
ratings = pd.read_csv('movies_metadata.csv')
ratings.drop(labels=ratingsDrop, axis=1, inplace=True)
# A score is only usable together with its vote count (and vice versa): wherever
# one of the pair is missing, blank out the other one as well.
for missing_col, partner_col in (('vote_average', 'vote_count'), ('vote_count', 'vote_average')):
    ratings.loc[ratings[missing_col].isnull(), partner_col] = np.nan

In [None]:
# Second ratings source: load it and discard every column named in ratingsDrop
# (the list name is reused from the first source and overwritten here).
ratingsDrop = ['rating','ratingLevel','release year']
ratings2 = pd.read_csv('ratings.csv')
ratings2.drop(labels=ratingsDrop, axis=1, inplace=True)
# Score and sample size must both be present: wherever one of the pair is
# missing, blank out the other one as well.
for missing_col, partner_col in (('user rating score', 'user rating size'), ('user rating size', 'user rating score')):
    ratings2.loc[ratings2[missing_col].isnull(), partner_col] = np.nan

In [None]:
# Turn the duration text into an integer number of minutes by taking the first
# whitespace-separated token of each value and casting it to int.
# The vectorised .str operations replace a per-row Python lambda, and .assign
# builds a new frame instead of writing a column into the filtered slice that
# `movies` is at this point, which avoids SettingWithCopyWarning.
duration_minutes = movies['duration'].str.split().str[0].astype(int)
movies = movies.assign(duration=duration_minutes)

In [None]:
# Combine both rating sources by title; the outer join keeps titles that appear
# in only one of them.
all_ratings = pd.merge(ratings, ratings2, how='outer', on='title')
# Attach the ratings to the movie list (default inner join on title).
movies = pd.merge(movies, all_ratings, on='title')
# A title matched more than once produces several rows; keep the first of each.
movies = movies.drop_duplicates(subset='title', keep="first")

In [None]:
# Treat a missing rating or vote count as zero so the arithmetic below never sees NaN.
for rating_column in ('vote_average', 'vote_count', 'user rating score', 'user rating size'):
    movies[rating_column] = movies[rating_column].fillna(0)
# Put both scores on a 0-100 scale.
# NOTE: not idempotent - running this cell twice rescales vote_average again.
movies['vote_average'] = movies['vote_average'].mul(10)
# Constant helper column; summing it per actor later yields their movie count.
movies['count'] = 1

In [None]:
# Discard movies that neither rating file covers: after the zero-fill above, those
# are exactly the rows where both vote counts are 0. Renumber the index afterwards.
unrated = (movies['vote_count'] == 0) & (movies['user rating size'] == 0)
movies = movies[~unrated].reset_index(drop=True)

In [None]:
# Give every cast member their own row, repeating the movie's other columns.
# Names are split on commas and then stripped: without the strip, a source that
# puts a space after each comma yields ' Name' for every non-first actor, and the
# later groupby('cast') would count ' Name' and 'Name' as two different people.
stack = movies['cast'].str.split(',', expand=True).stack().str.strip()
stack = stack[stack != '']  # ignore empty fragments, e.g. from a trailing comma
stack.index = stack.index.droplevel(-1)  # line up with the movies index
stack.name = 'cast'
# Swap the comma-separated column for the per-actor one without mutating `movies`
# in place, so the cell does not fail half-way when it is re-run.
movies = movies.drop(labels='cast', axis=1).join(stack)

### Analysis:
- average score data from both rating sources into a single column for all movies
- throw out rows that have 45 or fewer total votes (45 is half the median)
    - justification: executives want reliable results and the uncertainty in small samples weakens the validity
- groupby('cast') and extract the average rating as well as the number of movies starred in

In [None]:
# Keep only rows whose combined vote total across both sources exceeds 45.
total_votes = movies['user rating size'] + movies['vote_count']
movies = movies[total_votes > 45]

In [None]:
# Blend the two rating systems into one score, weighting each source's score by
# the number of votes behind it.
# .assign returns a new frame, so the column is not written into the filtered
# slice that `movies` is at this point (which raised SettingWithCopyWarning).
weighted_points = (
    movies['vote_average'] * movies['vote_count']
    + movies['user rating score'] * movies['user rating size']
)
total_votes = movies['vote_count'] + movies['user rating size']
movies = movies.assign(rating=weighted_points / total_votes)

In [None]:
# Show the current per-cast-member movie table as the cell's output.
movies

In [None]:
# Build one row per actor: mean movie rating, summed runtime and number of movies.
# Only the three needed columns are aggregated. Calling mean()/sum() on the whole
# frame also hits the text columns, which raises TypeError on pandas >= 2.0, and
# required a hard-coded list of leftover columns to drop afterwards.
actors = (
    movies.groupby('cast')
    .agg({'rating': 'mean', 'duration': 'sum', 'count': 'sum'})
    [['rating', 'duration', 'count']]  # pin the column order the renaming below relies on
    .sort_values('rating', ascending=False)  # best-rated actors first
)
actors.columns = ['average rating','total length of movie runtimes','number of movies stared in']
actors = actors.reset_index()  # turn the 'cast' index back into an ordinary column

In [None]:
# Write the per-actor table to disk, leaving the row index out of the file.
output_path = 'A&A actor ratings.csv'
actors.to_csv(output_path, index=False)

In [None]:
# Show the per-actor summary table as the cell's output.
actors