In [1]:
library(tidyverse)

-- Attaching packages --------------------------------------- tidyverse 1.3.0 --

v ggplot2 3.2.1     v purrr   0.3.3
v tibble  2.1.3     v dplyr   0.8.3
v tidyr   1.0.0     v stringr 1.4.0
v readr   1.3.1     v forcats 0.4.0

-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()



In [4]:
# Load the cleaned movies table. The file's first column has no header
# (readr names it X1), so it is skipped on import.
movies_df <- read_csv(
    "../data/clean/movies_clean_df.csv",
    col_types = cols(X1 = col_skip())
)

"Missing column names filled in: 'X1' [1]"


In [7]:
# Inspect the column names, types and first few values of the movies table.
str(movies_df)

Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame':	1414 obs. of  7 variables:
 $ Title         : chr  "Twelve Monkeys" "24 7: Twenty Four Seven" "Twin Falls Idaho" "Four Rooms" ...
 $ Major_Genre   : chr  "Drama" "Comedy" "Drama" "Comedy" ...
 $ Director      : chr  "Terry Gilliam" "Shane Meadows" "Michael Polish" "Robert Rodriguez" ...
 $ Year          : num  1995 1998 1999 1995 1994 ...
 $ Profit_Million: num  139.841 -1.927 0.527 0.301 238.396 ...
 $ IMDB_Rating   : num  8.1 6.9 7.1 6.4 7.1 7.6 6.6 5.6 5.9 3.2 ...
 $ MPAA_Rating   : chr  "R" "R" "R" "R" ...
 - attr(*, "spec")=
  .. cols(
  ..   X1 = col_skip(),
  ..   Title = col_character(),
  ..   Major_Genre = col_character(),
  ..   Director = col_character(),
  ..   Year = col_double(),
  ..   Profit_Million = col_double(),
  ..   IMDB_Rating = col_double(),
  ..   MPAA_Rating = col_character()
  .. )


In [41]:
#' 
#' Counts the movies of the most productive directors in the selected genre.
#' 
#' @param df data frame the data frame to work on; must contain the columns
#'   `Major_Genre` and `Director`
#' @param num int the maximum number of directors to keep for the genre
#' @param genre string the selected genre
#' 
#' @return a data frame with one row per director and the columns `Director`,
#'   `Count` (number of rows of `df` for that director in the genre) and
#'   `Major_Genre`, sorted by descending `Count` and limited to the first
#'   `num` rows
#'
get_top_director <- function(df, num, genre){

    df %>% 
        filter(Major_Genre == genre) %>%
        group_by(Director) %>%
        summarise(Count = n()) %>%
        arrange(desc(Count)) %>%
        # Re-attach the genre so the result can be joined back on
        # (Major_Genre, Director).
        mutate(Major_Genre = genre) %>%
        # Honour the caller-supplied limit instead of a hard-coded 30.
        head(num)

}
# Movie counts for the 30 most productive directors of Action movies.
top_director <- get_top_director(df = movies_df, num = 30, genre = "Action")

# Keep only the movies made by those directors in that genre; the join also
# attaches each director's Count to every matching movie row.
top_df <- inner_join(
    movies_df,
    top_director,
    by = c("Major_Genre", "Director")
)

In [43]:
# Display the per-director movie counts returned by get_top_director().
top_director

Director,Count,Major_Genre
<chr>,<int>,<chr>
Michael Bay,7,Action
Brett Ratner,6,Action
Tony Scott,6,Action
James Cameron,5,Action
Andy Wachowski,4,Action
John Woo,4,Action
Renny Harlin,4,Action
Richard Donner,4,Action
Rob Cohen,4,Action
Steven Spielberg,4,Action


In [42]:
# Display the movies joined with their director's Count.
top_df

Title,Major_Genre,Director,Year,Profit_Million,IMDB_Rating,MPAA_Rating,Count
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<int>
The Abyss,Action,James Cameron,1989,-15.756875,7.6,PG-13,5
Bad Boys,Action,Michael Bay,1995,118.247413,6.6,R,7
Beverly Hills Cop II,Action,Tony Scott,1987,256.665036,6.1,R,6
Broken Arrow,Action,John Woo,1996,83.345997,5.8,R,4
Batman Forever,Action,Joel Schumacher,1995,236.529144,5.4,PG-13,3
Chain Reaction,Action,Andrew Davis,1996,5.209334,5.2,PG-13,2
Crimson Tide,Action,Tony Scott,1995,104.387195,7.2,R,6
Desperado,Action,Robert Rodriguez,1995,18.532388,7.0,R,3
Daylight,Action,Rob Cohen,1996,78.908290,5.4,PG-13,4
Die Hard: With a Vengeance,Action,John McTiernan,1995,274.480746,7.4,R,3
