## Movie rating and ranking

### 2.1 Load Data

In [1]:
# Load packages
library(ggplot2) # visualization
library(ggrepel)
#library(ggthemes) # visualization
library(scales) # visualization
library(dplyr) # data manipulation
#library(VIM)
library(data.table)
#library(formattable)
library(plotly)
#library(corrplot)
#library(GGally)
library(caret)
library(car)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last



Attaching package: ‘plotly’


The following object is masked from ‘package:ggplot2’:

    last_plot


The following object is masked from ‘package:stats’:

    filter


The following object is masked from ‘package:graphics’:

    layout


Loading required package: lattice

Loading required package: carData


Attaching package: ‘car’


The following object is masked from ‘package:dplyr’:

    recode




In [2]:
IMDB <- read.csv("movie_metadata.csv")
str(IMDB)

'data.frame':	5043 obs. of  28 variables:
 $ color                    : chr  "Color" "Color" "Color" "Color" ...
 $ director_name            : chr  "James Cameron" "Gore Verbinski" "Sam Mendes" "Christopher Nolan" ...
 $ num_critic_for_reviews   : int  723 302 602 813 NA 462 392 324 635 375 ...
 $ duration                 : int  178 169 148 164 NA 132 156 100 141 153 ...
 $ director_facebook_likes  : int  0 563 0 22000 131 475 0 15 0 282 ...
 $ actor_3_facebook_likes   : int  855 1000 161 23000 NA 530 4000 284 19000 10000 ...
 $ actor_2_name             : chr  "Joel David Moore" "Orlando Bloom" "Rory Kinnear" "Christian Bale" ...
 $ actor_1_facebook_likes   : int  1000 40000 11000 27000 131 640 24000 799 26000 25000 ...
 $ gross                    : int  760505847 309404152 200074175 448130642 NA 73058679 336530303 200807262 458991599 301956980 ...
 $ genres                   : chr  "Action|Adventure|Fantasy|Sci-Fi" "Action|Adventure|Fantasy" "Action|Adventure|Thriller" "Action|Thrille

### 2.2 Remove Duplicates

In [3]:
# duplicate rows
sum(duplicated(IMDB))

In [4]:
# delete duplicate rows
IMDB <- IMDB[!duplicated(IMDB), ]

### 2.3 Tidy Up Movie Title

In [5]:
library(stringr)
IMDB$movie_title <- gsub("Â", "", as.character(factor(IMDB$movie_title)))
str_trim(IMDB$movie_title, side = "right") #Side on which to remove whitespace

### 2.4 Split Genres

In [6]:
head(IMDB$genres)

In [7]:
# divide the string into several substrings by the separator ‘|’
# and save each substring along with its correspongding imdb score in the other data frame genres.df.

# create a new data frame
genres.df <- as.data.frame(IMDB[,c("genres", "imdb_score")])

In [8]:
# separate different genres into new columns
genres.df$Action <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Action") 1 else 0)
genres.df$Adventure <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Adventure") 1 else 0)
genres.df$Animation <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Animation") 1 else 0)
genres.df$Biography <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Biography") 1 else 0)
genres.df$Comedy <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Comedy") 1 else 0)
genres.df$Crime <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Crime") 1 else 0)
genres.df$Documentary <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Documentary") 1 else 0)
genres.df$Drama <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Drama") 1 else 0)
genres.df$Family <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Family") 1 else 0)
genres.df$Fantasy <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Fantasy") 1 else 0)
genres.df$'Film-Noir' <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Film-Noir") 1 else 0)
genres.df$History <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "History") 1 else 0)
genres.df$Horror <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Horror") 1 else 0)
genres.df$Musical <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Musical") 1 else 0)
genres.df$Mystery <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Mystery") 1 else 0)
genres.df$News <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "News") 1 else 0)
genres.df$Romance <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Romance") 1 else 0)
genres.df$'Sci-Fi' <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Sci-Fi") 1 else 0)
genres.df$Short <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Short") 1 else 0)
genres.df$Sport <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Sport") 1 else 0)
genres.df$Thriller <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Thriller") 1 else 0)
genres.df$War <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "War") 1 else 0)
genres.df$Western <- sapply(1:length(genres.df$genres), function(x) if (genres.df[x,1] %like% "Western") 1 else 0)

In [9]:
genres.df

Unnamed: 0_level_0,genres,imdb_score,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,⋯,Musical,Mystery,News,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,Action|Adventure|Fantasy|Sci-Fi,7.9,1,1,0,0,0,0,0,0,⋯,0,0,0,0,1,0,0,0,0,0
2,Action|Adventure|Fantasy,7.1,1,1,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
3,Action|Adventure|Thriller,6.8,1,1,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,1,0,0
4,Action|Thriller,8.5,1,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,1,0,0
5,Documentary,7.1,0,0,0,0,0,0,1,0,⋯,0,0,0,0,0,0,0,0,0,0
6,Action|Adventure|Sci-Fi,6.6,1,1,0,0,0,0,0,0,⋯,0,0,0,0,1,0,0,0,0,0
7,Action|Adventure|Romance,6.2,1,1,0,0,0,0,0,0,⋯,0,0,0,1,0,0,0,0,0,0
8,Adventure|Animation|Comedy|Family|Fantasy|Musical|Romance,7.8,0,1,1,0,1,0,0,0,⋯,1,0,0,1,0,0,0,0,0,0
9,Action|Adventure|Sci-Fi,7.5,1,1,0,0,0,0,0,0,⋯,0,0,0,0,1,0,0,0,0,0
10,Adventure|Family|Fantasy|Mystery,7.5,0,1,0,0,0,0,0,0,⋯,0,1,0,0,0,0,0,0,0,0


In [10]:
# get the mean of imdb score for different genres
means <- rep(0,23)
for (i in 1:23) {
  means[i] <- mean(genres.df$imdb_score[genres.df[i+2]==1])
}
# plot the means
barplot(means, main = "Average imdb scores for different genres"

ERROR: Error in parse(text = x, srcfile = src): <text>:8:0: unexpected end of input
6: # plot the means
7: barplot(means, main = "Average imdb scores for different genres"
  ^


In [11]:
#remove the predictor "genres" because there isn’t much difference in the averages of imdb score related to different genres

IMDB <- subset(IMDB, select = -c(genres))

## 3 Data Cleaning

### 3.1 Missing Values

In [12]:
colSums(sapply(IMDB, is.na))

In [13]:
missing.values <- aggr(IMDB, sortVars = T, prop = T, sortCombs = T, cex.lab = 1.5, cex.axis = .6, cex.numbers = 5, combined = F, gap = -.2)

ERROR: Error in aggr(IMDB, sortVars = T, prop = T, sortCombs = T, cex.lab = 1.5, : could not find function "aggr"


#### 3.1.1 Delete some rows

In [14]:
IMDB <- IMDB[!is.na(IMDB$gross), ]
IMDB <- IMDB[!is.na(IMDB$budget), ]
dim(IMDB) #omitted 23% of the observations

In [15]:
sum(complete.cases(IMDB)) #there are still 3857 - 3768 = 89 rows with NAs.

#### 3.1.2 Analyze aspect ratio

In [16]:
colSums(sapply(IMDB, is.na))

In [17]:
table(IMDB$aspect_ratio)


1.18 1.33 1.37  1.5 1.66 1.75 1.77 1.78 1.85    2  2.2 2.24 2.35 2.39  2.4 2.55 
   1   19   50    1   40    2    1   41 1600    3   10    1 1995   11    3    1 
2.76   16 
   3    1 

In [18]:
IMDB$aspect_ratio[is.na(IMDB$aspect_ratio)] <- 0
mean(IMDB$imdb_score[IMDB$aspect_ratio == 1.85])

In [19]:
mean(IMDB$imdb_score[IMDB$aspect_ratio == 2.35])

In [20]:
mean(IMDB$imdb_score[IMDB$aspect_ratio != 1.85 & IMDB$aspect_ratio != 2.35])

In [21]:
# removing this variable since there is no significant difference from the means of imdb score for different aspect ratios
IMDB <- subset(IMDB, select = -c(aspect_ratio))

#### 3.1.3 Deal with 0s

In [22]:
# replace NA with column average for facenumber_in_poster
IMDB$facenumber_in_poster[is.na(IMDB$facenumber_in_poster)] <- round(mean(IMDB$facenumber_in_poster, na.rm = TRUE)) #NA's are removed.
# convert 0s into NAs for other predictors
IMDB[,c(5,6,8,13,24,26)][IMDB[,c(5,6,8,13,24,26)] == 0] <- NA
# impute missing value with column mean
IMDB$num_critic_for_reviews[is.na(IMDB$num_critic_for_reviews)] <- round(mean(IMDB$num_critic_for_reviews, na.rm = TRUE))
IMDB$duration[is.na(IMDB$duration)] <- round(mean(IMDB$duration, na.rm = TRUE))
IMDB$director_facebook_likes[is.na(IMDB$director_facebook_likes)] <- round(mean(IMDB$director_facebook_likes, na.rm = TRUE))
IMDB$actor_3_facebook_likes[is.na(IMDB$actor_3_facebook_likes)] <- round(mean(IMDB$actor_3_facebook_likes, na.rm = TRUE))
IMDB$actor_1_facebook_likes[is.na(IMDB$actor_1_facebook_likes)] <- round(mean(IMDB$actor_1_facebook_likes, na.rm = TRUE))
IMDB$cast_total_facebook_likes[is.na(IMDB$cast_total_facebook_likes)] <- round(mean(IMDB$cast_total_facebook_likes, na.rm = TRUE))
IMDB$actor_2_facebook_likes[is.na(IMDB$actor_2_facebook_likes)] <- round(mean(IMDB$actor_2_facebook_likes, na.rm = TRUE))
IMDB$movie_facebook_likes[is.na(IMDB$movie_facebook_likes)] <- round(mean(IMDB$movie_facebook_likes, na.rm = TRUE))

In [23]:
IMDB[,c(5,6,8,13,24,26)]

Unnamed: 0_level_0,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,cast_total_facebook_likes,actor_2_facebook_likes,movie_facebook_likes
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,949,855,1000,4834,936,33000
2,563,1000,40000,48350,5000,16720
3,949,161,11000,11700,393,85000
4,22000,23000,27000,106759,23000,164000
6,475,530,640,1873,632,24000
7,949,4000,24000,46055,11000,16720
8,15,284,799,2036,553,29000
9,949,19000,26000,92000,21000,118000
10,282,10000,25000,58753,11000,10000
11,949,2000,15000,24450,4000,197000


#### 3.1.4 Sort out content ratings

In [24]:
table(IMDB$content_rating) #there are still some missing values in content_rating, which are marked as “”.


           Approved         G        GP         M     NC-17 Not Rated    Passed 
       51        17        91         1         2         6        42         3 
       PG     PG-13         R   Unrated         X 
      573      1314      1723        24        10 

In [25]:
IMDB <- IMDB[!(IMDB$content_rating %in% ""),]

In [26]:
# replace M and GP with PG, replace X with NC-17, because these two are what we use nowadays.
IMDB$content_rating[IMDB$content_rating == 'M']   <- 'PG' 
IMDB$content_rating[IMDB$content_rating == 'GP']  <- 'PG' 
IMDB$content_rating[IMDB$content_rating == 'X']   <- 'NC-17'

### 3.2 Add columns

In [27]:
#add two colums: profit and percentage return on investment 
IMDB <- IMDB %>% 
  mutate(profit = gross - budget,
         return_on_investment_perc = (profit/budget)*100)

### 3.3 Remove Columns

#### 3.3.1 Is the color of a movie influential?

In [28]:
table(IMDB$color) #More than 96% movies are colored, which indicates that this predictor is nearly constant. Let’s remove this predictor.


                  Black and White            Color 
               2              124             3680 

In [29]:
# delete predictor color
IMDB <- subset(IMDB, select = -c(color))

#### 3.3.2 Is language an important factor for imdb score? What about country?

In [30]:
table(IMDB$language)


           Aboriginal     Arabic    Aramaic    Bosnian  Cantonese      Czech 
         2          2          1          1          1          7          1 
    Danish       Dari      Dutch    English   Filipino     French     German 
         3          2          3       3644          1         34         11 
    Hebrew      Hindi  Hungarian Indonesian    Italian   Japanese     Kazakh 
         2          5          1          2          7         10          1 
    Korean   Mandarin       Maya  Mongolian       None  Norwegian    Persian 
         5         14          1          1          1          4          3 
Portuguese   Romanian    Russian    Spanish       Thai Vietnamese       Zulu 
         5          1          1         24          3          1          1 

In [31]:
IMDB <- subset(IMDB, select = -c(language))

In [32]:
table(IMDB$country) #Around 79% movies are from USA, 8% from UK, 13% from other countries


   Afghanistan      Argentina          Aruba      Australia        Belgium 
             1              3              1             40              1 
        Brazil         Canada          Chile          China       Colombia 
             5             63              1             13              1 
Czech Republic        Denmark        Finland         France        Georgia 
             3              9              1            103              1 
       Germany         Greece      Hong Kong        Hungary        Iceland 
            79              1             13              2              1 
         India      Indonesia           Iran        Ireland         Israel 
             5              1              4              7              2 
         Italy          Japan         Mexico    Netherlands       New Line 
            11             15             10              3              1 
   New Zealand         Norway  Official site           Peru    Philippines 
           

In [33]:
# group other countries together to make this categorical variable with less levels: USA, UK, Others
levels(IMDB$country) <- c(levels(IMDB$country), "Others")
IMDB$country[(IMDB$country != 'USA')&(IMDB$country != 'UK')] <- 'Others' 
IMDB$country <- factor(IMDB$country)
table(IMDB$country)


Others     UK    USA 
   465    316   3025 