-
Notifications
You must be signed in to change notification settings - Fork 0
/
Prediction of movies popularity.Rmd
276 lines (229 loc) · 10 KB
/
Prediction of movies popularity.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
---
title: "Prediction of movies popularity"
author: "Saurabh Sankhe"
date: "February 22, 2019"
output:
word_document: default
html_document: default
---
#The purpose of this project is to develop mutliple linear regression model to analyze the factors that will make a movie popular. The dataset contains the information that are extracted from IMDB for random sample movies. For popularity we are going to measure the audience_score as an output variable and the attributes will be the type of movie, genre, runtime, imdb rating, imdb number of votes, critics rating, critics score, audience rating, Oscar awards obtained (actor, actress, director and picture).
#if all these attributes are related significantly then we can find the popularity of movie.
#Load packages
```{r}
library(ggplot2)
library(dplyr)
library(statsr)
library(gridExtra)
library(corrplot)
```
#Load the data
```{r}
mydata <- load("C:/Users/Saurabh/Desktop/Sem-2 Course Documents/Multivariate Analysis/Movies/movies.RData")
```
```{r}
movies_new <- movies %>% select(title, title_type, genre, runtime, imdb_rating, imdb_num_votes, critics_rating, critics_score, audience_rating, audience_score, best_pic_win, best_actor_win, best_actress_win, best_dir_win)
str(movies_new)
movies_new[c(2,3,7,9,11:14)] <- lapply(movies_new[c(2,3,7,9,11:14)], as.numeric)
movies_data <- movies_new
movies_data <- movies_data %>% select(title_type, genre, runtime, imdb_rating, imdb_num_votes, critics_rating, critics_score, audience_rating,best_pic_win, best_actor_win, best_actress_win, best_dir_win)
```
```{r}
summary(movies_new)
View(movies_new)
```
#Drop missing value
```{r}
movies_new <- na.omit(movies_new)
```
Split data into train and test
```{r}
set.seed(2017)
split <- sample(seq_len(nrow(movies_new)), size = floor(0.999 * nrow(movies_new)))
train <- movies_new[split, ]
test <- movies_new[-split, ]
```
#histogram
```{r}
colors = c("red", "yellow", "green", "violet", "orange", "blue", "pink", "cyan")
hist(train$audience_score, col=colors, main = "Histogram for Train score")
summary(train$audience_score)
```
#The median of our response variable - audience score distribution is 65; 75% of the movie in the training set have an audience score higher than 80; 25% of the movie in the training set have an audience score lower than 46; very few movie have an audience score lower than 20 or higher than 90
```{r}
p1 <- ggplot(aes(x=runtime), data=train) +
geom_histogram(aes(y=100*(..count..)/sum(..count..)), color='black', fill='white', binwidth = 5) + ylab('percentage') + ggtitle('Run Time')
p2 <- ggplot(aes(x=imdb_rating), data=train) +
geom_histogram(aes(y=100*(..count..)/sum(..count..)), color='black', fill='white', binwidth = 0.2) + ylab('percentage') + ggtitle('IMDB rating')
p3 <- ggplot(aes(x=log10(imdb_num_votes)), data=train) +
geom_histogram(aes(y=100*(..count..)/sum(..count..)), color='black', fill='white') + ylab('percentage') + ggtitle('log(IMDB number of votes)')
p4 <- ggplot(aes(x=critics_score), data=train) +
geom_histogram(aes(y=100*(..count..)/sum(..count..)), color='black', fill='white', binwidth = 2) + ylab('percentage') + ggtitle('Critics Score')
grid.arrange(p1, p2, p3, p4, ncol=2)
```
#Regression analysis: Run time, IMDB rating, log(IMDB number of votes) and Critics Scores all have reasonable broad distribution, therefore, they will be considered for the regression analysis.
```{r}
p1 <- ggplot(aes(x=title_type), data=train) + geom_bar(aes(y=100*(..count..)/sum(..count..))) + ylab('percentage') +
ggtitle('Title Type') + coord_flip()
p2 <- ggplot(aes(x=genre), data=train) + geom_bar(aes(y=100*(..count..)/sum(..count..))) + ylab('percentage') +
ggtitle('Genre') + coord_flip()
p3 <- ggplot(aes(x=critics_rating), data=train) + geom_bar(aes(y=100*(..count..)/sum(..count..))) + ylab('percentage') +
ggtitle('Critics Rating') + coord_flip()
p4 <- ggplot(aes(x=audience_rating), data=train) + geom_bar(aes(y=100*(..count..)/sum(..count..))) + ylab('percentage') +
ggtitle('Audience Rating') + coord_flip()
grid.arrange(p1, p2, p3, p4, ncol=2)
```
#Most movies in the data are in the "Feature Film" title type and majority of the movies are drama. Therefore, we must be aware that the results could be biased toward drama movies.
```{r}
vars <- names(train) %in% c('runtime', 'imdb_rating', 'imdb_num_votes', 'critics_score')
selected_train <- train[vars]
corr.matrix <- cor(selected_train)
corrplot(corr.matrix, main="\n\nCorrelation Plot of numerical variables", method="number")
```
```{r}
boxplot(audience_score~critics_rating, data=train, main='Audience score vs. Critics rating', xlab='Critics Rating', ylab='Audience Score')
by(train$audience_score, train$critics_rating, summary)
boxplot(audience_score~audience_rating, data=train, main='Audience Score vs. Audience Rating', xlab='Audience rating', ylab='Audience Score')
by(train$audience_score, train$audience_rating, summary)
boxplot(audience_score~title_type, data=train, main='Audience score vs. Title type', xlab='Title_type', ylab='Audience Score')
by(train$audience_score, train$title_type, summary)
boxplot(audience_score~genre, data=train, main='Audience score vs. Genre', xlab='Genre', ylab='Audience score')
by(train$audience_score, train$genre, summary)
```
#All the categorical variables seems to have reasonable significant correlation with audience score.
```{r}
x <- c(movies_new$imdb_num_votes,movies_new$best_pic_win,movies_new$best_actor_win,movies_new$best_actress_win,movies_new$best_dir_win)
t.test(movies_new$audience_score, x)
```
```{r}
movies_new <- cor(movies_new[2:14])
movies_pca <- prcomp(movies_new,scale=TRUE)
str(movies_new)
summary(movies_pca)
#movies_pca$x
movies_pca$rotation
print(movies_pca)
```
```{r}
plot(movies_pca, type='l')
```
```{r}
(movies_pca_eigens <- movies_pca$sdev^2)
names(movies_pca_eigens) <- paste("PC",1:8,sep="")
sumlambdas <- sum(movies_pca_eigens)
sumlambdas
dim(movies_new)
```
```{r}
#corr.matrix
movies_pca_new <- prcomp(corr.matrix, scale = TRUE)
summary(movies_pca_new)
movies_pca_new$rotation
print(movies_pca_new)
```
```{r}
plot(movies_pca_new, type='l')
(movies_pca_eigens_new <- movies_pca_new$sdev^2)
names(movies_pca_eigens_new) <- paste("PC",1:2,sep="")
sumlambdas <- sum(movies_pca_eigens_new)
sumlambdas
dim(corr.matrix)
```
```{r}
```
```{r}
colnames(movies_new) <- rownames(movies_new)
movies_new <- as.dist(movies_new)
mat5.nn <- hclust(movies_new, method = "single")
plot(mat5.nn, hang=-1,xlab="Object",ylab="Distance",
main="Dendrogram. Nearest neighbor linkage")
#Default - Complete
mat5.fn <- hclust(movies_new)
plot(mat5.fn,hang=-1,xlab="Object",ylab="Distance",
main="Dendrogram. Farthest neighbor linkage")
#Average
mat5.avl <- hclust(movies_new,method="average")
plot(mat5.avl,hang=-1,xlab="Object",ylab="Distance",
main="Dendrogram. Group average linkage")
```
```{r}
# Standardizing the data with scale()
matstd.movies_new <- scale(movies_new[2:14])
# K-means, k=2, 3, 4, 5, 6
# Centers (k's) are numbers thus, 10 random sets are chosen
(kmeans2.movies_new <- kmeans(matstd.movies_new,2,nstart = 10))
# Computing the percentage of variation accounted for. Two clusters
perc.var.2 <- round(100*(1 - kmeans2.movies_new$betweenss/kmeans2.movies_new$totss),1)
names(perc.var.2) <- "Perc. 2 clus"
perc.var.2
# Computing the percentage of variation accounted for. Three clusters
(kmeans3.movies_new <- kmeans(matstd.movies_new,3,nstart = 10))
perc.var.3 <- round(100*(1 - kmeans3.movies_new$betweenss/kmeans3.movies_new$totss),1)
names(perc.var.3) <- "Perc. 3 clus"
perc.var.3
# Computing the percentage of variation accounted for. Four clusters
(kmeans4.movies_new <- kmeans(matstd.movies_new,4,nstart = 10))
perc.var.4 <- round(100*(1 - kmeans4.movies_new$betweenss/kmeans4.movies_new$totss),1)
names(perc.var.4) <- "Perc. 4 clus"
perc.var.4
# Computing the percentage of variation accounted for. Five clusters
(kmeans5.movies_new <- kmeans(matstd.movies_new,5,nstart = 10))
perc.var.5 <- round(100*(1 - kmeans5.movies_new$betweenss/kmeans5.movies_new$totss),1)
names(perc.var.5) <- "Perc. 5 clus"
perc.var.5
(kmeans6.movies_new <- kmeans(matstd.movies_new,6,nstart = 10))
# Computing the percentage of variation accounted for. Six clusters
perc.var.6 <- round(100*(1 - kmeans6.movies_new$betweenss/kmeans6.movies_new$totss),1)
names(perc.var.6) <- "Perc. 6 clus"
perc.var.6
#
movies_new <- scale(movies_new)
wss <- (nrow(movies_new)-1)*sum(apply(movies_new,2,var))
for (i in 1:5) wss[i] <- sum(kmeans(movies_new,centers=i)$withinss)
fit <- kmeans(movies_new, 5)
aggregate(movies_new,by=list(fit$cluster),FUN=mean)
mydata <- data.frame(movies_new, fit$cluster)
mydata
d <- dist(mydata, method = "euclidean") # distance matrix
fit <- hclust(d, method="complete")
plot(fit)
# cut tree into 5 clusters
groups <- cutree(fit, k=5)
# draw dendogram with red borders around the 5 clusters
rect.hclust(fit, k=5, border="red")
```
Factor Analysis
```{r}
head(movies_data)
#Loading the required library
library(psych)
#Applying Factor Analysis on the data with 4 factors
fit_pc <- principal(movies_data,nfactors = 4, rotate = "varimax")
#Printing the results of Factor Analysis
fit_pc
#rounding the values to 3 decimal places
round(fit.pc$values, 3)
#Printing the loading data to console for the
fit.pc$loadings
```
Now we look at the cummunality
```{r}
fit.pc$communality
#Printing the scores
fit.pc$scores
# See Correlations within Factors
fa.plot(fit.pc)
#Visualize the relationship
fa.diagram(fit.pc)
```
```{r}
fit1 <- lm(audience_score~., data = train[,-1])
g1 <- step(fit1)
library(car)
compareCoefs(fit1,g1,se=FALSE)
fit_final <- lm(audience_score ~ genre+runtime+imdb_rating+critics_score+audience_rating, data=train[,-1])
summary(fit_final)
newmovie <- test %>% select(genre, imdb_rating, audience_rating,critics_score,runtime)
predict(fit_final, newmovie)
predict(fit_final, newmovie, interval = "prediction", level = 0.95)
test$audience_score
```