# Modeling

First let's import necessary libraries and datasets.

In [2]:
import numpy as np
import pandas as pd
import json
import ast
import datetime
import requests
import time
import regex as re
import nltk
from sklearn.utils import shuffle
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from bs4 import BeautifulSoup
from progressbar import ProgressBar

np.random.seed(42)

  from numpy.core.umath_tests import inner1d


In [3]:
df = pd.read_csv('../Data/Clean_movies.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,belongs_to_collection,budget,genres,homepage,overview,popularity,production_companies,production_countries,release_date,...,runtime,spoken_languages,tagline,title,vote_average,vote_count,keywords,cast,crew,director
0,0,1,30000000.0,"['Animation', 'Comedy', 'Family']",1,"Led by Woody, Andy's toys live happily in his ...",21.946943,['Pixar Animation Studios'],['United States of America'],1995-10-30,...,81.0,English,,Toy Story,7.7,5415.0,"['jealousy', 'toy', 'boy', 'friendship', 'frie...","['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",John Lasseter
1,1,0,65000000.0,"['Adventure', 'Fantasy', 'Family']",0,When siblings Judy and Peter discover an encha...,17.015539,"['TriStar Pictures', 'Teitler Film', 'Intersco...",['United States of America'],1995-12-15,...,104.0,English,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,"['board game', 'disappearance', ""based on chil...","['Robin Williams', 'Jonathan Hyde', 'Kirsten D...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",Joe Johnston
2,2,1,,"['Romance', 'Comedy']",0,A family wedding reignites the ancient feud be...,11.7129,"['Warner Bros.', 'Lancaster Gate']",['United States of America'],1995-12-22,...,101.0,English,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0,"['fishing', 'best friend', 'duringcreditssting...","['Walter Matthau', 'Jack Lemmon', 'Ann-Margret...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",Howard Deutch
3,3,0,16000000.0,"['Comedy', 'Drama', 'Romance']",0,"Cheated on, mistreated and stepped on, the wom...",3.859495,['Twentieth Century Fox Film Corporation'],['United States of America'],1995-12-22,...,127.0,English,Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0,"['based on novel', 'interracial relationship',...","['Whitney Houston', 'Angela Bassett', 'Loretta...","[{'credit_id': '52fe44779251416c91011acb', 'de...",Forest Whitaker
4,4,1,,['Comedy'],0,Just when George Banks has recovered from his ...,8.387519,"['Sandollar Productions', 'Touchstone Pictures']",['United States of America'],1995-02-10,...,106.0,English,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0,"['baby', 'midlife crisis', 'confidence', 'agin...","['Steve Martin', 'Diane Keaton', 'Martin Short...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",Charles Shyer


Genres, production_companies, production_countries, cast and crew is the stringified JSON Object and/or listS. I will extract the information we need out of it. 

In [4]:
df['crew'] = df['crew'].fillna('[]').apply(ast.literal_eval)
df['crew'] = df['crew'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [5]:
df.release_date = pd.to_datetime(df.release_date)

In [6]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46629 entries, 0 to 46628
Data columns (total 21 columns):
Unnamed: 0               46629 non-null int64
belongs_to_collection    46629 non-null int64
budget                   8722 non-null float64
genres                   46629 non-null object
homepage                 46629 non-null int64
overview                 45634 non-null object
popularity               46625 non-null float64
production_companies     46629 non-null object
production_countries     46629 non-null object
release_date             46541 non-null datetime64[ns]
revenue                  7292 non-null float64
runtime                  44733 non-null float64
spoken_languages         42442 non-null object
tagline                  20783 non-null object
title                    46625 non-null object
vote_average             46625 non-null float64
vote_count               46625 non-null float64
keywords                 46629 non-null object
cast                     46629 non-n

Maybe will drop outliers in budget later

I will create a binary column, that indicates if the movie made any profit or not and called it success_binary.

In [7]:
df['success'] = (df['revenue'] - df['budget']) / df['budget']
df_suc = pd.DataFrame(df.dropna(subset=['success']))
df_suc['success_binary']= [1 if x > 1 else 0 for x in df_suc['success']]
df_suc['success_binary'].mean()

0.5119670905011219

In [8]:
df_suc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5348 entries, 0 to 46585
Data columns (total 23 columns):
Unnamed: 0               5348 non-null int64
belongs_to_collection    5348 non-null int64
budget                   5348 non-null float64
genres                   5348 non-null object
homepage                 5348 non-null int64
overview                 5338 non-null object
popularity               5348 non-null float64
production_companies     5348 non-null object
production_countries     5348 non-null object
release_date             5348 non-null datetime64[ns]
revenue                  5348 non-null float64
runtime                  5337 non-null float64
spoken_languages         5304 non-null object
tagline                  4614 non-null object
title                    5348 non-null object
vote_average             5348 non-null float64
vote_count               5348 non-null float64
keywords                 5348 non-null object
cast                     5348 non-null object
crew   

Maybe will drop tagline later

In [9]:
df_suc.dropna(inplace=True)
df_suc.shape

(4588, 23)

We have 4588 observations without missing values eligible for modeling.

### Let's create a features for our models

In [10]:
df_feat = df_suc[['belongs_to_collection', 'budget', 'genres', 'homepage', 'title', 'overview',
              'production_companies', 'release_date', 'tagline', 'runtime', 'keywords',
              'cast', 'crew', 'director', 'success_binary']]
df_feat.head()

Unnamed: 0,belongs_to_collection,budget,genres,homepage,title,overview,production_companies,release_date,tagline,runtime,keywords,cast,crew,director,success_binary
1,0,65000000.0,"['Adventure', 'Fantasy', 'Family']",0,Jumanji,When siblings Judy and Peter discover an encha...,"['TriStar Pictures', 'Teitler Film', 'Intersco...",1995-12-15,Roll the dice and unleash the excitement!,104.0,"['board game', 'disappearance', ""based on chil...","['Robin Williams', 'Jonathan Hyde', 'Kirsten D...","[Larry J. Franco, Jonathan Hensleigh, James Ho...",Joe Johnston,1
3,0,16000000.0,"['Comedy', 'Drama', 'Romance']",0,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",['Twentieth Century Fox Film Corporation'],1995-12-22,Friends are the people who let you be yourself...,127.0,"['based on novel', 'interracial relationship',...","['Whitney Houston', 'Angela Bassett', 'Loretta...","[Forest Whitaker, Ronald Bass, Ronald Bass, Ez...",Forest Whitaker,1
5,0,60000000.0,"['Action', 'Crime', 'Drama', 'Thriller']",0,Heat,"Obsessive master thief, Neil McCauley leads a ...","['Regency Enterprises', 'Forward Pass', 'Warne...",1995-12-15,A Los Angeles Crime Saga,170.0,"['robbery', 'detective', 'bank', 'obsession', ...","['Al Pacino', 'Robert De Niro', 'Val Kilmer', ...","[Michael Mann, Michael Mann, Art Linson, Micha...",Michael Mann,1
8,0,35000000.0,"['Action', 'Adventure', 'Thriller']",0,Sudden Death,International action superstar Jean Claude Van...,"['Universal Pictures', 'Imperial Entertainment...",1995-12-22,Terror goes into overtime.,106.0,"['terrorist', 'hostage', 'explosive', 'vice pr...","['Jean-Claude Van Damme', 'Powers Boothe', 'Do...","[Peter Hyams, Karen Elise Baldwin, Gene Quinta...",Peter Hyams,0
9,1,58000000.0,"['Adventure', 'Action', 'Thriller']",1,GoldenEye,James Bond must unmask the mysterious head of ...,"['United Artists', 'Eon Productions']",1995-11-16,No limits. No fears. No substitutes.,130.0,"['cuba', 'falsely accused', 'secret identity',...","['Pierce Brosnan', 'Sean Bean', 'Izabella Scor...","[Martin Campbell, Ian Fleming, Jeffrey Caine, ...",Martin Campbell,1


In [11]:
df_feat['cast1'] = df_feat['cast'].apply(ast.literal_eval)

df_feat['cast1'] = df_feat['cast1'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [12]:
df_feat['crew1'] = df_feat['crew'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [13]:
df_feat['genres1'] = df_feat['genres'].apply(lambda x: " ".join(x))
#df_feat['keywords1'] = df_feat['keywords'].apply(lambda x: " ".join(x))
#df_feat['production_companies1'] = df_feat['production_companies'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [14]:
df_feat['genres1'] = df_feat['genres'].apply(ast.literal_eval)

df_feat['genres1'] = df_feat['genres1'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [15]:
df_feat['keywords1'] = df_feat['keywords'].apply(ast.literal_eval)

df_feat['keywords1'] = df_feat['keywords1'].apply(lambda x: " ".join(x))

df_feat['production_companies1'] = df_feat['production_companies'].apply(ast.literal_eval)

df_feat['production_companies1'] = df_feat['production_companies1'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable

In [16]:
df_feat.head()

Unnamed: 0,belongs_to_collection,budget,genres,homepage,title,overview,production_companies,release_date,tagline,runtime,keywords,cast,crew,director,success_binary,cast1,crew1,genres1,keywords1,production_companies1
1,0,65000000.0,"['Adventure', 'Fantasy', 'Family']",0,Jumanji,When siblings Judy and Peter discover an encha...,"['TriStar Pictures', 'Teitler Film', 'Intersco...",1995-12-15,Roll the dice and unleash the excitement!,104.0,"['board game', 'disappearance', ""based on chil...","['Robin Williams', 'Jonathan Hyde', 'Kirsten D...","[Larry J. Franco, Jonathan Hensleigh, James Ho...",Joe Johnston,1,Robin Williams Jonathan Hyde Kirsten Dunst Bra...,Larry J. Franco Jonathan Hensleigh James Horne...,Adventure Fantasy Family,board game disappearance based on children's b...,TriStar Pictures Teitler Film Interscope Commu...
3,0,16000000.0,"['Comedy', 'Drama', 'Romance']",0,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",['Twentieth Century Fox Film Corporation'],1995-12-22,Friends are the people who let you be yourself...,127.0,"['based on novel', 'interracial relationship',...","['Whitney Houston', 'Angela Bassett', 'Loretta...","[Forest Whitaker, Ronald Bass, Ronald Bass, Ez...",Forest Whitaker,1,Whitney Houston Angela Bassett Loretta Devine ...,Forest Whitaker Ronald Bass Ronald Bass Ezra S...,Comedy Drama Romance,based on novel interracial relationship single...,Twentieth Century Fox Film Corporation
5,0,60000000.0,"['Action', 'Crime', 'Drama', 'Thriller']",0,Heat,"Obsessive master thief, Neil McCauley leads a ...","['Regency Enterprises', 'Forward Pass', 'Warne...",1995-12-15,A Los Angeles Crime Saga,170.0,"['robbery', 'detective', 'bank', 'obsession', ...","['Al Pacino', 'Robert De Niro', 'Val Kilmer', ...","[Michael Mann, Michael Mann, Art Linson, Micha...",Michael Mann,1,Al Pacino Robert De Niro Val Kilmer Jon Voight...,Michael Mann Michael Mann Art Linson Michael M...,Action Crime Drama Thriller,robbery detective bank obsession chase shootin...,Regency Enterprises Forward Pass Warner Bros.
8,0,35000000.0,"['Action', 'Adventure', 'Thriller']",0,Sudden Death,International action superstar Jean Claude Van...,"['Universal Pictures', 'Imperial Entertainment...",1995-12-22,Terror goes into overtime.,106.0,"['terrorist', 'hostage', 'explosive', 'vice pr...","['Jean-Claude Van Damme', 'Powers Boothe', 'Do...","[Peter Hyams, Karen Elise Baldwin, Gene Quinta...",Peter Hyams,0,Jean-Claude Van Damme Powers Boothe Dorian Har...,Peter Hyams Karen Elise Baldwin Gene Quintano ...,Action Adventure Thriller,terrorist hostage explosive vice president,Universal Pictures Imperial Entertainment Sign...
9,1,58000000.0,"['Adventure', 'Action', 'Thriller']",1,GoldenEye,James Bond must unmask the mysterious head of ...,"['United Artists', 'Eon Productions']",1995-11-16,No limits. No fears. No substitutes.,130.0,"['cuba', 'falsely accused', 'secret identity',...","['Pierce Brosnan', 'Sean Bean', 'Izabella Scor...","[Martin Campbell, Ian Fleming, Jeffrey Caine, ...",Martin Campbell,1,Pierce Brosnan Sean Bean Izabella Scorupco Fam...,Martin Campbell Ian Fleming Jeffrey Caine Bruc...,Adventure Action Thriller,cuba falsely accused secret identity computer ...,United Artists Eon Productions


In [17]:
df_feat.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [18]:
df_feat = df_feat.reset_index(drop=True)

Now I will chose the folowing columns as a features and combine them together.

In [19]:
df_feat['text'] = df_feat['title'] + df_feat['overview'] + df_feat['tagline'] + df_feat['director'] + df_feat['cast1'] + df_feat['genres1'] + df_feat['keywords1'] + df_feat['production_companies1'] + df_feat['crew1']

In [20]:
df_feat.dropna(inplace=True)

In [21]:
df_feat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4588 entries, 0 to 4587
Data columns (total 21 columns):
belongs_to_collection    4588 non-null int64
budget                   4588 non-null float64
genres                   4588 non-null object
homepage                 4588 non-null int64
title                    4588 non-null object
overview                 4588 non-null object
production_companies     4588 non-null object
release_date             4588 non-null datetime64[ns]
tagline                  4588 non-null object
runtime                  4588 non-null float64
keywords                 4588 non-null object
cast                     4588 non-null object
crew                     4588 non-null object
director                 4588 non-null object
success_binary           4588 non-null int64
cast1                    4588 non-null object
crew1                    4588 non-null object
genres1                  4588 non-null object
keywords1                4588 non-null object
production_c

In [22]:
df_feat.reset_index(inplace=True)

Assigning our target

In [23]:
y = df_feat['success_binary']

First let's try CountVectorizer

In [23]:
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
stop = list(ENGLISH_STOP_WORDS)


cv = CountVectorizer(min_df=3, stop_words=stop)


cv_arr = cv.fit_transform(df_feat['text'])

df_vect = pd.DataFrame(cv_arr.toarray(), columns=cv.get_feature_names())
df_vect.shape

(4588, 30385)

Because this amout of data will takes forever to proceed with my current computational power, I will use TruncatedSVD in order to reduce it.

In [50]:
documents = df_feat['text'].copy()

vectorizer = CountVectorizer(stop_words='english')

svd_model = TruncatedSVD(n_components=2000)

svd_transformer = Pipeline([('CountVect', vectorizer), 
                            ('svd', svd_model)])

svd_matrix = svd_transformer.fit_transform(documents) 

truncated = pd.DataFrame(svd_matrix)

Now let's try logistic regression, KFold and RandomForestClassifier

In [51]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(truncated,
                                                    y,
                                                    random_state = 42)

In [52]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)

print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))

0.9712292938099389
0.5666957279860506


In [53]:
kf = KFold(n_splits=5, shuffle=True,random_state=42)
logreg = LogisticRegression(random_state=42)
print(cross_val_score(logreg,X_train,y_train,cv=kf).mean())
print(cross_val_score(logreg,X_test,y_test,cv=kf).mean())

0.5850027002396463
0.5893335864818682


In [54]:
rf = RandomForestClassifier(random_state=42)
cross_val_score(rf,truncated,y,cv=kf).mean()

0.5385794351667723

This results does't make us happy, so let's try GridSearch with KNeighborsClassifier and VoringClassifier with AdaBoostClassifier and GradientBoostingClassifier.

In [55]:
knn_params = {
    'n_neighbors': [5,10,20],
    'weights': ['distance'],
    'metric': ['manhattan']
}

grid = GridSearchCV(KNeighborsClassifier(),
                    knn_params,
                    cv=3,
                    verbose = 1,
                   return_train_score = True)

grid.fit(truncated, y)
grid.best_score_

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  7.2min finished


0.486922406277245

In [56]:
vote = VotingClassifier([
    ('ada', AdaBoostClassifier()),
    ('gb', GradientBoostingClassifier())
])
vote.fit(X_train, y_train)
print(vote.score(X_train, y_train))
print(vote.score(X_test, y_test))

0.8294100552165068
0.5701830863121186


  if diff:
  if diff:


All this scores dosn't make us happy, so let's repeat all this process with TF-IDF Vectorizer.

In [57]:
vectorizer = TfidfVectorizer(stop_words='english')
model = vectorizer.fit_transform(df_feat['text'])
tfidf = pd.DataFrame(model.todense(),columns=vectorizer.get_feature_names())
tfidf.shape

(4588, 100058)

In [58]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

documents = df_feat['text'].copy()

vectorizer = TfidfVectorizer(stop_words='english', 
                             use_idf=True, 
                             smooth_idf=True)

svd_model = TruncatedSVD(n_components=2000, algorithm='arpack')

svd_transformer = Pipeline([('tfidf', vectorizer), 
                            ('svd', svd_model)])

svd_matrix = svd_transformer.fit_transform(documents) 

truncated = pd.DataFrame(svd_matrix)

In [59]:
X_train, X_test, y_train, y_test = train_test_split(truncated,
                                                    y,
                                                    random_state = 42)

In [60]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

print(lr.score(X_train, y_train))

print(lr.score(X_test, y_test))

0.8561464690496948
0.6233653007846556


In [61]:
kf = KFold(n_splits=5, shuffle=True,random_state=42)
logreg = LogisticRegression(random_state=42)
print(cross_val_score(logreg,X_train,y_train,cv=kf).mean())
print(cross_val_score(logreg,X_test,y_test,cv=kf).mean())

0.6384750396597698
0.5858211505600911


In [62]:
rf = RandomForestClassifier(random_state=42)
cross_val_score(rf,truncated,y,cv=kf).mean()

0.5383613326585935

In [63]:
knn_params = {
    'n_neighbors': [5,10,20],
    'weights': ['distance'],
    'metric': ['manhattan']
}

grid = GridSearchCV(KNeighborsClassifier(),
                    knn_params,
                    cv=3,
                    verbose = 1,
                   return_train_score = True)

grid.fit(truncated, y)
grid.best_score_

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  6.7min finished


0.4775501307759372

In [64]:
vote = VotingClassifier([
    ('ada', AdaBoostClassifier()),
    ('gb', GradientBoostingClassifier())
])
vote.fit(X_train, y_train)
print(vote.score(X_train, y_train))
print(vote.score(X_test, y_test))

0.8311537343795409
0.6094158674803836


  if diff:
  if diff:


### After trying many different combinations of parameters for all this models the best score is around  0.6233. 
### There is still a big room for experiments, with different combinations of features, stop-words and parameters. Also I think that creating dataset of raitings of actors and directors and even other members of the crew will help a lot.

### Next most exciting step of this project is [NLP modeling with entire movie scripts](./3_NLP_Scripts.ipynb).