# Modeling with scripts

First let's import necessary libraries and datasets.

In [None]:
import datetime
import pandas as pd
import numpy as np
import requests
import time
import regex as re
import nltk
from tqdm import tqdm
from gensim.models import Doc2Vec
from sklearn import utils
import gensim
from gensim.models.doc2vec import TaggedDocument
from sklearn.metrics import accuracy_score, f1_score, classification_report
from nltk.tokenize.regexp import RegexpTokenizer
from sklearn.utils import shuffle
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from bs4 import BeautifulSoup
import multiprocessing
from nltk.stem import PorterStemmer,WordNetLemmatizer
import nltk
from progressbar import ProgressBar
np.random.seed(42)

In this notebook I will use data sets of the movie scripts from www.imsdb.com in form of 1093 .txt files, it's budgets and revenues from www.the-numbers.com/movie/budgets in combination with budgets and revenues from metadata set I already used in previous two notebooks.

In [32]:
df_budget = pd.read_csv('../Data/Budgets_5686 - Budgets.csv')
df_budget.head()

Unnamed: 0,0.09236111111,3500000,0,0.1
0,0.4680555556,6000000,0,0
1,3.0,7200000,59774,295492
2,9.0,30000000,31749894,48559999
3,21.0,35000000,81159365,159846429
4,31.0,1500000,779820,922727


Need to clean it, rename columns and replace some zeros in 'budget' and 'revenue' columns.

In [33]:
df_budget = df_budget.rename(index=str, columns={"0.09236111111": "title", "3500000": "budget",
                             "0": "usa", "0.1": "revenue"})

In [34]:
df_budget.drop(columns=['usa'], inplace=True)

In [35]:
df_budget['revenue'] = df_budget['revenue'].replace(0, np.nan)
df_budget['budget'] = df_budget['budget'].replace(0, np.nan)
df_budget.dropna(inplace = True)
df_budget.shape

(5330, 3)

In [36]:
df_budget.tail()

Unnamed: 0,title,budget,revenue
5680,Zootopia,150000000,1019704000.0
5681,Zulu,16000000,1844228.0
5682,Zwartboek,22000000,27238350.0
5683,ПОСЛЕДНИЙ БОГАТЫРЬ,8500000,30618270.0
5684,長江七號 (CJ7),20000000,47300770.0


## Now Movies Data base

In [37]:
movies = pd.read_csv('../Data/movies_metadata.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [38]:
movies.drop(movies.index[19730],inplace=True)
movies.drop(movies.index[29502],inplace=True)
movies.drop(movies.index[35585],inplace=True)

In [39]:
movies['budget'] = pd.to_numeric(movies['budget'], errors='coerce')

In [40]:
movies['revenue'] = movies['revenue'].replace(0, np.nan)
movies['budget'] = pd.to_numeric(movies['budget'], errors='coerce')
movies['budget'] = movies['budget'].replace(0, np.nan)

In [41]:
df_budget1 = movies[['title', 'budget', 'revenue']]
df_budget1.head()

Unnamed: 0,title,budget,revenue
0,Toy Story,30000000.0,373554033.0
1,Jumanji,65000000.0,262797249.0
2,Grumpier Old Men,,
3,Waiting to Exhale,16000000.0,81452156.0
4,Father of the Bride Part II,,76578911.0


In [42]:
df_budget1.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [43]:
df_budget1.head()

Unnamed: 0,title,budget,revenue
0,Toy Story,30000000.0,373554033.0
1,Jumanji,65000000.0,262797249.0
3,Waiting to Exhale,16000000.0,81452156.0
5,Heat,60000000.0,187436818.0
8,Sudden Death,35000000.0,64350171.0


In [44]:
df_budget1.shape

(5381, 3)

## Now merge 2 dfs with budgets

In [45]:
df_budget.shape

(5330, 3)

In [46]:
df_budget1.shape

(5381, 3)

In [47]:
df_combined_budgets = pd.concat([df_budget, df_budget1], axis =0).drop_duplicates(subset=['title'])
df_combined_budgets.head()

Unnamed: 0,title,budget,revenue
1,3,7200000.0,295492.0
2,9,30000000.0,48559999.0
3,21,35000000.0,159846429.0
4,31,1500000.0,922727.0
5,42,31000000.0,97470701.0


In [48]:
df_combined_budgets[df_combined_budgets['title'] == 'Madea\'s Family Reunion']

Unnamed: 0,title,budget,revenue
2656,Madea's Family Reunion,10000000.0,63320521.0


In [49]:
df_combined_budgets.shape

(7315, 3)

## Now let's get all movie scripts and put them in the dataset. Also will get the titles and clean them properly.

In [50]:
import os
paths = [i for i in os.listdir('./movie_scripts/')]
len(paths)

1093

In [51]:
list_of_scripts = []

for i in paths[1:-1]: 
    with open('./movie_scripts/' + i, 'r') as f:
        raw = f.read()
    list_of_scripts.append((raw,i))

In [52]:
scripts = pd.DataFrame(list_of_scripts, columns=['script', 'title'])
scripts.head()

Unnamed: 0,script,title
0,TEN THINGS I HATE ABOUT YOU written by Karen M...,Script_10 Things I Hate About You.txt
1,...,Script_12 and Holding.txt
2,TWELVE MONKEYS An original screenplay by David...,Script_12 Monkeys.txt
3,12 YEARS A SLAVE ...,Script_12 Years a Slave.txt
4,CUT FROM BLACKTITLE: FINEXTERIOR - LA - DAYFin...,Script_12.txt


In [53]:
scripts.shape

(1091, 2)

In [54]:
scripts['title'] = [x.replace('Script_', '') for x in scripts['title']]
scripts['title'] = [x.replace('.txt', '') for x in scripts['title']]
scripts.head()

Unnamed: 0,script,title
0,TEN THINGS I HATE ABOUT YOU written by Karen M...,10 Things I Hate About You
1,...,12 and Holding
2,TWELVE MONKEYS An original screenplay by David...,12 Monkeys
3,12 YEARS A SLAVE ...,12 Years a Slave
4,CUT FROM BLACKTITLE: FINEXTERIOR - LA - DAYFin...,12


In [55]:
df_merge = scripts.merge(df_combined_budgets, on=['title'] ,how="left")
df_merge.shape

(1091, 4)

In [56]:
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1091 entries, 0 to 1090
Data columns (total 4 columns):
script     1091 non-null object
title      1091 non-null object
budget     642 non-null float64
revenue    642 non-null float64
dtypes: float64(2), object(2)
memory usage: 42.6+ KB


After merging all three with each other on the 'title' column I got 642 observation with no missing data. Let's save it, just in case...

In [90]:
df_merge.to_csv('../Data/Script_budget.csv',index=False)

### Now let's do modeling

In [63]:
df_merge = pd.read_csv('../Data/Script_budget.csv')
df_merge.head()

Unnamed: 0,script,title,budget,revenue
0,TEN THINGS I HATE ABOUT YOU written by Karen M...,10 Things I Hate About You,13000000.0,60414025.0
1,...,12 and Holding,,
2,TWELVE MONKEYS An original screenplay by David...,12 Monkeys,29000000.0,168841459.0
3,12 YEARS A SLAVE ...,12 Years a Slave,20000000.0,181025343.0
4,CUT FROM BLACKTITLE: FINEXTERIOR - LA - DAYFin...,12,4000000.0,7537453.0


In [64]:
df_scripts = df_merge.dropna()

#### Let's create AGAIN a binary column, that indicates if the movie made any profit or not and called it success_binary again by this formula: (Gross - Release_expenses) / Budget

In [65]:
df_scripts['success'] = (df_scripts['revenue'] - df_scripts['budget']) / df_scripts['budget']
df_scripts['success_binary']= [1 if x > 1 else 0 for x in df_scripts['success']]
df_scripts.drop(columns=['revenue', 'budget', 'success'], inplace=True)
df_scripts.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,script,title,success_binary
0,TEN THINGS I HATE ABOUT YOU written by Karen M...,10 Things I Hate About You,1
2,TWELVE MONKEYS An original screenplay by David...,12 Monkeys,1
3,12 YEARS A SLAVE ...,12 Years a Slave,1
4,CUT FROM BLACKTITLE: FINEXTERIOR - LA - DAYFin...,12,0
5,127 HOURS ...,127 Hours,1


In [66]:
df_scripts['success_binary'].mean()

0.6152647975077882

Our benchmark is 0.615. Let's assigning our target and combine scripts with titles.

In [73]:
y = df_scripts['success_binary']

In [67]:
df_scripts['text'] = df_scripts['title'] + df_scripts['script']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


#### First let's try CountVectorizer

In [83]:
stop = list(ENGLISH_STOP_WORDS)


cv = CountVectorizer(min_df=3,stop_words=stop)


cv_arr = cv.fit_transform(df_scripts['text'])

df_vect = pd.DataFrame(cv_arr.toarray(), columns=cv.get_feature_names())
df_vect.shape

(642, 45486)

In [84]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_vect,
                                                    y,
                                                    random_state = 42)

#### And again logistic regression, KFold and RandomForestClassifier

In [85]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [86]:
print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))

1.0
0.5527950310559007


In [87]:
kf = KFold(n_splits=5, shuffle=True,random_state=42)
logreg = LogisticRegression(random_state=42)
print(cross_val_score(logreg,X_train,y_train,cv=kf).mean())
print(cross_val_score(logreg,X_test,y_test,cv=kf).mean())

0.5280713058419244
0.6456439393939395


In [88]:
rf = RandomForestClassifier(random_state=42)
cross_val_score(rf,df_vect,y,cv=kf).mean()

0.5452398255813954

#### And GridSearch with KNeighborsClassifier and VoringClassifier with AdaBoostClassifier and GradientBoostingClassifier.

In [89]:
knn_params = {
    'n_neighbors': [5,15,21],
    'weights': ['distance'],
    'metric': ['manhattan']
}

grid = GridSearchCV(KNeighborsClassifier(),
                    knn_params,
                    cv=3,
                    verbose = 1,
                   return_train_score = True)

grid.fit(df_vect, y)
grid.best_score_

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  3.0min finished


0.6152647975077882

In [93]:
vote = VotingClassifier([
    ('ada', AdaBoostClassifier()),
    ('gb', GradientBoostingClassifier())
])
vote.fit(X_train, y_train)
print(vote.score(X_train, y_train))
print(vote.score(X_test, y_test))

  if diff:


0.9896049896049897
0.5217391304347826


  if diff:


### Unfortunaly none of this models showed any promising results. Logistic regression with KFold split showed the highes 0.645 cross validation score, but it is not the result we want, considering our 0.615 benchmark.

### Finally I will try most advanced NLP technique - DOCtoVEC

First let's prepare our data. Split everything into separate words, clean unnecessary symbols and bring everything to the lower case. 

In [113]:
df_scripts_doc = df_scripts[['text', 'success_binary']]

In [114]:
df_scripts_doc.index = range(642)
df_scripts_doc['text'].apply(lambda x: len(x.split(' '))).sum()

51786121

Eventually we have 51+ millions words

In [117]:
def cleanText(text):
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\|\|\|', r' ', text) 
    text = re.sub(r'http\S+', r'<URL>', text)
    text = text.lower()
    text = text.replace('---', '')
    return text
df_scripts_doc['text'] = df_scripts_doc['text'].apply(cleanText)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [120]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

Now let's use train/test split, remove stop-words and tokenize text using NLTK tokenizer for our text data and target for training and testing set.

In [138]:
train, test = train_test_split(df_scripts_doc, random_state=42)
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens
train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['text']), tags=[r.success_binary]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['text']), tags=[r.success_binary]), axis=1)

For this kind of model I would need all my computational power and I will make sure that I use all my worker threads to train the model.

In [123]:
cores = multiprocessing.cpu_count()

First let's try Distributed Bag of Words (DBOW).

In [170]:
model_dbow = Doc2Vec(dm=0, vector_size=300,
                     negative=1, window=1, hs=0, min_count=1, sample = 1, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 449/449 [00:00<00:00, 358371.55it/s]


In [171]:
%%time

model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=100)
model_dbow.alpha -= 0.001
model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 449/449 [00:00<00:00, 294440.67it/s]


CPU times: user 7min 27s, sys: 14 s, total: 7min 41s
Wall time: 6min 25s


In [172]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

And applying Logistic Regression

In [173]:
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
from sklearn.metrics import accuracy_score, f1_score
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.5595854922279793
Testing F1 score: 0.550593943229311


## Now repeat this process with Distributed Memory using 300 dimensions and 30 epochs

In [130]:
model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 449/449 [00:00<00:00, 323192.47it/s]


In [131]:
%%time
for epoch in range(30):
    model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

100%|██████████| 449/449 [00:00<00:00, 182892.35it/s]
100%|██████████| 449/449 [00:00<00:00, 705863.00it/s]
100%|██████████| 449/449 [00:00<00:00, 617659.07it/s]
100%|██████████| 449/449 [00:00<00:00, 897637.03it/s]
100%|██████████| 449/449 [00:00<00:00, 822376.64it/s]
100%|██████████| 449/449 [00:00<00:00, 800358.05it/s]
100%|██████████| 449/449 [00:00<00:00, 649842.13it/s]
100%|██████████| 449/449 [00:00<00:00, 803431.10it/s]
100%|██████████| 449/449 [00:00<00:00, 933684.93it/s]
100%|██████████| 449/449 [00:00<00:00, 920450.88it/s]
100%|██████████| 449/449 [00:00<00:00, 505568.46it/s]
100%|██████████| 449/449 [00:00<00:00, 571372.12it/s]
100%|██████████| 449/449 [00:00<00:00, 720720.43it/s]
100%|██████████| 449/449 [00:00<00:00, 774041.31it/s]
100%|██████████| 449/449 [00:00<00:00, 1180716.30it/s]
100%|██████████| 449/449 [00:00<00:00, 965765.38it/s]
100%|██████████| 449/449 [00:00<00:00, 713619.74it/s]
100%|██████████| 449/449 [00:00<00:00, 179305.20it/s]
100%|██████████| 449/449 [0

CPU times: user 17min 17s, sys: 9.7 s, total: 17min 27s
Wall time: 4min 47s


In [132]:
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.6113989637305699
Testing F1 score: 0.6108140092107839


With Distributed memory and Logistic Regression we got the result that is equal to our benchmark...

But! According to Gensim doc2vec tutorial on the IMDB sentiment data set, combining a paragraph vector from Distributed Bag of Words (DBOW) and Distributed Memory (DM) improves performance. And that's exactly what we will do.

In [133]:
model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [135]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

In [136]:
def get_vectors(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [137]:
y_train, X_train = get_vectors(new_model, train_tagged)
y_test, X_test = get_vectors(new_model, test_tagged)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.6113989637305699
Testing F1 score: 0.6082093207746456


This result also doesn't makes us happy. Let's try and play with different parameters.

## Big Models

In [143]:
sizes = [50, 150, 250, 300]
windows = [1, 5, 10]
counts = [1,5,10]

In [144]:
summaries_dbow = []
t0 = time.time()
for size in sizes:
    for window in windows:
        for count in counts:
            model_dbow = Doc2Vec(dm=0, vector_size=size, negative=5, window=window, 
                                 hs=0, min_count=count, sample = 0, workers=cores)
            model_dbow.build_vocab([x for x in train_tagged.values])
            
            model_dbow.train(utils.shuffle([x for x in train_tagged.values]), 
                                 total_examples=len(train_tagged.values), epochs=30)            


            y_train, X_train = vec_for_learning(model_dbow, train_tagged)
            y_test, X_test = vec_for_learning(model_dbow, test_tagged)
            logreg = LogisticRegression(n_jobs=1, C=1e5)
            logreg.fit(X_train, y_train)
            
            
            y_train_pred = logreg.predict(X_train)
            y_test_pred = logreg.predict(X_test)

            
            summary = {}
            summary['Size']      = size
            summary['Window']    = window
            summary['Count']     = count
            summary['Train_Acc'] = accuracy_score(y_train, y_train_pred)
            summary['Test_Acc']  = accuracy_score(y_test, y_test_pred)
            summary['Train_F1']  = f1_score(y_train, y_train_pred)
            summary['Test_F1']   = f1_score(y_test, y_test_pred)
            summary['CV_Score']  = cross_val_score(logreg, X_train, y_train, cv = 3).mean()
            summary['Train_Report'] = classification_report(y_train,y_train_pred)
            summary['Test_Report'] = classification_report(y_test,y_test_pred)
            
            print('Size = {}. Window = {}, Count = {}'.format(size,window,count))
            print('Training accuracy %s' % accuracy_score(y_train, y_train_pred))
            print('Training F1 score: {}'.format(f1_score(y_train, y_train_pred, average='weighted')))
            print('Testing accuracy %s' % accuracy_score(y_test, y_test_pred))
            print('Testing F1 score: {}'.format(f1_score(y_test, y_test_pred, average='weighted')))
            
            
            summaries_dbow.append(summary)

print(time.time()-t0)
summaries_dbow_df = pd.DataFrame(summaries_dbow)

Size = 50. Window = 1, Count = 1
Training accuracy 1.0
Training F1 score: 1.0
Testing accuracy 0.6683937823834197
Testing F1 score: 0.6364745854439074
Size = 50. Window = 1, Count = 5
Training accuracy 0.9732739420935412
Training F1 score: 0.9732739420935412
Testing accuracy 0.5958549222797928
Testing F1 score: 0.5946110378942427
Size = 50. Window = 1, Count = 10
Training accuracy 0.9510022271714922
Training F1 score: 0.950951645529052
Testing accuracy 0.616580310880829
Testing F1 score: 0.5947565238705043
Size = 50. Window = 5, Count = 1
Training accuracy 1.0
Training F1 score: 1.0
Testing accuracy 0.6476683937823834
Testing F1 score: 0.6251470695279878
Size = 50. Window = 5, Count = 5
Training accuracy 0.9420935412026726
Training F1 score: 0.9420935412026726
Testing accuracy 0.6062176165803109
Testing F1 score: 0.5326964306786016
Size = 50. Window = 5, Count = 10
Training accuracy 0.888641425389755
Training F1 score: 0.8885264671114816
Testing accuracy 0.5906735751295337
Testing F1 s

In [145]:
summaries_dbow_df.sort_values(by='Test_Acc',ascending=False)

Unnamed: 0,CV_Score,Count,Size,Test_Acc,Test_F1,Test_Report,Train_Acc,Train_F1,Train_Report,Window
0,0.877554,1,50,0.668394,0.769784,precision recall f1-score s...,1.0,1.0,precision recall f1-score s...,1
3,0.88862,1,50,0.647668,0.748148,precision recall f1-score s...,1.0,1.0,precision recall f1-score s...,5
33,0.995526,1,300,0.621762,0.724528,precision recall f1-score s...,1.0,1.0,precision recall f1-score s...,10
2,0.841879,10,50,0.61658,0.723881,precision recall f1-score s...,0.951002,0.959854,precision recall f1-score s...,1
17,0.962163,10,150,0.61658,0.715385,precision recall f1-score s...,1.0,1.0,precision recall f1-score s...,10
30,0.995541,1,300,0.611399,0.719101,precision recall f1-score s...,1.0,1.0,precision recall f1-score s...,5
12,0.986652,1,150,0.611399,0.716981,precision recall f1-score s...,1.0,1.0,precision recall f1-score s...,5
8,0.832975,10,50,0.606218,0.737931,precision recall f1-score s...,0.928731,0.940741,precision recall f1-score s...,10
4,0.852975,5,50,0.606218,0.741497,precision recall f1-score s...,0.942094,0.952381,precision recall f1-score s...,5
28,0.971051,5,300,0.606218,0.707692,precision recall f1-score s...,1.0,1.0,precision recall f1-score s...,1


The best testing result is 0.668394. Now will repeat everying for DM model.

In [146]:
summaries_dm = []
t0 = time.time()
for size in sizes:
    for window in windows:
        for count in counts:
            model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=size, window=window, 
                                negative=5, min_count=count, workers=cores, alpha=0.065, min_alpha=0.065)
            model_dmm.build_vocab(list(train_tagged.values))
            model_dmm.train(utils.shuffle(list(train_tagged.values)), 
                            total_examples=len(train_tagged.values), epochs=30)

            y_train, X_train = vec_for_learning(model_dmm, train_tagged)
            y_test, X_test = vec_for_learning(model_dmm, test_tagged)
            logreg = LogisticRegression(n_jobs=1, C=1e5)
            logreg.fit(X_train, y_train)


            y_train_pred = logreg.predict(X_train)
            y_test_pred = logreg.predict(X_test)


            summary = {}
            summary['Size']      = size
            summary['Window']    = window
            summary['Count']     = count
            summary['Train_Acc'] = accuracy_score(y_train, y_train_pred)
            summary['Test_Acc']  = accuracy_score(y_test, y_test_pred)
            summary['Train_F1']  = f1_score(y_train, y_train_pred)
            summary['Test_F1']   = f1_score(y_test, y_test_pred)
            summary['CV_Score']  = cross_val_score(logreg, X_train, y_train, cv = 3).mean()
            summary['Train_Report'] = classification_report(y_train,y_train_pred)
            summary['Test_Report'] = classification_report(y_test,y_test_pred)

            print('Size = {}. Window = {}, Count = {}'.format(size,window,count))
            print('Training accuracy %s' % accuracy_score(y_train, y_train_pred))
            print('Training F1 score: {}'.format(f1_score(y_train, y_train_pred, average='weighted')))
            print('Testing accuracy %s' % accuracy_score(y_test, y_test_pred))
            print('Testing F1 score: {}'.format(f1_score(y_test, y_test_pred, average='weighted')))

            summaries_dm.append(summary)

print(time.time()-t0)
summaries_dm_df = pd.DataFrame(summaries_dm)

Size = 50. Window = 1, Count = 1
Training accuracy 0.7572383073496659
Training F1 score: 0.7532728239572404
Testing accuracy 0.5233160621761658
Testing F1 score: 0.4961837864336
Size = 50. Window = 1, Count = 5
Training accuracy 0.734966592427617
Training F1 score: 0.7314672143260681
Testing accuracy 0.6113989637305699
Testing F1 score: 0.5790378225611386
Size = 50. Window = 1, Count = 10
Training accuracy 0.7527839643652561
Training F1 score: 0.7483394521498976
Testing accuracy 0.5699481865284974
Testing F1 score: 0.5693008368599342
Size = 50. Window = 5, Count = 1
Training accuracy 0.779510022271715
Training F1 score: 0.7759083446951083
Testing accuracy 0.5440414507772021
Testing F1 score: 0.5180888391973566
Size = 50. Window = 5, Count = 5
Training accuracy 0.77728285077951
Training F1 score: 0.7730913958483807
Testing accuracy 0.5284974093264249
Testing F1 score: 0.5091120049678403
Size = 50. Window = 5, Count = 10
Training accuracy 0.7461024498886414
Training F1 score: 0.742940755

In [147]:
summaries_dm_df.sort_values(by='Test_Acc',ascending=False)

Unnamed: 0,CV_Score,Count,Size,Test_Acc,Test_F1,Test_Report,Train_Acc,Train_F1,Train_Report,Window
8,0.699359,10,50,0.61658,0.725926,precision recall f1-score s...,0.77951,0.823529,precision recall f1-score s...,10
35,0.926458,10,300,0.611399,0.719101,precision recall f1-score s...,1.0,1.0,precision recall f1-score s...,10
10,0.608024,5,150,0.611399,0.705882,precision recall f1-score s...,0.899777,0.91833,precision recall f1-score s...,1
1,0.63918,5,50,0.611399,0.727273,precision recall f1-score s...,0.734967,0.790123,precision recall f1-score s...,1
16,0.717136,5,150,0.595855,0.692913,precision recall f1-score s...,1.0,1.0,precision recall f1-score s...,10
34,0.955436,5,300,0.590674,0.70412,precision recall f1-score s...,1.0,1.0,precision recall f1-score s...,10
12,0.65478,1,150,0.590674,0.70632,precision recall f1-score s...,1.0,1.0,precision recall f1-score s...,5
19,0.676987,5,250,0.585492,0.672131,precision recall f1-score s...,1.0,1.0,precision recall f1-score s...,1
13,0.719374,5,150,0.580311,0.684825,precision recall f1-score s...,1.0,1.0,precision recall f1-score s...,5
14,0.634661,10,150,0.580311,0.696629,precision recall f1-score s...,0.919822,0.934783,precision recall f1-score s...,5


Best result for DM model is 0.616580. The last hope is to run both models again with best parameters and then combining a paragraph vector from both of them.

In [153]:
model_dbow = Doc2Vec(dm=0, vector_size=50, negative=5, window=1, 
                     hs=0, min_count=1, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in train_tagged.values])

model_dbow.train(utils.shuffle([x for x in train_tagged.values]), 
                                 total_examples=len(train_tagged.values), epochs=30)

In [154]:
model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=50, window=10, 
                    negative=5, min_count=10, workers=cores, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab([x for x in train_tagged.values])
model_dmm.train(utils.shuffle([x for x in train_tagged.values]), 
                            total_examples=len(train_tagged.values), epochs=30)

In [155]:
model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])


In [156]:
def get_vectors(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [157]:
y_train, X_train = get_vectors(new_model, train_tagged)
y_test, X_test = get_vectors(new_model, test_tagged)
logreg.fit(X_train, y_train)
y_test_pred = logreg.predict(X_test)
y_train_pred = logreg.predict(X_train)

print('Testing accuracy %s' % accuracy_score(y_test, y_test_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_test_pred, average='weighted')))
print('Training accuracy %s' % accuracy_score(y_train, y_train_pred))
print('Training F1 score: {}'.format(f1_score(y_train, y_train_pred, average='weighted')))

Testing accuracy 0.5699481865284974
Testing F1 score: 0.566418314990608
Training accuracy 1.0
Training F1 score: 1.0


Testing score is 0.5699, even lower than benchmark.

# Executive Summary

None of the most advanced models I used were able to identify any paterns for predicting funancial success with such a thin metter as the art of writing the movie script. As I said before there is still a big room for experiments, with different combinations of features, stop-words and parameters. Mining more data, having a team of people and much more computational power, having some linguists and writers on the team definitely might helps.  
There might be separate models for each genre, to find the pattern in the scripts of the same type.  
Or even create dataset with movie scripts, divided not by the genre as we use to know them, but by the story types, and there is 10 of them according to [Blake Snyder](http://www.savethecat.com/). Devided by what exactly going on with the main character and how he change during the movie. Short example: Die Hard, Shindler's List and Terminator is the same type of the story, even it is completely different genres. Blake Snyder calls this type - "Dude with a problem". It consist INNOCENT HERO, SUDDEN EVENT and LIFE OR DEATH BATTLE Absolutely ordinary person (police man, director of some factory in Poland or weiter in the diner) got into absolutely extraordinary situation - terrorists capture the building, nazis dragging jewish friends to extermination camp or robot from the future (with accent!) trying to kill her and her unborn (and not even conceived yet) child!  
So the scripts might be devided by the story types and then processed through the models. Than extracting just a verbs will helps to determing what's going on with the main character. And if this character's type belongs to this story's type, then we will be able to say that this movie will be successful. And if you will remember a lot of successful movies with most successful actors - their types totaly match the types of the stories they play.  
Also patterns might be found while exploring the structures of the plots. Most successful movie's plots matching with "the perfect structure" with 3 main parts and other important bits. Back in a days I did some analysis about that matter and found that most of the successful movies does that. And the champion of this "matching" is... Steven Spielberg, one of the most successful directors ever.
Again this is just a theories, that requers a lot of men/hours and computational power to be proved ot disproved.  
I will gladly continue my research in this direction, because the fruit at the end is priceless, especially for movie and data enthusiasts like myself in particular and moviemakers in general.  
As for now we still can learn a lot from the old good exploratory analysis and visualizations combined with life experience, knowledge of the industry and common sense.  
In the first notebook with EDA I found that all the movies we explored might be devided by 3 huge clasters.  
First one contains super-expensive blockbusters, animations, anventures, fantasy, sci-fi and family movies in general. Mostly been shot by the same companies, same directors with the same actors and released right before or at the beginning of the summer, so all kids and their young parents can enjoy it. They generating the biggest revenues of all times that might be compare with GDP of come countries. And they definitely worth it. But usualy this revenues equal to just few of it's budgets, which is still tremendous amout of money, but the risks is high too.  
Second cluster contains mid-budget movies - dramas, comedies, actions and historical films, made by directors that already proved themselves, with good and expensive cast, released near the autumn or December, probably colser to awards ceremonies. Because lot of them definitely represents a high artistic value. Most of the legendary movies we know, that survived decades and still fascinates hundreds of millions people is belong to that claster. They accumulate very high revenues and I would say that they have the perfect ballance between budgets, revenues and amouts of the budgets that returns to their creators. But they still risky for investors, because art is very unpredictable and subjective.  
Finally the third claster contains low-budget horrors (mostly), mysteries and even dramas, that were writen by unknown writers, shot by unknow directors with unkown actors (and sometimes it is the same person) with unknown devices (even smartphones this days). But! Because their budgets is so low, their success sometimes makes very jealous even thouse, who sold their bitcoins in December of 2017. The best examples is "Paranormal activity" and "The Blair Witch Project", with 15k or 60k budgets and 200+ millions of dollars revenue! With no special effects and interesting twisted plots this movies serves as a great springboard for everyone, who taking part in it's creation.  
Usualy moviemakers (writers, directors, producers and actors) of successful low budget movies demonstrate perfect transition through this clasters, from third to the first one. My favorite examples of such transitions is: Sylvester Stallone, Matt Damon, Harrison Ford, Arnold Schwarzenegger of course (and not only because of accent and immigration history) and many many others. I would like to bring visual evample of it, based on the numbers:  
  
### James Cameron  
  
  
![image](../Visualizations/James_cameron.png)  
  
  
### Darren Aronofsky  
  
  
![image](../Visualizations/Aranofsky.png)  
  
    
# Epilogue
  

With this being said I wish each and everyone who involved into the process of the creation of the movies - inspiration and courage on their not easy path of bringing joy and happines, excitement and inspiration to the people all over the world. And I, personaly will definitely keep doing this.  
P.S. And don't forget to make sure that your story will have a continuation in order to be the part of collection...  
P.P.S. And of course make a homepage for your movie!

                                                                         to be continued...