In [1]:
import json
import pandas as pd
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
# setup a new hash to store the results in
def script_cleaning(script):
    processed_article_hash = {}
    porter_stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    # iterate through the keys, i.e. document ids, in the hash to pull out the stored text and process
    for key in script.keys():
        text_of_article = script[key]
        word_tokens = word_tokenize(text_of_article)
        words = [word for word in word_tokens if word.isalpha()]
        words = [str.lower(w) for w in words if not str.lower(w) in stop_words]
        processed_article_hash[key] = [porter_stemmer.stem(word) for word in words]
    return processed_article_hash

def cosine_similarity(document_1_data, document_2_data):
    document_vector_word_index = list(set.union(set(document_1_data),set(document_2_data))) 
    document_1_vector = np.array([document_1_data.count(word) for word in document_vector_word_index])
    document_2_vector = np.array([document_2_data.count(word) for word in document_vector_word_index])
    dot_product_of_two_document_vectors = document_1_vector.dot(document_2_vector)/(np.sqrt(np.dot(document_1_vector,document_1_vector)) * np.sqrt(np.dot(document_2_vector,document_2_vector)))
    return dot_product_of_two_document_vectors 

def prep_data_structs(processed_article_hash):
    data_structure_for_cosine_similarity = {}#

    for doc_1_key in processed_article_hash.keys():
        data_structure_for_cosine_similarity[doc_1_key] = {}
        for doc_2_key in processed_article_hash.keys():
            # we have the nested for loops as one way to compare each document to each other document
            data_structure_for_cosine_similarity[doc_1_key][doc_2_key] = cosine_similarity(processed_article_hash[doc_1_key], processed_article_hash[doc_2_key])
    return data_structure_for_cosine_similarity

def create_heatmap(data_dict):
    temp = pd.DataFrame(data_dict)
    temp = temp.sort_index()[temp.sort_index().index]
    
    article_row = [str(each) for each in temp.index]
    article_col = [str(each) for each in temp.columns]

    data = temp.values
    fig, ax = plt.subplots(figsize=(12,12))
    im = ax.imshow(data)

    # We want to show all ticks...
    ax.set_xticks(np.arange(len(article_col)))
    ax.set_yticks(np.arange(len(article_row)))
    # ... and label them with the respective list entries
    ax.set_xticklabels(article_col)
    ax.set_yticklabels(article_row)

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    for i in range(len(article_row)):
        for j in range(len(article_col)):
            text = ax.text(j, i, round(data[i, j],2),
                           ha="center", va="center", color="w")

    ax.set_title("Similarity Heatmap")
    fig.tight_layout()
    plt.show()


def read_json(file):
    with open(file) as f:
        story = json.load(f)
    return story

In [4]:
movie_dict = {}
for file in os.listdir('../diag_jsons/'):
    filepath = '../diag_jsons/' + file
    movie = read_json(filepath)
    processed_data = script_cleaning(movie['dialogues'])
    data_structs_c = prep_data_structs(processed_data)
    temp = [np.mean(list(data_structs_c[k].values())) for k in data_structs_c.keys()]
    movie_dict[filepath] = np.mean(temp)



In [88]:
import pandas as pd
data = pd.read_csv('../feat_extraction/movies_with_feats_use_me.csv')

In [89]:
movie_df = pd.DataFrame({'filepath':list(movie_dict.keys()),'avg_sim_score':list(movie_dict.values())})

In [90]:
movie_df['filepath'] = movie_df['filepath'].str.split('/').str[-1].str.split('.').str[0].str.split('_').str[0]

In [91]:
movie_df.columns = ['Processed Title','avg_sim_score']

In [92]:
movie = data.merge(movie_df, on='Processed Title')

In [93]:
movie.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Processed Title', 'Success',
       'n_unique_words_char_1', 'n_unique_words_char_2',
       'n_unique_words_char_3', 'n_unique_words_char_4',
       'n_unique_words_char_5', 'FK_read_level_char_1', 'FK_read_level_char_2',
       'FK_read_level_char_3', 'FK_read_level_char_4', 'FK_read_level_char_5',
       'n_stop_words_char_1', 'n_stop_words_char_2', 'n_stop_words_char_3',
       'n_stop_words_char_4', 'n_stop_words_char_5', 'overall_polarity_char_1',
       'overall_polarity_char_2', 'overall_polarity_char_3',
       'overall_polarity_char_4', 'overall_polarity_char_5', 'tot_sents',
       'tot_pass_sents', 'passive_ratio', 'num_pass_sents_char_1',
       'num_pass_sents_char_2', 'num_pass_sents_char_3',
       'num_pass_sents_char_4', 'num_pass_sents_char_5', 'num_sents_char_1',
       'num_sents_char_2', 'num_sents_char_3', 'num_sents_char_4',
       'num_sents_char_5', 'polarity_of_mentions_char_1',
       'polarity_of_mentions_char_2', 'pola

In [94]:
success_data = pd.read_csv("../success_data.csv").drop('Unnamed: 0', axis=1) #, index_col=0)


In [95]:
def discretize(row):
    if row["Worldwide ROI (%)"] > 100:
        return 3
    elif 100 > row["Worldwide ROI (%)"] > 25:
        return 2
    elif 25 > row["Worldwide ROI (%)"] > -25:
        return 1
    else:
        return 0

In [96]:
success_data['target'] = success_data.apply(discretize,axis=1)

In [97]:
movie_df = movie.merge(success_data, on='Processed Title')

In [98]:
movie_df = movie_df.drop(['Unnamed: 0','Unnamed: 0.1'],axis=1)

In [166]:
filterCols = ['passive_ratio', 'pct_coref_sents', 'tot_unique_per_sent','tot_stop_per_sent', 'std_of_overall_polarity', 
              'wav_polarity', 'avg_FK', 'sign_check_char_mention_polairty', 'std_of_char_mention_polarity', 
              'Processed Title', 'avg_sim_score', 'Success']

In [167]:
df = movie_df[filterCols]
df = df.replace('?',np.NaN)
df = df.dropna(how='any')

X = df.drop('Success',axis=1)
y = df['Success']

In [168]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_id = X['Processed Title']
X = ss.fit_transform(X.drop('Processed Title',axis=1).values)

In [169]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

clfs = {'lr': LogisticRegression(random_state=0),
        'mlp': MLPClassifier(random_state=0),
        'dt': DecisionTreeClassifier(random_state=0),
        'rf': RandomForestClassifier(random_state=0),
        'svc': SVC(random_state=0)}

In [170]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe_clfs = {}

for name, clf in clfs.items():
    # Implement me
    pipe_clfs[name] = Pipeline([('StandardScaler',StandardScaler()),('clf',clf)])

In [171]:
param_grids = {}

In [172]:
C_range = [10 ** i for i in range(-4, 5)]

param_grid = [{'clf__multi_class': ['ovr'], 
               'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
               'clf__C': C_range},
              {'clf__multi_class': ['multinomial'],
               'clf__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
               'clf__C': C_range}]

# Implement me
param_grids['lr'] = param_grid

In [173]:
param_grid = [{'clf__hidden_layer_sizes': [10, 100, 200],
               'clf__activation': ['identity', 'logistic', 'tanh', 'relu']}]

# Implement me
param_grids['mlp'] = param_grid

In [174]:
param_grid = [{'clf__min_samples_split': [2, 10, 30],
               'clf__min_samples_leaf': [1, 10, 30]}]

# Implement me
param_grids['dt'] = param_grid

In [175]:
param_grid = [{'clf__n_estimators': [2, 10, 30],
               'clf__min_samples_split': [2, 10, 30],
               'clf__min_samples_leaf': [1, 10, 30]}]

# Implement me
param_grids['rf'] = param_grid

In [176]:
param_grid = [{'clf__C': [0.01, 0.1, 1, 10, 100],
               'clf__gamma': [0.01, 0.1, 1, 10, 100],
               'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid']}]

# Implement me
param_grids['svc'] = param_grid

In [177]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# The list of [best_score_, best_params_, best_estimator_]
best_score_param_estimators = []

# For each classifier
for name in pipe_clfs.keys():
    # GridSearchCV
    # Implement me
    gs = GridSearchCV(estimator=pipe_clfs[name], param_grid=param_grids[name], scoring='accuracy', n_jobs=-1, cv=StratifiedKFold(
    n_splits=10, shuffle=True, random_state=10))
    
    # Fit the pipeline
    gs.fit(X, y)
    
    # Update best_score_param_estimators
    best_score_param_estimators.append([gs.best_score_, gs.best_params_, gs.best_estimator_])



In [178]:
best_score_param_estimators = sorted(best_score_param_estimators, key=lambda x : x[0], reverse=True)

# For each [best_score_, best_params_, best_estimator_]
for best_score_param_estimator in best_score_param_estimators:
    # Print out [best_score_, best_params_, best_estimator_], where best_estimator_ is a pipeline
    # Since we only print out the type of classifier of the pipeline
    print([best_score_param_estimator[0], best_score_param_estimator[1], type(best_score_param_estimator[2].named_steps['clf'])], end='\n\n')

[0.8333333333333334, {'clf__C': 0.0001, 'clf__multi_class': 'ovr', 'clf__solver': 'newton-cg'}, <class 'sklearn.linear_model.logistic.LogisticRegression'>]

[0.8333333333333334, {'clf__activation': 'identity', 'clf__hidden_layer_sizes': 10}, <class 'sklearn.neural_network.multilayer_perceptron.MLPClassifier'>]

[0.8333333333333334, {'clf__min_samples_leaf': 30, 'clf__min_samples_split': 2}, <class 'sklearn.tree.tree.DecisionTreeClassifier'>]

[0.8333333333333334, {'clf__min_samples_leaf': 1, 'clf__min_samples_split': 30, 'clf__n_estimators': 30}, <class 'sklearn.ensemble.forest.RandomForestClassifier'>]

[0.8333333333333334, {'clf__C': 0.01, 'clf__gamma': 0.01, 'clf__kernel': 'linear'}, <class 'sklearn.svm.classes.SVC'>]



In [179]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

In [180]:
best_score_param_estimators[0][-1].predict(X_test)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1])

In [181]:
from imblearn.over_sampling import RandomOverSampler

In [183]:
ros = RandomOverSampler()
X_ros, y_ros = ros.fit_sample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_ros, y_ros, test_size=0.3, random_state=0, stratify=y_ros)

In [184]:
X_ros, y_ros = ros.fit_sample(X, y)

In [185]:
X_ros.shape

(890, 10)

In [186]:
y_ros.shape

(890,)

In [187]:
X_train, X_test, y_train, y_test = train_test_split(X_ros, y_ros, test_size=0.3, random_state=0, stratify=y_ros)

In [188]:
best_score_param_estimators[0][-1].predict(X_test)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1])

In [190]:
df.to_csv('../playground/nlp_movie_data.csv',index=False)