In [1]:
import gc 
import os
import time
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from contextlib import contextmanager
from tqdm import tqdm

import plotly
import cufflinks as cf
import plotly.offline as offline
import plotly.graph_objs as go

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import Scatter, Figure, Layout
import plotly.express as px

# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")
# Explicit FutureWarning filter; the blanket "ignore" filter above already covers it.
warnings.simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline
# Switch plotly to notebook rendering (connected=True) and cufflinks to offline mode.
init_notebook_mode(connected=True)
cf.set_config_file(offline=True)
# Raise pandas' display limits so wide/long frames are printed with less truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

def selectGenre(df, genre='Drama'):
    """Return the rows of ``df`` whose ``genre`` indicator column equals 1.

    Parameters
    ----------
    df : DataFrame holding one indicator column per genre.
    genre : name of the indicator column to filter on (default ``'Drama'``).
    """
    is_in_genre = df[genre].eq(1)
    return df.loc[is_in_genre]

# Load the poster-analysis table.
DATA_PATH = "../input/movie-poster-analysis/movies_analysis_extended_genre.csv"
data = pd.read_csv(DATA_PATH)
# Capture the last 27 column names (the genre indicator columns) before the
# helper column below is appended to the frame.
genre_list = data.columns[-27:].tolist()
data['dummy_count'] = 1
data.head(1)

Unnamed: 0,file,label,score,imdbId,count,Imdb Link,Title,IMDB Score,Genre,Poster,imdb_title_id_x,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics,imdb_title_id_y,weighted_average_vote,total_votes,mean_vote,median_vote,votes_10,votes_9,votes_8,votes_7,votes_6,votes_5,votes_4,votes_3,votes_2,votes_1,allgenders_0age_avg_vote,allgenders_0age_votes,allgenders_18age_avg_vote,allgenders_18age_votes,allgenders_30age_avg_vote,allgenders_30age_votes,allgenders_45age_avg_vote,allgenders_45age_votes,males_allages_avg_vote,males_allages_votes,males_0age_avg_vote,males_0age_votes,males_18age_avg_vote,males_18age_votes,males_30age_avg_vote,males_30age_votes,males_45age_avg_vote,males_45age_votes,females_allages_avg_vote,females_allages_votes,females_0age_avg_vote,females_0age_votes,females_18age_avg_vote,females_18age_votes,females_30age_avg_vote,females_30age_votes,females_45age_avg_vote,females_45age_votes,top1000_voters_rating,top1000_voters_votes,us_voters_rating,us_voters_votes,non_us_voters_rating,non_us_voters_votes,usa_gross_income_converted,worlwide_gross_income_converted,Mystery,Biography,War,History,Documentary,Sport,Talk-Show,News,Adventure,Action,Sci-Fi,nan,Short,Horror,Musical,Comedy,Music,Family,Romance,Fantasy,Western,Animation,Reality-TV,Drama,Crime,Film-Noir,Thriller,dummy_count
0,100014_0.jpg,M,0.8,100014,0,http://www.imdb.com/title/tt100014,Liu jai yim taam (1987),6.0,['Fantasy'],https://images-na.ssl-images-amazon.com/images...,tt0100014,Liu jai yim taam,Liu jai yim taam,1990.0,1990-05-19,"Fantasy, Horror",90.0,Hong Kong,Cantonese,Ngai Choi Lam,Kwan Tsang,Diagonal Pictures,"Amy Yip, Chi Jan Ha, So Man, Hitomi Kudô, Wai-...","Three vixens have meditated for 1,000 years to...",5.7,575.0,,,,,11.0,9.0,tt0100014,5.7,575.0,6.2,6.0,58.0,34.0,78.0,70.0,148.0,75.0,45.0,32.0,12.0,23.0,4.0,1.0,5.3,41.0,5.6,201.0,5.9,154.0,5.7,382.0,,,5.5,35.0,5.7,192.0,5.9,143.0,5.2,26.0,4.0,1.0,3.5,6.0,4.9,8.0,6.4,11.0,5.2,30.0,6.1,85.0,5.6,295.0,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1


### Demographics of Movies' Earnings by M/F Label on Posters

In [2]:
# Release year vs. IMDB score for the rows ranked highest on `usa_gross_income`,
# coloured by poster label and sized by the same gross column.
top_count = 300
tf = (
    data.dropna(subset=['usa_gross_income'])
        .sort_values(by='usa_gross_income', ascending=False)
        .head(top_count)
)
fig = px.scatter(
    tf,
    x="year",
    y="IMDB Score",
    color="label",
    size='usa_gross_income',
    hover_data=['Title'],
    opacity=.9,
)
fig.update_layout(
    title=f"Top {top_count} movies USA Box Office Returns",
    xaxis_title="Released Year",
    yaxis_title="IMDB Scores",
)
fig.show()

In [3]:
# Histogram over release year of `usa_gross_income_converted` for the top earners,
# split by poster label.
# NOTE(review): rows are ranked by `usa_gross_income` while the plotted y column is
# `usa_gross_income_converted` — confirm the ranking column is numeric.
top_count = 1000
tf = data.dropna(subset=['usa_gross_income'])
tf = tf.sort_values(by='usa_gross_income', ascending=False).head(top_count)
fig = px.histogram(tf, x="year", y="usa_gross_income_converted", color="label",
                   marginal="box", # or violin, rug
                   hover_data=tf.columns,
                   color_discrete_sequence=['#ff2e63','#08d9d6'])
fig.update_layout(
    # The x axis carries `year`; the title and x caption previously said "IMDB Scores".
    title=f"Top {top_count} movies USA Box Office Returns by Release Year",
    xaxis_title="Released Year",
    yaxis_title="Gross Income in USD",
)
fig.show()

In [4]:
# IMDB score per production company for the top earners, coloured by poster label.
top_count = 1000
tf = data.dropna(subset=['usa_gross_income'])
tf = tf.sort_values(by='usa_gross_income', ascending=False).head(top_count)
fig = px.scatter(tf, x="production_company", y="IMDB Score", color="label",
                 hover_data=['Title'],
                 marginal_y='rug'
                )
fig.update_layout(
    title="Biased production companies with more Males on Poster",
    # The x axis carries `production_company`; it was previously captioned "Released Year".
    xaxis_title="Production Company",
    yaxis_title="IMDB Scores",
)
fig.show()

In [5]:
# US gross per production company for the top earners, coloured by poster label.
top_count = 1000
tf = data.dropna(subset=['usa_gross_income'])
tf = tf.sort_values(by='usa_gross_income', ascending=False).head(top_count)
fig = px.scatter(tf, x="production_company", y="usa_gross_income", color="label",
                 hover_data=['Title'],
                 marginal_y="violin",
                 marginal_x="box",
                )

fig.update_layout(
    title="Gross Earnings in USD",
    # Captions now match the plotted columns; they previously read
    # "Released Year" / "IMDB Scores".
    xaxis_title="Production Company",
    yaxis_title="USA Gross Income",
)
fig.show()

In [6]:
# Release year vs. production company for the top earners; marker size follows IMDB score.
top_count = 500
tf = (
    data.dropna(subset=['usa_gross_income'])
        .sort_values(by='usa_gross_income', ascending=False)
        .head(top_count)
)
fig = px.scatter(
    tf,
    x="year",
    y="production_company",
    color="label",
    size='IMDB Score',
    size_max=7,
    opacity=1,
    hover_data=['Title'],
    marginal_y="box",
    color_discrete_sequence=['#f6416c', '#00b8a9'],
)
fig.update_layout(
    xaxis_title="Released Year",
    yaxis_title="Production Company",
)
fig.show()

In [7]:
# US gross vs. writer for the top earners, coloured by poster label.
top_count = 100
tf = (
    data.dropna(subset=['usa_gross_income'])
        .sort_values(by='usa_gross_income', ascending=False)
        .head(top_count)
)
fig = px.scatter(
    tf,
    x="usa_gross_income",
    y="writer",
    color="label",
    hover_data=['Title'],
    marginal_y="violin",
    color_discrete_sequence=['#f6416c', '#00b8a9'],
)
# The layout call that followed passed no arguments, so default axis captions are kept.
fig.show()

In [8]:
# Release year vs. writer for the top earners, coloured by poster label.
top_count = 1000
tf = (
    data.dropna(subset=['usa_gross_income'])
        .sort_values(by='usa_gross_income', ascending=False)
        .head(top_count)
)
fig = px.scatter(
    tf,
    x="year",
    y="writer",
    color="label",
    hover_data=['Title'],
    marginal_y="box",
    marginal_x="histogram",
    color_discrete_sequence=['#f6416c', '#00b8a9'],
)
# The layout call that followed passed no arguments, so default axis captions are kept.
fig.show()

In [9]:
# US gross vs. director for the top earners, coloured by poster label.
top_count = 1000
tf = data.dropna(subset=['usa_gross_income'])
tf = tf.sort_values(by='usa_gross_income', ascending=False).head(top_count)
# `trendline=True` is not one of the trendline modes plotly express accepts, and a
# fitted line over the categorical `director` axis has no meaning, so it is omitted.
# `color_continuous_midpoint` is dropped too: `label` holds discrete M/F values, so no
# continuous colour scale is involved.
fig = px.scatter(tf, x="usa_gross_income", y="director", color="label",
                 hover_data=['Title'],
                 title='Posters with Male/Female posters at Profits',
                 marginal_y="violin", opacity=.8)
fig.show()

In [10]:
# IMDB score vs. director for the top earners; colour and marker symbol both encode the label.
top_count = 1000
tf = data.dropna(subset=['usa_gross_income'])
tf = tf.sort_values(by='usa_gross_income', ascending=False).head(top_count)
fig = px.scatter(tf, x="IMDB Score", y="director", color="label",
                 # Title typo fixed ("IMDGB" -> "IMDB").
                 title='IMDB with Male/Female posters at Profits',
                 hover_data=['Title'], symbol='label',
                 marginal_y="box",
                 color_discrete_sequence=['#e61c5d','#00b8a9']
)
fig.update_layout(
    xaxis_title="IMDB Score",
    yaxis_title="Director",
)
fig.show()

In [11]:
# Top earners, re-ordered by label and with the label recoded to 1 for "M" and 0 otherwise.
top_count = 1000
tf = (
    data.dropna(subset=['usa_gross_income'])
        .sort_values(by='usa_gross_income', ascending=False)
        .head(top_count)
)
# sort_values returns a new frame, so `tf` keeps its original string labels.
tf_parallel = tf.sort_values(by='label')
tf_parallel['label'] = tf_parallel['label'].eq("M").astype(int)

In [12]:
# Parallel-coordinates view of age/gender vote averages and IMDB score, coloured by label.
information_cols = ['label','males_18age_avg_vote','males_45age_avg_vote',
                    'females_45age_avg_vote','females_18age_avg_vote','IMDB Score']
color_scale = px.colors.diverging.Temps
fig = px.parallel_coordinates(tf_parallel, color="label",
                              dimensions=information_cols,
                              labels= {'label' : "Gender"},
                              color_continuous_scale=color_scale,
                              # `label` was recoded to 0/1 when tf_parallel was built, so the
                              # centre of the diverging scale is 0.5 (1.5 is outside the data range).
                              color_continuous_midpoint=0.5)
fig.show()

In [13]:
# Parallel-coordinates view of year, budget and US gross for the top earners, coloured by label.
top_count = 3000
tf = data.dropna(subset=['usa_gross_income'])
tf = tf.sort_values(by='usa_gross_income', ascending=False).head(top_count)
tf_parallel = tf
tf_parallel = tf_parallel.sort_values(by='label')
# Recode the label to 1 for "M" and 0 otherwise so it can drive a continuous colour scale.
tf_parallel['label'] = (tf_parallel['label'] == "M").astype(int)
information_cols = ['year','budget','usa_gross_income','label']
color_scale = px.colors.diverging.Temps
fig = px.parallel_coordinates(tf_parallel, color="label",
                              dimensions=information_cols,
                              labels= {'label' : "Gender"},
                              color_continuous_scale=color_scale,
                              # Labels are 0/1, so the diverging scale is centred on 0.5
                              # (the previous 1.5 is outside the data range).
                              color_continuous_midpoint=0.5)
fig.show()

In [14]:
from wordcloud import WordCloud, STOPWORDS
from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

In [15]:
# Rows labelled 'M' and 'F' respectively, keeping only those with a description.
male_des, female_des = (
    data.loc[data["label"] == gender].dropna(subset=['description'])
    for gender in ('M', 'F')
)

In [16]:
# Display the description column of the rows labelled 'F'.
female_des['description']

1        Two women, black and white, in 1955 Montgomery...
2        Two women, black and white, in 1955 Montgomery...
3        Thousands of kilometers apart, three young mig...
4        Thousands of kilometers apart, three young mig...
14       Vampish miss Dolan hires hardboiled P.I. Harry...
                               ...                        
22278    James Porter (Martin Lawrence) drives his teen...
22279    The later years of Quentin Crisp's life in New...
22285    The quirky story of Vera and Axl who both live...
22317    A look at the inspiration behind Thomas Kinkad...
22318    Los Angeles screenwriter David Sumner relocate...
Name: description, Length: 3742, dtype: object

In [17]:
# Keep one row per movie so a title/description is not counted once per poster row.
# A full-row drop_duplicates() also compares per-row fields such as `Unnamed: 0` and
# `file`, so it presumably never finds a match; de-duplicating on the movie id does.
# NOTE(review): assumes `imdbId` identifies a movie — confirm against the dataset.
male_des = male_des.drop_duplicates(subset=['imdbId'])
female_des = female_des.drop_duplicates(subset=['imdbId'])

In [18]:
from collections import defaultdict

## custom function for ngram generation ##
def generate_ngrams(text, n_gram=1):
    """Split ``text`` into space-joined n-grams of its non-stopword tokens.

    The text is lower-cased and split on single spaces; empty pieces and pieces
    found in ``STOPWORDS`` are dropped before the n-grams are formed.

    Parameters
    ----------
    text : string to tokenise.
    n_gram : number of consecutive tokens per n-gram (default 1).

    Returns
    -------
    list of str, one entry per n-gram, in order of appearance.
    """
    words = []
    for piece in text.lower().split(" "):
        if piece and piece not in STOPWORDS:
            words.append(piece)
    if n_gram < 1:
        # No token windows exist for a non-positive size.
        return []
    window_count = len(words) - n_gram + 1
    return [" ".join(words[start:start + n_gram]) for start in range(window_count)]

## custom function for horizontal bar chart ##
def horizontal_bar_chart(df, color):
    """Build a horizontal ``go.Bar`` trace from a word-count frame.

    Parameters
    ----------
    df : frame with ``word`` and ``wordcount`` columns; both are passed to the
        trace in reverse row order.
    color : marker colour for the bars.

    Returns
    -------
    The ``go.Bar`` trace (legend entry disabled).
    """
    flipped = df.iloc[::-1]
    bar_style = dict(color=color)
    return go.Bar(
        x=flipped["wordcount"].values,
        y=flipped["word"].values,
        orientation='h',
        showlegend=False,
        marker=bar_style,
    )

## Top-40 title unigrams, one bar trace per poster label (female first, then male) ##
# plotly.tools.make_subplots is deprecated in favour of plotly.subplots.make_subplots.
from plotly.subplots import make_subplots

traces = []
for frame, bar_color in ((female_des, '#ff05e2'), (male_des, '#35477d')):
    freq_dict = defaultdict(int)
    # Only `description` was NaN-filtered when these frames were built, so skip missing titles.
    for sent in frame["title"].dropna():
        for word in generate_ngrams(sent):
            freq_dict[word] += 1
    # Passing `columns=` keeps this working even when nothing was counted.
    fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1],
                             columns=["word", "wordcount"])
    traces.append(horizontal_bar_chart(fd_sorted.head(40), bar_color))
trace0, trace1 = traces

# Creating two subplots. The comma between the two titles is required: without it the
# adjacent string literals are concatenated into a single title.
fig = make_subplots(rows=1, cols=2, vertical_spacing=0.04,
                    subplot_titles=["Frequent words of female character in title",
                                    "Frequent words of male characters title"])
fig.add_trace(trace0, row=1, col=1)
fig.add_trace(trace1, row=1, col=2)
fig.update_layout(height=900, paper_bgcolor='rgb(233,233,233)', title="Word Count Plots")
py.iplot(fig, filename='word-plots')

In [19]:
from collections import defaultdict

## custom function for ngram generation ##
def generate_ngrams(text, n_gram=1):
    """Return the space-joined n-grams of ``text`` after stopword removal.

    Re-declares the helper already defined earlier in this notebook. The text is
    lower-cased and split on single spaces; empty pieces and members of
    ``STOPWORDS`` are discarded, then every run of ``n_gram`` consecutive tokens
    is joined with a space.
    """
    kept = [
        piece
        for piece in text.lower().split(" ")
        if piece != "" and piece not in STOPWORDS
    ]
    if n_gram < 1:
        # No token windows exist for a non-positive size.
        return []
    last_start = len(kept) - n_gram
    grams = []
    for start in range(last_start + 1):
        grams.append(" ".join(kept[start:start + n_gram]))
    return grams

## custom function for horizontal bar chart ##
def horizontal_bar_chart(df, color):
    """Return a horizontal ``go.Bar`` trace for a ``word``/``wordcount`` frame.

    Re-declares the helper already defined earlier in this notebook. Both columns
    are handed to the trace in reverse row order; the legend entry is disabled
    and ``color`` is used as the marker colour.
    """
    words_reversed = df["word"].values[::-1]
    counts_reversed = df["wordcount"].values[::-1]
    return go.Bar(
        y=words_reversed,
        x=counts_reversed,
        showlegend=False,
        orientation='h',
        marker=dict(color=color),
    )

## Top-100 description bigrams, one bar trace per poster label (female first, then male) ##
# plotly.tools.make_subplots is deprecated in favour of plotly.subplots.make_subplots.
from plotly.subplots import make_subplots

traces = []
for frame, bar_color in ((female_des, '#ff05e2'), (male_des, '#35477d')):
    freq_dict = defaultdict(int)
    for sent in frame["description"]:
        for word in generate_ngrams(sent, 2):
            freq_dict[word] += 1
    # Passing `columns=` keeps this working even when nothing was counted.
    fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1],
                             columns=["word", "wordcount"])
    traces.append(horizontal_bar_chart(fd_sorted.head(100), bar_color))
trace0, trace1 = traces

# Creating two subplots
fig = make_subplots(rows=1, cols=2, vertical_spacing=0.04,
                    subplot_titles=["Frequent words of female characters description",
                                    "Frequent words of male character description"])
fig.add_trace(trace0, row=1, col=1)
fig.add_trace(trace1, row=1, col=2)
fig.update_layout(height=900, paper_bgcolor='rgb(233,233,233)', title="Word Count Plots")
fig.show()

## Modelling Gender-Specific Words for M/F

In [20]:
import os
import json
import string
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()

%matplotlib inline

from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

from sklearn import model_selection, preprocessing, metrics, ensemble, naive_bayes, linear_model
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD

# Turn off pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None
# Allow up to 999 columns when a frame is displayed.
pd.options.display.max_columns = 999

In [21]:
# Get the tfidf vectors #
# Missing descriptions are replaced by the literal string 'nan' before vectorising.
tfidf_vec = TfidfVectorizer(stop_words='english', ngram_range=(1,3))
# fit_transform already returns the matrix for the corpus it was fitted on, so the
# result is kept instead of being discarded and recomputed with a second transform().
train_tfidf = tfidf_vec.fit_transform(data['description'].fillna('nan').values.tolist())
# Binary target: 1 where the poster label is "M", 0 otherwise.
data['target'] = (data['label'] == "M").astype(int)

train_y = data["target"].values

def runModel(train_X, train_y, test_X, test_y, test_X2, random_state=2017):
    """Fit a logistic regression and score two feature matrices with it.

    Parameters
    ----------
    train_X, train_y : features and binary targets used for fitting.
    test_X : first feature matrix to score.
    test_y : unused; kept so existing callers keep working.
    test_X2 : second feature matrix to score.
    random_state : seed forwarded to ``LogisticRegression``. The ``sag`` solver is
        stochastic, so an unseeded fit gives different weights on every run.

    Returns
    -------
    (pred_test_y, pred_test_y2, model): class-1 probabilities for ``test_X`` and
    ``test_X2``, followed by the fitted model.
    """
    model = linear_model.LogisticRegression(C=5., solver='sag', random_state=random_state)
    model.fit(train_X, train_y)
    # Column 1 of predict_proba is the probability of the positive class.
    pred_test_y = model.predict_proba(test_X)[:, 1]
    pred_test_y2 = model.predict_proba(test_X2)[:, 1]
    return pred_test_y, pred_test_y2, model

print(">> Building model.")
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([data.shape[0]])
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)

# Only the first of the five folds is fitted and evaluated.
dev_index, val_index = next(kf.split(data))
dev_X, val_X = train_tfidf[dev_index], train_tfidf[val_index]
dev_y, val_y = train_y[dev_index], train_y[val_index]
pred_val_y, pred_test_y, model = runModel(dev_X, dev_y, val_X, val_y, train_tfidf)
pred_full_test = pred_full_test + pred_test_y
pred_train[val_index] = pred_val_y
cv_scores.append(metrics.log_loss(val_y, pred_val_y))

# Report F1 on the validation fold for cut-offs from 0.10 to 0.20 in steps of 0.01.
for raw_thresh in np.arange(0.1, 0.201, 0.01):
    thresh = np.round(raw_thresh, 2)
    hard_preds = (pred_val_y > thresh).astype(int)
    fold_f1 = metrics.f1_score(val_y, hard_preds)
    print("F1 score at threshold {0} is {1}".format(thresh, fold_f1))

>> Building model.
F1 score at threshold 0.1 is 0.6939433934594516
F1 score at threshold 0.11 is 0.6938655708834752
F1 score at threshold 0.12 is 0.6939794419970632
F1 score at threshold 0.13 is 0.6949102677258018
F1 score at threshold 0.14 is 0.6948453608247424
F1 score at threshold 0.15 is 0.6947150870977267
F1 score at threshold 0.16 is 0.6950606329488317
F1 score at threshold 0.17 is 0.6952268010672991
F1 score at threshold 0.18 is 0.6951998811116065
F1 score at threshold 0.19 is 0.6945644080416978
F1 score at threshold 0.2 is 0.6950164130110416


In [22]:
import eli5
# Show the 50 strongest model weights, naming class 1 'Male' and class 0 'Female'.
eli5.show_weights(
    model,
    vec=tfidf_vec,
    top=50,
    highlight_spaces=True,
    target_names={1: 'Male', 0: 'Female'},
    horizontal_layout=False,
)


The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).



Using TensorFlow backend.


Weight?,Feature
+2.396,men
+2.033,gay
+1.781,army
+1.761,corruption
+1.715,irish
+1.679,drug
+1.640,help
+1.635,jj
+1.610,christmas
+1.585,try


In [23]:
# Show the 10 strongest model weights.
eli5.show_weights(
    model,
    vec=tfidf_vec,
    top=10,
    highlight_spaces=True,
)

Weight?,Feature
+2.396,men
+2.033,gay
… 55268 more positive …,… 55268 more positive …
… 53634 more negative …,… 53634 more negative …
-1.971,popular
-2.001,girls
-2.060,accident
-2.077,woman
-2.093,husband
-2.209,love
