In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the TMDB movies and credits tables from the relative input directory.
data_movies = pd.read_csv('../input/tmdb_5000_movies.csv')
data_credits = pd.read_csv('../input/tmdb_5000_credits.csv')
# Remove the cap on displayed columns so wide frames are shown in full.
pd.set_option('display.max_columns',None)

In [3]:
# Preview the first five rows of the movies table.
data_movies.head(5)

In [None]:
from pandas.io.json import json_normalize
import json

In [None]:
def json_decode(data,key):
    """Extract one field from every object of a JSON-encoded array.

    Parameters
    ----------
    data : str
        JSON text that decodes to a list of objects.
    key : str
        Field to look up in each decoded object.

    Returns
    -------
    list
        The value stored under ``key`` for each object, in input order.

    Raises
    ------
    KeyError
        If a decoded object has no ``key`` entry.
    """
    return [entry[key] for entry in json.loads(data)]

In [None]:
# Summary statistics for every column of the movies table (include='all').
data_movies.describe(include='all')

In [None]:
# Summary statistics for every column of the credits table (include='all').
data_credits.describe(include='all')

## Clean missing values ##

In [None]:
def nan_clean(data,replace=False,alter=''):
    """Report the number of missing values in ``data`` and optionally fill them.

    Parameters
    ----------
    data : pandas.Series
        Column to inspect. When filling is requested it is modified in place.
    replace : bool, default False
        If true and at least one value is missing, fill the gaps with ``alter``.
    alter : object, default ''
        Replacement value used when filling.

    Returns
    -------
    None
        The outcome is only printed.

    NOTE(review): the fill is done with ``fillna(..., inplace=True)`` on the
    object passed in; whether that also updates a parent DataFrame when a
    column is passed depends on pandas' copy semantics -- confirm for the
    pandas version in use.
    """
    missing = len(data) - data.count()
    if not (replace and missing > 0):
        # Nothing to fill (or filling not requested): just report the count.
        print('Number of NANs:{}'.format(missing))
        return
    data.fillna(alter,inplace=True)
    print('Replaced NaN with {}'.format(alter))
    print('Number of cleaned NANs:{}'.format(missing))

In [None]:
# Fill missing homepages with nan_clean's default replacement (the empty string).
nan_clean(data_movies.homepage,replace=True)

In [None]:
# Fill missing release dates with the empty string; to_weekday later treats '' as missing.
nan_clean(data_movies.release_date,replace=True)

In [None]:
# Fill missing overviews with nan_clean's default replacement (the empty string).
nan_clean(data_movies.overview,replace=True)

In [None]:
# Runtime is filled with 0 instead of the default '' (alter=0).
nan_clean(data_movies.runtime,replace=True,alter=0)

In [None]:
# Fill missing taglines with nan_clean's default replacement (the empty string).
nan_clean(data_movies.tagline,replace=True)

## <span style="color:red"> **Questions** </span>  ##

### 1.What areas have the most influence on revenue? ###

+ Feature engineering: convert the nested jsons in **production_countries** column to list of countries

In [None]:
# Replace each JSON string with the list of its 'name' values.
data_movies['production_countries'] = data_movies['production_countries'].apply(json_decode, key='name')

In [None]:
# Check the decoded country lists on the first few rows.
data_movies.production_countries.head()

+ query the top 100 movies with highest **revenue** and their respective **production_countries** 

In [None]:
# Keep the 100 highest-revenue movies and only the columns the country analysis uses.
movie_top = data_movies.nlargest(n=100, columns='revenue').loc[:, ['title','revenue','production_countries']]

In [None]:
# Display the selected top-100 rows.
movie_top

Count the occurrences of each country in the top-100 list:

In [None]:
from collections import defaultdict
import pprint
# Tally how many of the top-100 movies list each production country.
country_top = defaultdict(int)
for countries in movie_top.production_countries:
    for country in countries:
        country_top[country] = country_top[country] + 1
pprint.pprint(country_top)

It can be seen that the *USA* plays the dominant role, as it takes part in producing all of the top 100 movies in the list; it is followed by the *UK* with 19 movies and *New Zealand* with 6 movies.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Turn the per-country tallies into a labelled Series and draw horizontal bars.
df = pd.Series(dict(country_top), name='count').rename_axis('country')
df.plot.barh(grid=True, title='Occurences in the top-100 list')

To assess the quantitative value, I now calculate the average revenue per movie for each country in the top-100 list:

In [None]:
# First add each top-100 movie's revenue to every country that produced it,
# then divide by that country's movie count (country_top) to get the average.
country_avg_rvn = defaultdict(int)
for countries, revenue in zip(movie_top['production_countries'], movie_top['revenue']):
    for country in countries:
        country_avg_rvn[country] += revenue
for country, total in country_avg_rvn.items():
    country_avg_rvn[country] = total/country_top[country]
pprint.pprint(country_avg_rvn)

In [None]:
# Plot the per-country average revenue within the top-100 list.
df = pd.Series(dict(country_avg_rvn), name='avg_rvn').rename_axis('country')
df.plot.barh(grid=True, title="Avg revenue per movie in the top-100 for each country")

Despite taking part in producing all of the top 100 highest-revenue movies, the *USA* does not have the highest average revenue per movie. *Japan* contributes to only 1 movie in the top-100 list, but that movie has a very high revenue, which puts *Japan* at the top of the list.

At this point, we still need further exploration to test the relevance of the *USA*. We now see to what extent the *USA* contributes to the whole dataset by counting its occurrences across all movies:

In [None]:
# Share of all movies whose production-country list contains the USA.
count = sum(1 for countries in data_movies.production_countries
            if 'United States of America' in countries)
print("The USA produces {0:.0f}% of the movies".format((count/len(data_movies.production_countries))*100))

Hence, it is hard to say that the *USA* has the most influence on revenue, because the *USA* accounts for most of the movies listed.

The cases of the *UK*, *New Zealand* and *Japan* are worth further investigation, as they have significant values on the two charts of the top 100 movies.

We now calculate the average revenue per movie for each country across the original list. To stay relevant, we only consider countries that produced more than 10 movies, to avoid the same situation as Japan in the top-100 list.

In [None]:
# Count movies per production country over the whole dataset ...
country_t = defaultdict(int)
for countries in data_movies.production_countries:
    for country in countries:
        country_t[country] += 1
# ... and keep only the countries that appear on more than 10 movies.
country_top = {country: n_movies for country, n_movies in country_t.items() if n_movies > 10}
print('List of production countries that produce more than 10 movies:')
pprint.pprint(country_top)

In [None]:
# Average revenue per movie for every country retained in country_top.
# Step 1: add each movie's revenue to every retained country that produced it.
country_avg_rvn = defaultdict(int)
for countries, revenue in zip(data_movies['production_countries'], data_movies['revenue']):
    for country in countries:
        # Direct dict membership: no need to rebuild a list of keys per lookup.
        if country in country_top:
            country_avg_rvn[country] += revenue
# Step 2: divide each total by the country's movie count stored in country_top.
for country, total in country_avg_rvn.items():
    country_avg_rvn[country] = total/country_top[country]
pprint.pprint(dict(country_avg_rvn))

In [None]:
# Plot the per-country average revenue over the whole dataset.
df = pd.Series(dict(country_avg_rvn), name='avg_rvn').rename_axis('country')
df.plot.bar(grid=True, title="Avg revenue per movie in original list for each country", figsize=(10,5))

*New Zealand* has its average revenue per movie significantly higher than the rest of the list. It is possible to say that *New Zealand* has the highest influence on the revenue.

### 2.How is a movie’s revenue and average score affected by its genre? ###

In [None]:
# Replace each JSON string with the list of its genre names.
data_movies['genres'] = data_movies['genres'].apply(json_decode, key='name')

In [None]:
# Inspect the decoded genre lists.
data_movies.genres

### Genres and Revenue ###

prepare for ***multivariate linear regression***

In [None]:
# Collect every distinct genre name. Sorting makes the resulting column order
# (and therefore the order of the weights and plot bars) the same on every run;
# plain set iteration order for strings is not stable across kernel restarts.
genres = sorted({genre for movie_genres in data_movies.genres for genre in movie_genres})
# The regression target is kept as the last column.
genres.append('revenue')
print(genres)

In [None]:
# Empty frame with one column per entry of `genres` (genre names plus the trailing 'revenue').
df = pd.DataFrame(columns=genres)

In [None]:
# Build the genre-indicator table in one pass instead of enlarging `df` one cell
# at a time with .loc (slow, and it leaves every column with object dtype).
# Each record holds a 1 for every genre of the movie plus its revenue; genres a
# movie does not have stay missing (NaN) until they are filled afterwards.
records = []
for movie_genres, revenue in zip(data_movies['genres'], data_movies['revenue']):
    record = {genre: 1 for genre in movie_genres}
    record['revenue'] = revenue
    records.append(record)
# Reuse the column layout prepared above and keep data_movies' row labels.
df = pd.DataFrame(records, index=data_movies.index, columns=df.columns)

In [None]:
# Genres a movie does not have were never set, so they are missing: mark them 0.
df = df.fillna(0)
df.head()

Standardize the *revenue* for calculation

In [None]:
# z-score the revenue: subtract its mean, divide by its standard deviation.
revenue_mean, revenue_std = df.revenue.mean(), df.revenue.std()
df['revenue'] = (df['revenue'] - revenue_mean)/revenue_std

Preparing the matrices for the model:
<div style="text-align:center"> **Y =  w0 + w1X1 + w2X2 + ...** </div>

In [None]:
# Target vector: the standardized revenue as a NumPy array.
Y = np.array(df['revenue'])
Y

In [None]:
# Design matrix: a leading column of ones (intercept w0) followed by the
# genre indicator columns (every column of df except the last, the target).
x1 = np.ones((len(df), 1))
# DataFrame.as_matrix() was removed from pandas; .values is the portable
# equivalent. Cast to float so the linear algebra below never receives an
# object-dtype array.
x2 = df.iloc[:, :-1].values.astype(float)
X = np.concatenate((x1, x2), axis=1)
X

The resulting coefficients matrix of the formula:
<div style="text-align:center"> **W = inv(X'X)X'Y** </div>
<div> are the weights determining the impact of its respective genre because the revenue is standardized</div>

In [None]:
# Normal-equation solution W = pinv(X'X) X'Y.
X_t = X.T
W = np.linalg.pinv(X_t.dot(X)).dot(X_t).dot(Y)
pprint.pprint(list(W))

Labeling the resulting weights give:
<div>Noted that W0 is constant thus omitted</div>

In [None]:
# Pair each genre name (all of `genres` but the trailing target) with its
# weight; W[0] is the intercept and is skipped.
genres_d = genres[:-1]
weights = {genre: weight for genre, weight in zip(genres_d, W[1:])}
weights

In [None]:
# Bar chart of the fitted per-genre weights on revenue.
df = pd.Series(weights, name='genre_rvn_weight').rename_axis('genre')
df.plot.bar(grid=True, title="Weights of genres on the revenue", figsize=(15,5))

The plot clearly shows the impacts of genres on the revenue. <span style='color:red'>*Adventure*</span>, <span style='color:red'>*Animation*</span> and <span style='color:red'>*Fantasy*</span> genres have the outstanding impact that lead to the high value in revenue.

On the other hand,*Comedy*, <span style='color:red'>*Documentary*</span>, *Foreign*, *Horror*, *Western* and *TV Movie* have negative impact on the revenue.

### Genres and Average Score ###

Applying ***multivariate linear regression*** as above

In [None]:
# Swap the trailing target-column name for 'avg_vote'.
genres[-1] = 'avg_vote'

In [None]:
# Show the updated column list (last entry is now 'avg_vote').
genres

In [None]:
# Empty frame with one column per entry of `genres` (genre names plus the trailing 'avg_vote').
df = pd.DataFrame(columns=genres)

In [None]:
# Build the genre-indicator table in one pass instead of enlarging `df` one cell
# at a time with .loc (slow, and it leaves every column with object dtype).
# Each record holds a 1 for every genre of the movie plus its average vote;
# genres a movie does not have stay missing (NaN) until they are filled afterwards.
records = []
for movie_genres, vote in zip(data_movies['genres'], data_movies['vote_average']):
    record = {genre: 1 for genre in movie_genres}
    record['avg_vote'] = vote
    records.append(record)
# Reuse the column layout prepared above and keep data_movies' row labels.
df = pd.DataFrame(records, index=data_movies.index, columns=df.columns)

In [None]:
# Genres a movie does not have were never set, so they are missing: mark them 0.
df = df.fillna(0)
df.head()

Standardize the *avg_vote* for calculation

In [None]:
# z-score the average vote: subtract its mean, divide by its standard deviation.
vote_mean, vote_std = df.avg_vote.mean(), df.avg_vote.std()
df['avg_vote'] = (df['avg_vote'] - vote_mean)/vote_std

In [None]:
# Target vector: the standardized average vote as a NumPy array.
Y = np.array(df['avg_vote'])
Y

In [None]:
# Normal-equation solution W = pinv(X'X) X'Y.
# X is the design matrix built in the revenue section and is reused as is.
X_t = X.T
W = np.linalg.pinv(X_t.dot(X)).dot(X_t).dot(Y)
pprint.pprint(list(W))

In [None]:
# Pair each genre name (all of `genres` but the trailing target) with its
# weight; W[0] is the intercept and is skipped.
genres_d = genres[:-1]
weights = {genre: weight for genre, weight in zip(genres_d, W[1:])}
weights

In [None]:
# Bar chart of the fitted per-genre weights on the average vote.
df = pd.Series(weights, name='genre_avgvote_weight').rename_axis('genre')
df.plot.bar(grid=True, title="Weights of genres on the avg_vote", figsize=(15,5))

*TV Movies* has a significant negative impact on the avg_vote. Most genres have good impact, the highest include: *Animation*, *Documentary*, *Drama* and *War*.

### 3.What influence does release date have on revenue? ###

In [None]:
import datetime

In [None]:
# Try the parsing on the first release date: ISO weekday number, 1 = Monday .. 7 = Sunday.
string_date = data_movies.release_date[0]
parsed_date = datetime.datetime.strptime(string_date, "%Y-%m-%d")
parsed_date.isoweekday()

First guess: assessing the release dates as weekdays and their influence on the revenue

convert the date to week days:

In [None]:
# ISO weekday number (1 = Monday .. 7 = Sunday) -> short English day label.
weekdays = {
    1 : 'Mon',
    2 : 'Tue',
    3 : 'Wed',
    4 : 'Thu',
    5 : 'Fri',
    6 : 'Sat',
    7 : 'Sun'
}
def to_weekday(string_date):
    """Return the short weekday label of a ``YYYY-MM-DD`` date string.

    Parameters
    ----------
    string_date : str
        Date formatted as ``%Y-%m-%d``. An empty string, ``None``, NaN or any
        other non-string value is treated as a missing date.

    Returns
    -------
    str or float
        One of the labels in ``weekdays``, or ``np.nan`` for a missing date.

    Raises
    ------
    ValueError
        If a non-empty string does not match ``%Y-%m-%d``.
    """
    # Guard against values that were never cleaned (NaN/None) as well as the
    # '' placeholder; strptime would raise TypeError on a non-string.
    if not isinstance(string_date, str) or string_date == '':
        return np.nan
    weekday = datetime.datetime.strptime(string_date,"%Y-%m-%d").isoweekday()
    return weekdays[weekday]

In [None]:
# Derive a weekday label for every movie from its release-date string.
data_movies['release_weekday'] = data_movies['release_date'].map(to_weekday)

In [None]:
# Movies per release weekday; dropna=False also counts the missing weekdays.
data_movies.release_weekday.value_counts(dropna=False)

create new df containing weekdays and  revenue:

In [None]:
# Rows with a known weekday, restricted to the two columns of interest ...
df = data_movies.loc[data_movies['release_weekday'].notnull(), ['revenue','release_weekday']]
# ... and without the zero-revenue entries.
df = df.loc[df['revenue'] != 0]

In [None]:
# Display the revenue / weekday table.
df

It is plausible to perform ANOVA on weekday categorized values to see if week days have any influence on the revenue

<div> **Null Hypothesis**: Weekdays have the same influence on revenue</div>
<div> **Alternative Hypothesis**: The influence on revenue differs between weekdays </div>

Categorize revenue by weekdays and randomly pick 100 entries from each category:

In [None]:
#reset index to join to dataframe
mon = df[df['release_weekday']=='Mon'].sample(100).reset_index()
tue = df[df['release_weekday']=='Tue'].sample(100).reset_index()
wed = df[df['release_weekday']=='Wed'].sample(100).reset_index()
thu = df[df['release_weekday']=='Thu'].sample(100).reset_index()
fri = df[df['release_weekday']=='Fri'].sample(100).reset_index()
sat = df[df['release_weekday']=='Sat'].sample(100).reset_index()
sun = df[df['release_weekday']=='Sun'].sample(100).reset_index()

In [None]:
# One column of sampled revenues per weekday, aligned on the reset 0..n-1 index.
df = pd.DataFrame({
    day: sample['revenue']
    for day, sample in (
        ('Mon', mon), ('Tue', tue), ('Wed', wed), ('Thu', thu),
        ('Fri', fri), ('Sat', sat), ('Sun', sun),
    )
})

In [None]:
# Display the sampled revenues, one column per weekday.
df

In [None]:
import scipy.stats as stats

In [None]:
# One-way ANOVA across the seven weekday samples.
F,p = stats.f_oneway(*[df[day] for day in ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')])

In [None]:
# F statistic and p-value of the weekday ANOVA.
F,p

Since the p-value of 0.4 is greater than 0.05, we cannot reject the null hypothesis that weekdays have the same effect on revenue.

This time we might want to perform ANOVA again on a different category method: weekends :['Fri', 'Sat', 'Sun'] and the rest ['Mon','Tue','Wed','Thu']

<div> **Null Hypothesis**: Weekdays have the same influence on revenue</div>
<div> **Alternative Hypothesis**: The influence on revenue differs between weekends and other days </div>

In [None]:
# Pool the per-day samples into the two groups and draw 30 rows from each.
# A fixed random_state keeps the draw, and the ANOVA computed from it, repeatable.
weekend = pd.concat([fri, sat, sun]).sample(30, random_state=0).reset_index()
other = pd.concat([mon, tue, wed, thu]).sample(30, random_state=0).reset_index()

In [None]:
# Display the weekend (Fri/Sat/Sun) sample.
weekend

In [None]:
# Display the Mon-Thu sample.
other

In [None]:
# Side-by-side revenues of the two groups, aligned on the reset index.
df = pd.DataFrame(dict(Weekend=weekend['revenue'], Other=other['revenue']))

In [None]:
# Display the Weekend / Other revenue columns.
df

In [None]:
# One-way ANOVA between the two groups.
anova_groups = (df['Other'], df['Weekend'])
F,p = stats.f_oneway(*anova_groups)

In [None]:
# F statistic and p-value of the weekend-vs-other ANOVA.
F, p

We still cannot conclude on the impact of weekend on the revenue due to high p-value

How about the month? Would the month in release date affect the revenue?
<div> **Null Hypothesis**:Months have the same influence on revenue</div>
<div> **Alternative Hypothesis**: The influence on revenue differs between months </div>

In [None]:
def to_month(date):
    """Return the ``month`` attribute of ``date`` (e.g. a pandas Timestamp)."""
    return date.month

# Keep revenue and release date for rows with a non-null release date ...
df = data_movies.loc[pd.notnull(data_movies['release_date']), ['revenue','release_date']]
# ... drop the zero revenues; .copy() gives an independent frame so the column
# assignment below does not write into a slice of data_movies.
df = df[df['revenue']!=0].copy()
# Replace each release date by its month number using the vectorised .dt
# accessor; to_month stays defined so existing callers keep working.
df['release_date'] = pd.to_datetime(df['release_date']).dt.month

# Draw 30 random rows for every month number 1..12. A fixed random_state keeps
# the draw, and the ANOVA computed from it, repeatable; reset_index() renumbers
# each sample from 0 so the samples line up when joined into one DataFrame.
# Note: sample(30) raises ValueError if a month has fewer than 30 rows.
jan, feb, mar, apr, may, jun, jul, aug, sep, oct_, nov, dec = (
    df[df['release_date']==month].sample(30, random_state=0).reset_index()
    for month in range(1, 13)
)

# One column of sampled revenues per month, aligned on the reset index.
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_samples = [jan, feb, mar, apr, may, jun, jul, aug, sep, oct_, nov, dec]
df = pd.DataFrame({
    label: sample['revenue'] for label, sample in zip(month_labels, month_samples)
})

# One-way ANOVA across the twelve monthly samples.
F,p = stats.f_oneway(*[
    df[label]
    for label in ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                  'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
])

F,p

With such a small p-value, we can reject the null hypothesis, suggesting that revenue differs depending on the release month.

For a simple approach, I want to calculate the avg revenue among the months

In [None]:
# Keep revenue and release date for rows with a non-null release date ...
df = data_movies.loc[pd.notnull(data_movies['release_date']), ['revenue','release_date']]
# ... drop the zero revenues; .copy() gives an independent frame so the column
# assignment below does not write into a slice of data_movies.
df = df[df['revenue']!=0].copy()
# Replace each release date by its month number (vectorised .dt accessor).
df['release_date'] = pd.to_datetime(df['release_date']).dt.month

# Mean revenue per month in one grouped pass; reindex guarantees an entry for
# every month 1..12 (NaN when a month has no rows).
monthly_mean = df.groupby('release_date')['revenue'].mean().reindex(range(1, 13))
jan, feb, mar, apr, may, jun, jul, aug, sep, oct_, nov, dec = (
    monthly_mean.loc[month] for month in range(1, 13)
)

In [None]:
# Month number (1..12) -> average revenue, then plot as a bar chart.
month_avg_rvn = dict(enumerate(
    (jan, feb, mar, apr, may, jun, jul, aug, sep, oct_, nov, dec), start=1
))
df = pd.Series(dict(month_avg_rvn), name='avg_rvn').rename_axis('month')
df.plot.bar(grid=True, title="Avg revenue per movie in original list for each month", figsize=(10,5))

As the plot shows, movies released in May and June have the highest average revenue, while in January, August and September the average revenue is low.

At this point, I propose another guess: how does the revenue vary over the course of time? **Would movies released later get higher revenue?**
<div> I now sort the revenue based on its chronological release date and drop the zero-revenue rows </div>

In [None]:
# Revenue indexed by release date: parse the dates, sort chronologically,
# drop the zero revenues, then use the date as the index.
df = (
    data_movies.loc[:, ['revenue','release_date']]
    .assign(release_date=lambda frame: pd.to_datetime(frame['release_date']))
    .sort_values('release_date')
    .loc[lambda frame: frame['revenue'] != 0]
    .set_index('release_date')
)
df

In [None]:
# Line plot of revenue against release date, with a labelled y-axis.
ax = df.plot(grid=True, figsize=(20, 10), title='Revenue over the period')
ax.set_ylabel('Revenue')

Our guess is right: based on the plot, we can conclude that revenue gets higher and higher over time.