In [50]:
import pandas as pd
import numpy as np
from scipy import stats

In [3]:
df = pd.read_csv("added_features.csv")
df.head()

Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Metascore of movie,Gross,Votes,Description,Ids,TMDB_Id,...,Horror,Music,Mystery,Romance,SciFi,TV,Thriller,War,Western,Bad Words
0,The Shawshank Redemption,1994,142,9.3,82.0,28.34,2777378,"Over the course of several years, two convicts...","(278, [18, 80])",278.0,...,0,0,0,0,0,0,0,0,0,100
1,The Godfather,1972,175,9.2,100.0,134.97,1933588,"Don Vito Corleone, head of a mafia family, dec...","(238, [18, 80])",238.0,...,0,0,0,0,0,0,0,0,0,20
2,The Dark Knight,2008,152,9.0,84.0,534.86,2754087,When the menace known as the Joker wreaks havo...,"(155, [18, 28, 80, 53])",155.0,...,0,0,0,0,0,0,1,0,0,5
3,Schindler's List,1993,195,9.0,95.0,96.9,1397886,"In German-occupied Poland during World War II,...","(424, [18, 36, 10752])",424.0,...,0,0,0,0,0,0,0,1,0,16
4,12 Angry Men,1957,96,9.0,97.0,4.36,824211,The jury in a New York City murder trial is fr...,"(389, [18])",389.0,...,0,0,0,0,0,0,0,0,0,1


# Hypothesis 1 - Critic Ratings are usually lower than fan ratings

In [11]:
critic_mean = df['Metascore of movie'].mean()
critic_stdev = df['Metascore of movie'].std()
fan_mean = df['Movie Rating'].mean() * 10
fan_stdev = df['Movie Rating'].std() * 10

print("Critic mean: ", critic_mean, "Critic stdev: ", critic_stdev)
print("Fan mean: ", fan_mean, "Fan stdev: ", fan_stdev)

Critic mean:  68.00241642122968 Critic stdev:  27.974361909313053
Fan mean:  79.702 Fan stdev:  2.757317319206993


In [64]:
t, p = stats.ttest_ind(df['Metascore of movie'], df['Movie Rating'], alternative="greater")
print(t, p)

67.85831846337442 0.0


79.702

# Hypothesis 2 - Critics favor certain genres

In [12]:
genre_keys = {

    28       :   "Action",
    12       :   "Adventure",
    16       :   "Animation",
    35       :   "Comedy",
    80       :   "Crime",
    99       :   "Documentary",
    18       :   "Drama",
    10751    :   "Family",
    14       :   "Fantasy",
    36       :   "History",
    27       :   "Horror",
    10402    :   "Music",
    9648     :   "Mystery",
    10749    :   "Romance",
    878      :   "SciFi",
    10770    :   "TV",
    53       :   "Thriller",
    10752    :   "War",
    37       :   "Western"
    
}

In [16]:
for genre in genre_keys.values():
    genre_mean = df.loc[df[genre] == 1]['Metascore of movie'].mean()
    
    print("{:15s}: {}".format(genre, genre_mean))

Action         : 62.069664522361286
Adventure      : 70.48083981842794
Animation      : 76.77445451591942
Comedy         : 62.981939813459654
Crime          : 64.45607283613715
Documentary    : 75.25
Drama          : 67.30530639010786
Family         : 72.87650831696484
Fantasy        : 70.2032755226972
History        : 66.76720651705512
Horror         : 72.72586872586874
Music          : 73.80740740740741
Mystery        : 66.45497970066275
Romance        : 70.27052153446557
SciFi          : 72.2146003470214
TV             : 70.0
Thriller       : 66.10631566568253
War            : 71.15558796251456
Western        : 80.96


In [81]:
f, p = stats.f_oneway(  df.loc[df['Action'] == 1]['Metascore of movie'],
                        df.loc[df['Comedy'] == 1]['Metascore of movie'],
                        df.loc[df['Drama'] == 1]['Metascore of movie'])
print(f, p)

3.3742665128067104 0.03460766537601791


# Hypothesis 3 - Movies have more swears now than they used to

In [18]:
df['Bad Words'].mean()

29.392

In [19]:
df['Bad Words'].std()

60.316236490553344

In [82]:
df_old = df.sort_values(by='Year of Release').head(50)
df_old['Bad Words'].std()

8.145337967957177

In [83]:
df_new = df.sort_values(by='Year of Release').tail(50)
df_new['Bad Words'].std()

54.27316020251028

In [76]:
t, p, = stats.ttest_ind(df_old['Bad Words'], df_new['Bad Words'], alternative = 'less')
print(t, p)

-3.9065402778430207 8.611936974058963e-05


# Hypothesis 4 - Dogs and Swear Words

In [46]:
df_dogs = df.loc[df['Description'].str.contains('dog', case=False)]
df_dogs

Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Metascore of movie,Gross,Votes,Description,Ids,TMDB_Id,...,Horror,Music,Mystery,Romance,SciFi,TV,Thriller,War,Western,Bad Words
205,Umberto D.,1952,89,8.2,92.0,0.07,27087,An elderly man and his dog struggle to survive...,"(833, [18])",833.0,...,0,0,0,0,0,0,0,0,0,3
235,The Wizard of Oz,1939,102,8.1,92.0,2.08,414083,Young Dorothy Gale and her dog Toto are swept ...,"(630, [12, 14, 10751])",630.0,...,0,0,0,0,0,0,0,0,0,0
263,Hachi: A Dog's Tale,2009,93,8.1,7.892,236.0,298283,A college professor bonds with an abandoned do...,"(28178, [18, 10751])",28178.0,...,0,0,0,0,0,0,0,0,0,0
540,Togo,2019,113,7.9,69.0,,52579,"The story of Togo, the sled dog who led the 19...","(884363, [80, 18])",884363.0,...,0,0,0,0,0,0,0,0,0,1
651,Isle of Dogs,2018,101,7.8,82.0,32.02,181265,"Set in Japan, Isle of Dogs follows a boy's ody...","(399174, [12, 35, 16])",399174.0,...,0,0,0,0,0,0,0,0,0,4


In [47]:
df_dogs = df_dogs[df['Movie Name'] != "The Wizard of Oz"]
df_dogs

  df_dogs = df_dogs[df['Movie Name'] != "The Wizard of Oz"]


Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Metascore of movie,Gross,Votes,Description,Ids,TMDB_Id,...,Horror,Music,Mystery,Romance,SciFi,TV,Thriller,War,Western,Bad Words
205,Umberto D.,1952,89,8.2,92.0,0.07,27087,An elderly man and his dog struggle to survive...,"(833, [18])",833.0,...,0,0,0,0,0,0,0,0,0,3
263,Hachi: A Dog's Tale,2009,93,8.1,7.892,236.0,298283,A college professor bonds with an abandoned do...,"(28178, [18, 10751])",28178.0,...,0,0,0,0,0,0,0,0,0,0
540,Togo,2019,113,7.9,69.0,,52579,"The story of Togo, the sled dog who led the 19...","(884363, [80, 18])",884363.0,...,0,0,0,0,0,0,0,0,0,1
651,Isle of Dogs,2018,101,7.8,82.0,32.02,181265,"Set in Japan, Isle of Dogs follows a boy's ody...","(399174, [12, 35, 16])",399174.0,...,0,0,0,0,0,0,0,0,0,4


In [86]:
df_dogs['Bad Words'].std()

1.8257418583505538

In [85]:
df['Bad Words'].std()

60.316236490553344

In [79]:
t, p = stats.ttest_ind(df_dogs['Bad Words'], df['Bad Words'], alternative='less')
print(t, p)

-0.9078271483919631 0.1820939160601261
