# Movie Rating Hypothesis testing



In this assignment we are going to prove or disprove a hypothesis.

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from IPython.display import HTML, display

# Notebook-wide pandas display settings: plain-text (non-HTML) DataFrame
# reprs, capped at 20 columns and 25 rows per displayed frame.
pd.options.display.notebook_repr_html = False
pd.options.display.max_columns = 20
pd.options.display.max_rows = 25

In [2]:
%matplotlib inline

##### Read and cleanup the movie data

In [3]:
def get_movie_data(data_dir='../data'):
    """Load the users, ratings and movies tables from '::'-delimited files.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``users.dat``, ``ratings.dat`` and
        ``movies.dat``. Defaults to ``'../data'``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(users, ratings, movies)`` with columns
        ``user_id, gender, age, occupation, zip``;
        ``user_id, movie_id, rating, timestamp``; and
        ``movie_id, title, genres`` respectively.
    """

    def _read_dat(filename, columns):
        # '::' is a multi-character separator, which only the python parser
        # engine supports. Requesting it explicitly avoids the ParserWarning
        # pandas emits when it has to fall back from the C engine.
        return pd.read_table(os.path.join(data_dir, filename),
                             sep='::', header=None, names=columns,
                             encoding='latin-1', engine='python')

    unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
    users = _read_dat('users.dat', unames)

    rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
    ratings = _read_dat('ratings.dat', rnames)

    mnames = ['movie_id', 'title', 'genres']
    movies = _read_dat('movies.dat', mnames)

    return users, ratings, movies

users, ratings, movies = get_movie_data()

# Split titles of the form "Name (1995)" into the name (column 0) and the
# parenthesised number (column 1). The pattern is a raw string so that `\(`
# reaches the regex engine as an escaped parenthesis instead of being an
# invalid Python string escape.
title_parts = movies['title'].str.extract(r'(.*) \(([0-9]+)\)')

# The extracted year is a string, not an integer; titles that do not match
# the pattern get NaN in both new columns. short_title is kept at full length.
movies['year'] = title_parts[1]
movies['short_title'] = title_parts[0]

print(movies.head())

  users = pd.read_table(os.path.join('../data','users.dat'),
  ratings = pd.read_table(os.path.join('../data', 'ratings.dat'),
  movies = pd.read_table(os.path.join('../data', 'movies.dat'),


   movie_id                               title                        genres  \
0         1                    Toy Story (1995)   Animation|Children's|Comedy   
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy   
2         3             Grumpier Old Men (1995)                Comedy|Romance   
3         4            Waiting to Exhale (1995)                  Comedy|Drama   
4         5  Father of the Bride Part II (1995)                        Comedy   

   year                  short_title  
0  1995                    Toy Story  
1  1995                      Jumanji  
2  1995             Grumpier Old Men  
3  1995            Waiting to Exhale  
4  1995  Father of the Bride Part II  


### 1. Prove or disprove the following Null Hypothesis:

#### H0: Different age segments of users rate the different genres of movies similarly

- Age segments are (0-20), (21-30), (31-50), (50 and above),
- Genres - Drama, Comedy, Action, Romance, Adventure
- Rating 4 or above is a positive, else negative (used for binary votes)

#### Solution

##### 1) join movies and ratings table based on movie_id
##### 2) join result of above join with users table on user_id
##### 3) assign group name to unique ages in the database
##### 4) extract the dominant genre, i.e. the genre that a movie is most known for
##### 5) calculate mean rating for each movie and age group

In [4]:
# 1) join movies and ratings table based on movie_id
movies_ratings = pd.merge(movies, ratings, on='movie_id', how='inner')

# 2) join result of above join with users table on user_id
df = pd.merge(movies_ratings, users, on='user_id', how='inner')

# 3) assign group name to unique ages in the database
# The keys are the distinct values of the `age` column (presumably the
# MovieLens age-bracket codes -- TODO confirm against the dataset README).
# Any age value not listed here maps to NaN and is therefore left out of the
# grouped means computed in step 5.
age_code_to_group = {
    1: 'G1', 18: 'G1',
    25: 'G2',
    35: 'G3', 45: 'G3',
    50: 'G4', 56: 'G4',
}
df['age_group'] = df['age'].map(age_code_to_group)

# 4) extract the dominant genre: the first entry of the '|'-separated list
df['broad_genres'] = df['genres'].str.split('|').str[0]
df = df[df['broad_genres'].isin(['Adventure', 'Comedy', 'Action', 'Drama', 'Romance'])]

# 5) calculate mean rating for each movie and age group
# Select `rating` *before* aggregating: calling .mean() on the whole grouped
# frame also tries to average the string columns (genres, gender, ...), which
# raises TypeError on pandas >= 2.0 and wastes work on older versions.
data = (
    df.groupby(['movie_id', 'title', 'broad_genres', 'age_group'])['rating']
      .mean()
      .reset_index()
)
data

       movie_id                    title broad_genres age_group    rating
0             2           Jumanji (1995)    Adventure        G1  3.068421
1             2           Jumanji (1995)    Adventure        G2  3.145594
2             2           Jumanji (1995)    Adventure        G3  3.353846
3             2           Jumanji (1995)    Adventure        G4  3.381818
4             3  Grumpier Old Men (1995)       Comedy        G1  3.144000
...         ...                      ...          ...       ...       ...
10293      3951  Two Family House (2000)        Drama        G4  4.142857
10294      3952    Contender, The (2000)        Drama        G1  3.549296
10295      3952    Contender, The (2000)        Drama        G2  3.760274
10296      3952    Contender, The (2000)        Drama        G3  3.782178
10297      3952    Contender, The (2000)        Drama        G4  4.057143

[10298 rows x 5 columns]

##### Filter the database by movie genre (select the part of the movie data that is specific to one genre)
##### For each genre, compare how users that belong to different age groups rate movies from a specific genre

In [5]:

AGE1, AGE2, Genre, p_value = [], [], [], []

# Within each genre, run an independent two-sample t-test on the per-movie
# mean ratings for every ordered pair of age groups. Self-pairs and both
# orderings of each pair are included, so a genre with k age groups
# contributes k * k rows.
for genre_name in data['broad_genres'].unique():
    genre_rows = data.loc[data['broad_genres'] == genre_name]
    group_labels = genre_rows['age_group'].unique()

    # Slice the ratings of each age group once per genre.
    ratings_by_group = {
        label: genre_rows.loc[genre_rows['age_group'] == label, 'rating']
        for label in group_labels
    }

    for first_label in group_labels:
        for second_label in group_labels:
            test_result = stats.ttest_ind(ratings_by_group[first_label],
                                          ratings_by_group[second_label])
            AGE1.append(first_label)
            AGE2.append(second_label)
            Genre.append(genre_name)
            p_value.append(test_result.pvalue)


##### Store results from above analysis in a data frame
##### Define an outcome column that summarizes the result of test based on p-value

In [6]:

# Collect the pairwise test results into one frame and label each row with
# the decision at the 0.05 significance level.
Results = pd.DataFrame({
    'AGE GROUP A': AGE1,
    'AGE GROUP B': AGE2,
    'GENRE': Genre,
    'p_value': p_value,
    'Outcome': [
        'Reject the null hypothesis' if p < 0.05
        else 'Fail to reject the null hypothesis'
        for p in p_value
    ],
})
Results.head()

  AGE GROUP A AGE GROUP B      GENRE   p_value  \
0          G1          G1  Adventure  1.000000   
1          G1          G2  Adventure  0.633721   
2          G1          G3  Adventure  0.050329   
3          G1          G4  Adventure  0.000049   
4          G2          G1  Adventure  0.633721   

                              Outcome  
0  Fail to reject the null hypothesis  
1  Fail to reject the null hypothesis  
2  Fail to reject the null hypothesis  
3          Reject the null hypothesis  
4  Fail to reject the null hypothesis  