In [91]:
import numpy as np
import pandas as pd

In [92]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # to make jupyter print all outputs, not just the last one
from IPython.core.display import HTML # to pretty print pandas df and be able to copy them over (e.g. to ppt slides)

In [93]:
df = pd.read_parquet('cleaned/movielens_parquet')

Exporting and importing in parquet leaded to the the conversion of the lists in movielens data in review_data column to numpy arrays. We need to convert them back to lists to use the same approach as Netflix to take samples.

*@Jaume maybe for you this is not the case because you have ios.*

In [94]:
# convert numpy arrays to lists in the 'review_data' column
df['review_data'] = df['review_data'].apply(lambda x: x.tolist() if isinstance(x, np.ndarray) else x)

In [95]:
# Step 1: Count the number of dictionaries per row in the review_data column, replacing NaN with 0
df['num_reviews'] = df['review_data'].apply(lambda x: len(x) if isinstance(x, list) else 0)

# Step 2: Divide the dataset into strata based on the number of reviews for each movie
quintiles = df['num_reviews'].quantile([0, 0.20, 0.40, 0.60, 0.80, 1.0])

# Adjust the boundaries to ensure monotonic increase
stratum_boundaries = [0, quintiles[0.20], quintiles[0.40], quintiles[0.60], quintiles[0.80], quintiles[1.0]]
stratum_labels = ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']

# Create a new column to categorize movies into strata based on the number of reviews
df['review_stratum'] = pd.cut(df['num_reviews'], bins=stratum_boundaries, labels=stratum_labels)

In [96]:
import random

sampled_df = df[df.columns]

# Step 1: Count the number of dictionaries per row in the review_data column
sampled_df['num_reviews'] = df['review_data'].apply(lambda x: len(x) if isinstance(x, list) else 0)

# Step 2: Divide the dataset into strata based on the number of reviews for each movie
quintiles = sampled_df['num_reviews'].quantile([0, 0.20, 0.40, 0.60, 0.80, 1.0])
# Adjust the boundaries to include fewer movies with a large number of reviews
stratum_boundaries = [0, quintiles[0.20], quintiles[0.40], quintiles[0.60], quintiles[0.80], quintiles[1.0]]
stratum_labels = ['Q1', 'Q2', 'Q3', 'Q4','Q5']

# Create a new column to categorize movies into strata based on the number of reviews
sampled_df['review_stratum'] = pd.cut(sampled_df['num_reviews'], bins=stratum_boundaries, labels=stratum_labels)

# Step 3: Define sample size per stratum
sample_size_per_stratum = 30

# Step 4: Within each stratum, apply random sampling techniques to select movies
sampled_movies = []

# Iterate over each stratum
for stratum in sampled_df['review_stratum'].dropna().unique():  # Drop NaN values
    # Filter movies in the current stratum
    stratum_movies = sampled_df[sampled_df['review_stratum'] == stratum]
    
    # Apply simple random sampling to select movies within the stratum
    sampled_indices = random.sample(list(stratum_movies.index), sample_size_per_stratum)
    sampled_movies.extend(sampled_indices)

# Step 5: Create the sampled DataFrame
sampled_df_movielens = sampled_df.loc[sampled_movies, ['movieId', 'review_data', 'genres', 'year', 'title', 'review_stratum','num_reviews']]

In [97]:
# Extract all user IDs from the 'review_data' column using list comprehension
user_ids = [review_entry.get('userId') for row in sampled_df_movielens['review_data'] for review_entry in row if review_entry.get('userId')]

# Count the number of unique users and reviews
unique_users = set(user_ids)
amount_of_reviews = len(user_ids)

# Calculate averages
avg_reviews_per_unique_user = amount_of_reviews / len(unique_users)
avg_reviews_per_movie_id = amount_of_reviews / len(sampled_df_movielens)

# Print results
print("There are {} reviews in the sampled dataframe.".format(amount_of_reviews))
print("There are {} unique users who have reviewed a movie.".format(len(unique_users)))
print("There are {} movieIds in the sampled dataset.".format(len(sampled_df_movielens)))
print("A unique user places {} reviews on average in the sampled dataset.".format(round(avg_reviews_per_unique_user)))
print("A movieId receives {} reviews on average in the sampled dataset.".format(round(avg_reviews_per_movie_id)))

There are 6439 reviews in the sampled dataframe.
There are 5660 unique users who have reviewed a movie.
There are 150 movieIds in the sampled dataset.
A unique user places 1 reviews on average in the sampled dataset.
A movieId receives 43 reviews on average in the sampled dataset.


In [98]:
from scipy.stats import ttest_ind

# Define the strata
strata = sampled_df_movielens['review_stratum'].unique()

# Perform t-tests for each stratum
t_statistics = {}
p_values = {}
for stratum in strata:
    # Extract the 'num_reviews' column for the current stratum
    sampled_num_reviews_stratum = sampled_df_movielens[sampled_df_movielens['review_stratum'] == stratum]['num_reviews']
    population_num_reviews_stratum = df[df['review_stratum'] == stratum]['num_reviews']
    
    # Perform the t-test
    t_statistic, p_value = ttest_ind(sampled_num_reviews_stratum, population_num_reviews_stratum)
    
    # Store the results
    t_statistics[stratum] = t_statistic
    p_values[stratum] = p_value

# Print the results
print("T-test Results:")
for stratum in strata:
    print(f"Stratum: {stratum}")
    print(f"T-statistic: {t_statistics[stratum]}")
    print(f"P-value: {p_values[stratum]}")
    alpha = 0.05
    if p_values[stratum] < alpha:
        print("The difference in means is statistically significant (reject the null hypothesis)")
    else:
        print("The difference in means is not statistically significant (fail to reject the null hypothesis)")


T-test Results:
Stratum: Q5
T-statistic: -1.1485336059970532
P-value: 0.2507756318586446
The difference in means is not statistically significant (fail to reject the null hypothesis)
Stratum: Q4
T-statistic: 1.0565563376798446
P-value: 0.29074414710912544
The difference in means is not statistically significant (fail to reject the null hypothesis)
Stratum: Q3
T-statistic: -1.41383355660149
P-value: 0.1574657668091603
The difference in means is not statistically significant (fail to reject the null hypothesis)
Stratum: Q2
T-statistic: nan
P-value: nan
The difference in means is not statistically significant (fail to reject the null hypothesis)
Stratum: Q1
T-statistic: nan
P-value: nan
The difference in means is not statistically significant (fail to reject the null hypothesis)


  res = hypotest_fun_out(*samples, **kwds)


Differences in mean betweens sample and df is not significant, meaning the sample is representatitve!

In [99]:
sampled_df_movielens.to_parquet('cleaned/strat_sample_movielens')