In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset.
# The path is a raw string: in a normal string literal the "\U" in "C:\Users"
# starts a unicode escape and makes the whole cell a SyntaxError.
file_path = r'C:\Users\Summer\Documents\Analysis\MoviesOnStreamingPlatforms.csv'  # Update with your file path
data = pd.read_csv(file_path)

# Preprocessing the data: strip the '/100' suffix from the score text and
# convert what is left to float so it can be plotted and tested numerically.
data['Rotten Tomatoes'] = data['Rotten Tomatoes'].str.replace('/100', '', regex=False).astype(float)

# Filter the data for movies available on Netflix or Disney+.
# .copy() gives an independent frame, so the column assignment below does not
# write into a view of `data` (avoids pandas' SettingWithCopyWarning).
subset_data = data[(data['Netflix'] == 1) | (data['Disney+'] == 1)].copy()

# Handle missing values
subset_data['Age'] = subset_data['Age'].fillna('Unknown')  # Replace missing Age values with 'Unknown'
subset_data = subset_data.dropna(subset=['Rotten Tomatoes'])  # Drop rows with missing Rotten Tomatoes

# Separate data for Netflix and Disney+. A movie flagged on both platforms
# appears in both frames. Copies are taken because later cells add columns
# to these frames.
netflix_data = subset_data[subset_data['Netflix'] == 1].copy()
disney_data = subset_data[subset_data['Disney+'] == 1].copy()

# Rotten Tomatoes scores per platform (NaN-free thanks to the dropna above)
netflix_scores = netflix_data['Rotten Tomatoes']
disney_scores = disney_data['Rotten Tomatoes']

# Overlay both platforms' Rotten Tomatoes score distributions (histogram + KDE)
# on a single set of axes.
fig, ax = plt.subplots(figsize=(12, 6))
for scores, colour, platform in (
    (netflix_scores, 'blue', 'Netflix'),
    (disney_scores, 'orange', 'Disney+'),
):
    sns.histplot(scores, kde=True, color=colour, label=platform, bins=20, ax=ax)
ax.set_title('Distribution of Rotten Tomatoes Scores')
ax.set_xlabel('Rotten Tomatoes Score')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()

# Bar plot for age restriction distribution
age_counts_netflix = netflix_data['Age'].value_counts()
age_counts_disney = disney_data['Age'].value_counts()

# Building the frame from two Series aligns them on the union of age labels;
# a label that occurs on only one platform is NaN on the other, hence fillna(0).
age_df = pd.DataFrame({'Netflix': age_counts_netflix, 'Disney+': age_counts_disney}).fillna(0)

# DataFrame.plot opens its own figure unless it is handed an Axes, so calling
# plt.figure() beforehand only leaves an empty figure in the output.
# Create the axes explicitly and draw onto them instead.
fig, ax = plt.subplots(figsize=(12, 6))
age_df.plot(kind='bar', ax=ax, color=['blue', 'orange'], alpha=0.7)
ax.set_title('Distribution of Age Restrictions')
ax.set_xlabel('Age Restriction')
ax.set_ylabel('Number of Movies')
ax.legend(title='Platform')
plt.show()

# Collect, per platform, the describe() table of Rotten Tomatoes scores and
# the age-label counts computed for the bar plot.
summary_stats = {
    platform: {
        'Rotten Tomatoes': scores.describe(),
        'Age Distribution': age_counts,
    }
    for platform, scores, age_counts in (
        ('Netflix', netflix_scores, age_counts_netflix),
        ('Disney+', disney_scores, age_counts_disney),
    )
}

# Print each heading followed by its table; sep="\n" places the table on the
# line after the heading, exactly like two consecutive print calls.
print("Netflix Summary Statistics:", summary_stats['Netflix']['Rotten Tomatoes'], sep="\n")
print("\nNetflix Age Distribution:", summary_stats['Netflix']['Age Distribution'], sep="\n")

print("\nDisney+ Summary Statistics:", summary_stats['Disney+']['Rotten Tomatoes'], sep="\n")
print("\nDisney+ Age Distribution:", summary_stats['Disney+']['Age Distribution'], sep="\n")


In [None]:
from scipy.stats import ttest_ind, mannwhitneyu

# Map Age restrictions to numeric values.
# "Unknown" (the placeholder used for a missing Age) maps to None, which ends
# up as a missing value; a label that is not a key of the mapping is likewise
# mapped to a missing value by Series.map.
age_mapping = {
    "18+": 18,
    "16+": 16,
    "13+": 13,
    "7+": 7,
    "all": 0,
    "Unknown": None
}

# Apply mapping. .assign returns a new frame instead of writing a column into
# a frame that was produced by filtering another one, so this neither raises
# SettingWithCopyWarning nor depends on how often the cell has been run.
netflix_data = netflix_data.assign(Age_Numeric=netflix_data['Age'].map(age_mapping))
disney_data = disney_data.assign(Age_Numeric=disney_data['Age'].map(age_mapping))

# Drop missing or unknown age restrictions
netflix_age = netflix_data['Age_Numeric'].dropna()
disney_age = disney_data['Age_Numeric'].dropna()


In [None]:
# Run both hypothesis tests first, then report them in a fixed order.

# Age restrictions: one-sided Mann-Whitney U test. With alternative='less' the
# alternative hypothesis is that the first sample (Disney+) tends to take
# smaller values than the second (Netflix).
age_test_result = mannwhitneyu(
    disney_age,
    netflix_age,
    alternative='less',
)

# Rotten Tomatoes scores: two-sample t-test; equal_var=False selects Welch's
# variant, which does not assume equal population variances.
rotten_ttest_result = ttest_ind(
    disney_scores,
    netflix_scores,
    equal_var=False,
)

print("Mann-Whitney U Test (Age Restriction):", age_test_result)
print("Two-sample T-Test (Rotten Tomatoes Scores):", rotten_ttest_result)


In [None]:
from scipy.stats import mannwhitneyu

# Map age restrictions to numeric values.
# "Unknown" maps to None and therefore becomes a missing value, as does any
# label that is not a key of the mapping.
age_mapping = {
    "18+": 18,
    "16+": 16,
    "13+": 13,
    "7+": 7,
    "all": 0,
    "Unknown": None
}

# .assign builds a new frame rather than inserting a column into a filtered
# slice, which avoids SettingWithCopyWarning and keeps the cell re-runnable.
netflix_data = netflix_data.assign(Age_Numeric=netflix_data['Age'].map(age_mapping))
disney_data = disney_data.assign(Age_Numeric=disney_data['Age'].map(age_mapping))

# Drop missing or unknown age restrictions
netflix_age = netflix_data['Age_Numeric'].dropna()
disney_age = disney_data['Age_Numeric'].dropna()


In [None]:
# Perform the one-sided Mann-Whitney U-test.
# alternative='less': the alternative hypothesis is that Disney+ age
# restrictions tend to be lower than Netflix's.
u_stat, p_value = mannwhitneyu(disney_age, netflix_age, alternative='less')

print(f"Mann-Whitney U Statistic: {u_stat}")
print(f"P-Value: {p_value}")

# Interpret the result at the conventional 5% significance level
alpha = 0.05
if p_value < alpha:
    print("Conclusion: Disney+ has significantly lower age restrictions than Netflix.")
else:
    # A non-significant one-sided test only means there is no evidence for
    # "Disney+ lower"; it says nothing about differences in the other
    # direction, so it must not be reported as "no difference".
    print("Conclusion: No significant evidence that Disney+ has lower age restrictions than Netflix.")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Summary Statistics for Age (numeric restriction levels, unknowns already dropped)
age_stats_disney = disney_age.describe()
age_stats_netflix = netflix_age.describe()

# Bar Plot for Age Restrictions
age_counts_netflix = netflix_data['Age'].value_counts()
age_counts_disney = disney_data['Age'].value_counts()

# The two count Series are aligned on the union of age labels; labels absent
# from one platform come out as NaN and are turned into 0.
age_df = pd.DataFrame({'Netflix': age_counts_netflix, 'Disney+': age_counts_disney}).fillna(0)

# DataFrame.plot creates a fresh figure when no Axes is passed, so a preceding
# plt.figure() would only add a blank figure to the output. Draw on explicit
# axes instead.
fig, ax = plt.subplots(figsize=(12, 6))
age_df.plot(kind='bar', ax=ax, color=['blue', 'orange'], alpha=0.7)
ax.set_title('Distribution of Age Restrictions')
ax.set_xlabel('Age Restriction')
ax.set_ylabel('Number of Movies')
ax.legend(title='Platform')
plt.show()

# Descriptive statistics of the Rotten Tomatoes scores, one table per platform
rotten_stats_netflix = netflix_scores.describe()
rotten_stats_disney = disney_scores.describe()

# Histogram (with KDE curve) of the scores, both platforms on the same axes
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(netflix_scores, bins=20, kde=True, color='blue', label='Netflix', ax=ax)
sns.histplot(disney_scores, bins=20, kde=True, color='orange', label='Disney+', ax=ax)
ax.set(
    title='Distribution of Rotten Tomatoes Scores',
    xlabel='Rotten Tomatoes Score',
    ylabel='Frequency',
)
ax.legend()
plt.show()


In [None]:
# ttest_ind and mannwhitneyu live in scipy.stats; importing them from the
# top-level scipy package raises ImportError.
from scipy.stats import ttest_ind, mannwhitneyu

# Mann-Whitney U-Test for Age Restrictions.
# One-sided: the alternative hypothesis is that Disney+ restrictions tend to be
# lower than Netflix's.
u_stat, p_value_age = mannwhitneyu(disney_age, netflix_age, alternative='less')
print(f"Mann-Whitney U Statistic: {u_stat}, P-Value: {p_value_age}")

# Two-Sample t-Test for Rotten Tomatoes Scores.
# equal_var=False selects Welch's t-test (no equal-variance assumption).
t_stat, p_value_rotten = ttest_ind(disney_scores, netflix_scores, equal_var=False)
print(f"Two-Sample T-Test Statistic: {t_stat}, P-Value: {p_value_rotten}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Rotten Tomatoes score distributions for Netflix and Disney+, drawn as
# overlapping histograms with KDE curves on one set of axes.
fig, ax = plt.subplots(figsize=(12, 6))
platform_series = [
    ('Netflix', 'blue', netflix_scores),
    ('Disney+', 'orange', disney_scores),
]
for platform, colour, scores in platform_series:
    sns.histplot(scores, kde=True, color=colour, label=platform, bins=20, ax=ax)
ax.set_title('Distribution of Rotten Tomatoes Scores')
ax.set_xlabel('Rotten Tomatoes Score')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()


In [None]:
# Bar Plot for Age Restrictions
age_counts_netflix = netflix_data['Age'].value_counts()
age_counts_disney = disney_data['Age'].value_counts()

# Counts are aligned on the union of age labels; a label missing on one
# platform is NaN after alignment and is replaced by 0.
age_df = pd.DataFrame({'Netflix': age_counts_netflix, 'Disney+': age_counts_disney}).fillna(0)

# Pass explicit axes: without `ax`, DataFrame.plot opens a new figure and a
# preceding plt.figure() is left behind as an empty figure in the output.
fig, ax = plt.subplots(figsize=(12, 6))
age_df.plot(kind='bar', ax=ax, color=['blue', 'orange'], alpha=0.7)
ax.set_title('Distribution of Age Restrictions')
ax.set_xlabel('Age Restriction')
ax.set_ylabel('Number of Movies')
ax.legend(title='Platform')
plt.show()
