In [2]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from matplotlib.backends.backend_pdf import PdfPages

In [6]:
# Set style for better visuals
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

In [7]:
# Load external dataset
url = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/star-wars-survey/StarWars.csv'
df = pd.read_csv(url, encoding='ISO-8859-1')

In [8]:
# Data preprocessing
# Select relevant columns and clean data
df = df.rename(columns={
    'Have you seen any of the 6 films in the Star Wars franchise?': 'Seen_Any_Film',
    'Do you consider yourself to be a fan of the Star Wars film franchise?': 'Fan_Status',
    'Which of the following Star Wars films have you seen? Please select all that apply.': 'Seen_Ep1',
    'Unnamed: 4': 'Seen_Ep2',
    'Unnamed: 5': 'Seen_Ep3',
    'Unnamed: 6': 'Seen_Ep4',
    'Unnamed: 7': 'Seen_Ep5',
    'Unnamed: 8': 'Seen_Ep6',
    'Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.': 'Rank_Ep1',
    'Unnamed: 10': 'Rank_Ep2',
    'Unnamed: 11': 'Rank_Ep3',
    'Unnamed: 12': 'Rank_Ep4',
    'Unnamed: 13': 'Rank_Ep5',
    'Unnamed: 14': 'Rank_Ep6',
    'Please state whether you view the following characters favorably, unfavorably, or are unfamiliar with him/her.': 'Han_Solo_Rating',
    'Unnamed: 16': 'Luke_Skywalker_Rating'
})

In [9]:
#Clean and preprocess
df = df.iloc[1:]  # Remove first row (question descriptions)
df['Fan_Status'] = df['Fan_Status'].fillna('No').replace({'Yes': 'Yes', 'No': 'No'})
df['Seen_Any_Film'] = df['Seen_Any_Film'].replace({'Yes': 'Yes', 'No': 'No'})
df[['Rank_Ep1', 'Rank_Ep2', 'Rank_Ep3', 'Rank_Ep4', 'Rank_Ep5', 'Rank_Ep6']] = df[
    ['Rank_Ep1', 'Rank_Ep2', 'Rank_Ep3', 'Rank_Ep4', 'Rank_Ep5', 'Rank_Ep6']
].apply(pd.to_numeric, errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Fan_Status'] = df['Fan_Status'].fillna('No').replace({'Yes': 'Yes', 'No': 'No'})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Seen_Any_Film'] = df['Seen_Any_Film'].replace({'Yes': 'Yes', 'No': 'No'})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['Rank_Ep1', 'Rank_Ep2', 'Rank_Ep3', 'R

In [10]:
# Create correlation matrix for movie rankings
corr_matrix = df[['Rank_Ep1', 'Rank_Ep2', 'Rank_Ep3', 'Rank_Ep4', 'Rank_Ep5', 'Rank_Ep6']].corr()

In [11]:
# Calculate average rankings by fan status
avg_rankings = df.groupby('Fan_Status')[['Rank_Ep1', 'Rank_Ep2', 'Rank_Ep3', 'Rank_Ep4', 'Rank_Ep5', 'Rank_Ep6']].mean().reset_index()

In [15]:
# Create PDF for static visualizations
with PdfPages('star_wars_survey_visualizations.pdf') as pdf:
    # Bar Chart: Fan Status by Seen Any Film
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df, x='Seen_Any_Film', hue='Fan_Status')
    plt.title('Fan Status by Whether Respondents Have Seen Any Star Wars Film')
    plt.xlabel('Seen Any Star Wars Film')
    plt.ylabel('Count')
    plt.legend(title='Fan Status')
    plt.tight_layout()
    pdf.savefig()
    plt.close()

    # Pie Chart: Fan Status Distribution
    plt.figure(figsize=(8, 8))
    fan_counts = df['Fan_Status'].value_counts()
    plt.pie(fan_counts, labels=fan_counts.index, autopct='%1.1f%%', startangle=90)
    plt.title('Star Wars Fan Status Distribution')
    plt.tight_layout()
    pdf.savefig()
    plt.close()

In [21]:
# Pie Chart: Fan Status Distribution
plt.figure(figsize=(8, 8))
fan_counts = df['Fan_Status'].value_counts()
plt.pie(fan_counts, labels=fan_counts.index, autopct='%1.1f%%', startangle=90)
plt.title('Star Wars Fan Status Distribution')
plt.tight_layout()
pdf.savefig()
plt.close()

In [23]:
# Heatmap: Correlation of Movie Rankings
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Star Wars Movie Rankings')
plt.tight_layout()
pdf.savefig()
plt.close()

In [24]:
# Interactive Dashboard with Plotly
fig = make_subplots(rows=2, cols=2,
                    specs=[[{'type': 'bar'}, {'type': 'pie'}],
                           [{'type': 'heatmap'}, {'type': 'bar'}]],
                    subplot_titles=('Fan Status by Seen Any Film', 'Fan Status Distribution',
                                    'Correlation Matrix of Rankings', 'Average Rankings by Fan Status'))

In [25]:
# Fan Status Bar Chart
fan_counts = df.groupby(['Seen_Any_Film', 'Fan_Status']).size().unstack().fillna(0)
for fan_status in fan_counts.columns:
    fig.add_trace(
        go.Bar(x=fan_counts.index, y=fan_counts[fan_status], name=fan_status),
        row=1, col=1
    )

In [27]:
# Fan Status Pie Chart
fan_counts_total = df['Fan_Status'].value_counts()
fig.add_trace(
    go.Pie(labels=fan_counts_total.index, values=fan_counts_total.values, name='Fan Status'),
    row=1, col=2
)

# Correlation Heatmap
fig.add_trace(
    go.Heatmap(z=corr_matrix.values, x=corr_matrix.columns, y=corr_matrix.index,
               colorscale='RdBu', zmin=-1, zmax=1),
    row=2, col=1
)

# Average Rankings Bar Chart
for episode in ['Rank_Ep1', 'Rank_Ep2', 'Rank_Ep3', 'Rank_Ep4', 'Rank_Ep5', 'Rank_Ep6']:
    fig.add_trace(
        go.Bar(x=avg_rankings['Fan_Status'], y=avg_rankings[episode],
               name=episode.replace('Rank_', 'Episode ')),
        row=2, col=2
    )

# Update layout
fig.update_layout(
    height=800,
    width=1000,
    title_text='Star Wars Survey Dashboard',
    showlegend=True
)

# Save as HTML
fig.write_html('star_wars_survey_dashboard.html')

# Print key takeaways
print("Key Takeaways:")
print("1. Fan Status by Viewing: The bar chart shows the relationship between whether respondents have seen any Star Wars films and their fan status.")
print("2. Fan Distribution: The pie chart indicates the proportion of respondents who identify as Star Wars fans.")
print("3. Ranking Correlations: The heatmap reveals how rankings of different Star Wars episodes correlate with each other.")
print("4. Rankings by Fan Status: The average rankings bar chart highlights differences in movie preferences between fans and non-fans.")
print("\nOutputs:")
print("- Static visualizations saved as 'star_wars_survey_visualizations.pdf'")
print("- Interactive dashboard saved as 'star_wars_survey_dashboard.html'")

Key Takeaways:
1. Fan Status by Viewing: The bar chart shows the relationship between whether respondents have seen any Star Wars films and their fan status.
2. Fan Distribution: The pie chart indicates the proportion of respondents who identify as Star Wars fans.
3. Ranking Correlations: The heatmap reveals how rankings of different Star Wars episodes correlate with each other.
4. Rankings by Fan Status: The average rankings bar chart highlights differences in movie preferences between fans and non-fans.

Outputs:
- Static visualizations saved as 'star_wars_survey_visualizations.pdf'
- Interactive dashboard saved as 'star_wars_survey_dashboard.html'
