In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import f_oneway, stats

#### Loading Cleaned Data
We begin by loading our cleaned dataset and selecting only the relevant columns for this analysis: style and final_place.

In [None]:
# Read the cleaned finalists table from the project-relative Data/ folder
df = pd.read_csv(filepath_or_buffer='Data/finalists_cleaned.csv')

In [None]:
# Keep only the two columns this analysis uses, then discard any row
# that is missing a value in either of them.
df = df[['style', 'final_place']].dropna(axis=0, how='any')

#### Quick Peek at the Data

In [None]:
# Preview the first rows of the frame beneath a short heading
print("Sample data:", df.head(), sep="\n")

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns. `describe` has to be *called*: the bare attribute only echoes the
# bound-method repr and computes nothing.
df.describe()

#### Explore Style Distribution
Check how many entries belong to each style. This helps us understand if some styles are overrepresented.

In [None]:
# Number of entries per style, most frequent style first
df['style'].value_counts(sort=True, ascending=False)

We also prepare a version of the style column that includes the count for easier interpretation in plots:

In [None]:
# Entries per style, indexed by style name
style_counts = df.loc[:, 'style'].value_counts()

In [None]:
# Label column used for visualization: "<style> (n=<entries in that style>)"
df['style_with_count'] = df['style'].map(
    lambda style_name: f"{style_name} (n={style_counts[style_name]})"
)

#### Define Winning Entries
Add a column to flag whether a participant won (i.e., finished in first place).

In [None]:
# Winner flag: 1 when the entry finished in first place, 0 otherwise
df['is_winner'] = df['final_place'].eq(1).astype(int)

In [None]:
# Win rate per style: the mean of the 0/1 winner flag within each style,
# ordered from the highest rate to the lowest.
win_rates = (
    df.groupby('style')['is_winner']
    .mean()
    .sort_values(ascending=False)
)
win_rates

Convert the result into a clean DataFrame:

In [None]:
# Turn the win-rate Series into a two-column table: the index becomes the
# 'Style' column and the values become the 'Win Rate' column.
win_rates_df = win_rates.rename_axis('Style').reset_index(name='Win Rate')

#### Statistical Analysis: ANOVA Test
We use one-way ANOVA to test whether the mean final placement differs significantly across styles.

In [None]:
# One array of final placements per style; these are the samples fed to ANOVA
grouped = [
    placements.values
    for _, placements in df.groupby('style')['final_place']
]

In [None]:
# One-way ANOVA across the per-style placement samples; the result
# unpacks into the F statistic followed by its p-value.
anova_result = f_oneway(*grouped)
f_stat, p_val = anova_result

In [None]:
# Report the test statistic (2 decimals) and p-value (4 decimals), one per line
print(
    "\n=== ANOVA Test Results ===",
    f"F-statistic: {f_stat:.2f}",
    f"p-value: {p_val:.4f}",
    sep="\n",
)

In [None]:
# Compare the p-value against the 0.05 threshold and print the matching verdict
print(
    "✅ There is a statistically significant difference in final placement between styles."
    if p_val < 0.05
    else "❌ No statistically significant difference found between styles."
)

#### Visualization: Final Placement by Style
A boxplot helps visualize the distribution of placements per style, including medians, quartiles, and outliers.

In [None]:
# Boxplot of final_place for every style; the x labels carry the entry counts
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='style_with_count', y='final_place', data=df, ax=ax)
ax.set_title("Final Placement by Style (with Entry Counts)")
# Tilt the style labels so they do not overlap
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

#### Visualization: Win Rates by Style
We now visualize the percentage of winners in each style using a bar chart.

In [None]:
# Convert win rates to percentage.
# The table is rebuilt from the untouched `win_rates` fractions instead of
# scaling `win_rates_df['Win Rate']` in place: an in-place `* 100` compounds
# every time this cell is re-run, inflating the bars and labels each time.
win_rates_df = (
    win_rates.mul(100)
    .rename_axis('Style')
    .reset_index(name='Win Rate')
)

plt.figure(figsize=(8,5))
# hue mirrors x so each bar gets its own palette colour; the legend would be redundant
sns.barplot(data=win_rates_df, x='Style', y='Win Rate', hue='Style', palette='viridis', legend=False)
plt.title('Win Rates by Style (%)')
plt.ylabel('Win Rate (%)')
plt.xlabel('Style')
plt.ylim(0, win_rates_df['Win Rate'].max() + 2)  # little padding on top

# Add percentage labels on top of the bars (bar i sits at x position i)
for i, rate in enumerate(win_rates_df['Win Rate']):
    plt.text(i, rate + 0.3, f"{rate:.2f}%", ha='center')

plt.show()
