# Premier League 2024-25 Data Analysis - Part 2: Statistical Analysis & Visualization

This notebook focuses on applying statistical techniques and creating visualizations to gain insights from the Premier League 2024-25 dataset.

## 1. Import Libraries

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Set plot style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')
plt.rcParams['figure.figsize'] = [12, 8]

# Display settings
%matplotlib inline
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)

## 2. Load the Cleaned Data

In [None]:
# Read the cleaned CSV into the working DataFrame used by every later cell
cleaned_file_path = '../data/pl_2024_25_cleaned.csv'
df = pd.read_csv(filepath_or_buffer=cleaned_file_path)

# Report the (rows, columns) shape, then preview the leading rows
print("Dataset shape: {}".format(df.shape))
df.head(n=5)

## 3. Descriptive Statistics

In [None]:
# Summarise every column (numeric and non-numeric) with one row per column,
# then attach skewness and kurtosis; both are computed on numeric columns only,
# so the remaining rows hold NaN for these two measures.
desc_stats = (
    df.describe(include='all')
    .T
    .assign(
        skewness=df.skew(numeric_only=True),
        kurtosis=df.kurtosis(numeric_only=True),
    )
)

# Show the combined summary table
desc_stats

## 4. Key Performance Metrics Visualization

In [None]:
# Example visualization for key performance metrics
# (assuming columns like 'team', 'goals', 'points', etc. exist - to be modified based on actual data)

# Top teams by points
# plt.figure(figsize=(14, 8))
# sns.barplot(x='points', y='team', data=df.sort_values('points', ascending=False).head(10))
# plt.title('Top 10 Teams by Points')
# plt.xlabel('Points')
# plt.ylabel('Team')
# plt.tight_layout()
# plt.show()

In [None]:
# Goals scored vs. goals conceded
# (assuming columns like 'team', 'goals_for', 'goals_against' exist - to be modified based on actual data)

# plt.figure(figsize=(12, 10))
# plt.scatter(df['goals_for'], df['goals_against'], s=100, alpha=0.7)
# 
# for i, team in enumerate(df['team']):
#     plt.annotate(team, 
#                  (df['goals_for'].iloc[i], df['goals_against'].iloc[i]),
#                  xytext=(5, 5), textcoords='offset points')
#     
# plt.axhline(y=df['goals_against'].mean(), color='r', linestyle='--', alpha=0.3, label='Avg Goals Against')
# plt.axvline(x=df['goals_for'].mean(), color='g', linestyle='--', alpha=0.3, label='Avg Goals For')
# 
# plt.title('Goals Scored vs. Goals Conceded by Team')
# plt.xlabel('Goals Scored')
# plt.ylabel('Goals Conceded')
# plt.grid(True, alpha=0.3)
# plt.legend()
# plt.tight_layout()
# plt.show()

## 5. Statistical Tests

In [None]:
# Example: t-test to compare home vs. away performance
# (assuming columns like 'home_goals', 'away_goals' exist - to be modified based on actual data)

# t_stat, p_value = stats.ttest_ind(df['home_goals'], df['away_goals'])
# print(f"T-test for Home vs. Away Goals:")
# print(f"t-statistic: {t_stat:.4f}")
# print(f"p-value: {p_value:.4f}")
# print(f"Conclusion: {'Statistically significant difference' if p_value < 0.05 else 'No statistically significant difference'}")

In [None]:
# Example: Chi-square test for association between categorical variables
# (to be modified based on actual data)

# contingency_table = pd.crosstab(df['result'], df['home_away'])
# chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
# 
# print(f"Contingency table:\n{contingency_table}\n")
# print(f"Chi-square test for association between Result and Home/Away:")
# print(f"Chi2 value: {chi2:.4f}")
# print(f"p-value: {p:.4f}")
# print(f"Conclusion: {'Statistically significant association' if p < 0.05 else 'No statistically significant association'}")

## 6. Correlation Analysis

In [None]:
# Select every numeric column for the correlation analysis. The generic
# 'number' selector is used instead of listing int64/float64 so that numeric
# columns stored with another width (e.g. int32, float32) are not silently
# left out of the matrix.
numerical_cols = df.select_dtypes(include='number').columns.tolist()

# Pairwise correlation matrix of the numeric columns (pandas' default method)
corr_matrix = df[numerical_cols].corr()

# Mask the upper triangle (diagonal included): the matrix is symmetric, so only
# the lower triangle carries non-redundant information.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Draw the annotated heatmap on an explicit Axes so the figure is self-contained
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix of Numerical Features', fontsize=16)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Identify highly correlated feature pairs (|r| above the threshold).
threshold = 0.7

# Exclude only the diagonal (a feature paired with itself). The previous
# `corr_matrix != 1.0` filter also threw away distinct features that are
# perfectly correlated, and was unreliable because floating-point noise can
# make a perfect correlation differ slightly from exactly 1.0.
off_diagonal = ~np.eye(len(corr_matrix), dtype=bool)
high_corr = (corr_matrix.abs() > threshold) & off_diagonal

# Walk the lower triangle so each unordered pair is reported exactly once
feature_names = corr_matrix.columns
correlated_pairs = [
    (feature_names[i], feature_names[j], corr_matrix.iloc[i, j])
    for i in range(len(feature_names))
    for j in range(i)
    if high_corr.iloc[i, j]
]

# Display the results
if correlated_pairs:
    corr_df = pd.DataFrame(correlated_pairs, columns=['Feature 1', 'Feature 2', 'Correlation'])
    corr_df = corr_df.sort_values('Correlation', ascending=False)
    print(f"Highly correlated features (|r| > {threshold}):")
    # A bare expression inside an `if` block is not rendered by Jupyter (only a
    # top-level trailing expression is), so the table must be shown explicitly.
    # `display` is provided by the IPython kernel without an import.
    display(corr_df)
else:
    print(f"No feature pairs with correlation higher than {threshold}")

## 7. Advanced Visualizations

In [None]:
# Example: Create an interactive scatter plot using Plotly
# (to be modified based on actual data)

# fig = px.scatter(df, x='shots', y='goals', color='team', size='xG',
#                  hover_name='team', log_x=True, size_max=15,
#                  title='Relationship between Shots, Goals, and Expected Goals (xG)')
# 
# fig.update_layout(
#     xaxis_title='Number of Shots (log scale)',
#     yaxis_title='Number of Goals',
#     legend_title='Team',
#     height=600,
#     width=900
# )
# 
# fig.show()

In [None]:
# Example: Create a radar chart for team comparison
# (to be modified based on actual data)

# def create_radar_chart(team_names, metrics):
#     
#     # Filter data for selected teams
#     teams_data = df[df['team'].isin(team_names)]
#     
#     # Create figure
#     fig = go.Figure()
#     
#     # Add traces for each team
#     for team in team_names:
#         team_data = teams_data[teams_data['team'] == team]
#         values = team_data[metrics].values.flatten().tolist()
#         values += [values[0]]  # Close the loop
#         
#         fig.add_trace(go.Scatterpolar(
#             r=values,
#             theta=[*metrics, metrics[0]],
#             fill='toself',
#             name=team
#         ))
#     
#     # Update layout
#     fig.update_layout(
#         polar=dict(
#             radialaxis=dict(
#                 visible=True,
#             )
#         ),
#         title="Team Comparison across Key Metrics",
#         height=600,
#         width=800
#     )
#     
#     return fig
# 
# # Example usage (to be modified based on actual data)
# # team_names = ['Manchester City', 'Liverpool', 'Arsenal']
# # metrics = ['points', 'goals_for', 'goals_against', 'shots', 'possession']
# # radar_fig = create_radar_chart(team_names, metrics)
# # radar_fig.show()

## 8. Time Series Analysis

In [None]:
# Example: Analyze performance trends over time
# (assuming a date/match_week column exists - to be modified based on actual data)

# # Convert date column to datetime if needed
# # df['date'] = pd.to_datetime(df['date'])
# 
# # Group by date and calculate metrics
# # time_series = df.groupby('match_week').agg({
# #     'goals': 'sum',
# #     'shots': 'sum',
# #     'cards': 'sum'
# # }).reset_index()
# 
# # Plot time series
# # plt.figure(figsize=(15, 10))
# 
# # plt.subplot(3, 1, 1)
# # plt.plot(time_series['match_week'], time_series['goals'], marker='o', linestyle='-')
# # plt.title('Goals per Match Week')
# # plt.grid(True, alpha=0.3)
# 
# # plt.subplot(3, 1, 2)
# # plt.plot(time_series['match_week'], time_series['shots'], marker='s', linestyle='-', color='orange')
# # plt.title('Shots per Match Week')
# # plt.grid(True, alpha=0.3)
# 
# # plt.subplot(3, 1, 3)
# # plt.plot(time_series['match_week'], time_series['cards'], marker='^', linestyle='-', color='red')
# # plt.title('Cards per Match Week')
# # plt.grid(True, alpha=0.3)
# 
# # plt.tight_layout()
# # plt.show()

## 9. Hypothesis Testing

In [None]:
# Example: Test hypotheses about team performance
# (to be modified based on actual data)

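# Hypothesis 1: Home teams score more goals than away teams
# NOTE: stats.ttest_ind below is two-sided by default, so it only tests for a difference; pass alternative='greater' to test this directional claim.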
# t_stat, p_value = stats.ttest_ind(df['home_goals'], df['away_goals'])
# alpha = 0.05
# print(f"Hypothesis 1: Home teams score more goals than away teams")
# print(f"t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
# print(f"Result: {'Reject null hypothesis' if p_value < alpha else 'Fail to reject null hypothesis'}")
# print(f"Interpretation: {'There is a significant difference in goals scored by home vs. away teams' if p_value < alpha else 'There is no significant difference in goals scored by home vs. away teams'}\n")

# Hypothesis 2: Teams with higher possession have higher win rates
# correlation = df['possession'].corr(df['win_rate'])
# t_stat = correlation * np.sqrt((len(df) - 2) / (1 - correlation**2))
# p_value = 2 * (1 - stats.t.cdf(abs(t_stat), len(df) - 2))
# print(f"Hypothesis 2: Teams with higher possession have higher win rates")
# print(f"Correlation: {correlation:.4f}, t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
# print(f"Result: {'Reject null hypothesis' if p_value < alpha else 'Fail to reject null hypothesis'}")
# print(f"Interpretation: {'There is a significant correlation between possession and win rate' if p_value < alpha else 'There is no significant correlation between possession and win rate'}")

## 10. Probability Analysis

In [None]:
# Example: Calculate win probabilities based on various factors
# (to be modified based on actual data)

# # Calculate win probability by home/away status
# home_away_results = df.groupby('home_away').agg({
#     'wins': 'sum',
#     'games': 'sum'
# })
# home_away_results['win_probability'] = home_away_results['wins'] / home_away_results['games']
# 
# # Display results
# print("Win Probability by Home/Away Status:")
# print(home_away_results[['win_probability']])
# 
# # Visualize
# plt.figure(figsize=(10, 6))
# sns.barplot(x=home_away_results.index, y='win_probability', data=home_away_results)
# plt.title('Win Probability: Home vs. Away')
# plt.ylim(0, 1)
# plt.ylabel('Probability')
# plt.grid(axis='y', alpha=0.3)
# plt.show()

## 11. Save Visualizations

In [None]:
# Save key visualizations for the dashboard and report
# (to be implemented based on actual visualizations created)

# Example:
# plt.figure(figsize=(12, 8))
# sns.barplot(x='team', y='points', data=df.sort_values('points', ascending=False))
# plt.title('Teams by Points')
# plt.xticks(rotation=45, ha='right')
# plt.tight_layout()
# plt.savefig('../visualizations/team_points.png', dpi=300)
# plt.close()

## 12. Next Steps

In the next notebook, we'll:
1. Prepare the data for machine learning models
2. Implement various prediction models
3. Evaluate model performance
4. Generate predictions