In [3]:
# IPL Data Analytics Tutorial - Day 3
# Modules: Data Formats, Data Modelling, Descriptive Statistics, Queries

import pandas as pd  # For data manipulation and analysis
import numpy as np   # For numerical operations
import matplotlib.pyplot as plt  # For creating visualizations
import seaborn as sns  # For statistical visualizations
from datetime import datetime  # For date operations

print("🏏 Welcome to IPL Data Analytics - Day 3!")
print("=" * 50)

# ==========================================
# MODULE 1: DATA FORMATS & PYTHON BASICS
# ==========================================

print("\n📊 MODULE 1: DATA FORMATS & PYTHON BASICS")
print("-" * 40)

# Let's create a sample IPL dataset to understand data formats
# In real scenarios, you'd load this from CSV files we found online

# Creating sample IPL match data (this simulates what you'd get from Kaggle/GitHub)
ipl_matches = {
    'match_id': [1, 2, 3, 4, 5, 6, 7, 8],
    'season': [2024, 2024, 2024, 2024, 2024, 2024, 2024, 2024],
    'team1': ['Mumbai Indians', 'Chennai Super Kings', 'Royal Challengers Bangalore', 
              'Kolkata Knight Riders', 'Delhi Capitals', 'Punjab Kings', 
              'Rajasthan Royals', 'Sunrisers Hyderabad'],
    'team2': ['Chennai Super Kings', 'Royal Challengers Bangalore', 'Mumbai Indians',
              'Delhi Capitals', 'Punjab Kings', 'Rajasthan Royals',
              'Sunrisers Hyderabad', 'Kolkata Knight Riders'],
    'winner': ['Mumbai Indians', 'Chennai Super Kings', 'Mumbai Indians',
               'Kolkata Knight Riders', 'Delhi Capitals', 'Rajasthan Royals',
               'Rajasthan Royals', 'Sunrisers Hyderabad'],
    'runs_team1': [185, 178, 192, 165, 201, 156, 188, 174],
    'runs_team2': [182, 175, 189, 163, 198, 159, 185, 176],
    'venue': ['Wankhede Stadium', 'M. A. Chidambaram Stadium', 'M. Chinnaswamy Stadium',
              'Eden Gardens', 'Arun Jaitley Stadium', 'PCA Stadium',
              'Sawai Mansingh Stadium', 'Rajiv Gandhi International Stadium'],
    'margin_runs': [3, 3, 3, 2, 3, -3, 3, -2],  # Negative means team2 won
    'margin_wickets': [0, 0, 0, 0, 0, 3, 0, 2]   # 0 means won by runs
}

# Converting dictionary to DataFrame (this is how we work with tabular data in Python)
df_matches = pd.DataFrame(ipl_matches)

print("✅ Created IPL matches DataFrame")
print("Shape of our data:", df_matches.shape)  # Shows (rows, columns)
print("\nFirst 3 rows of our dataset:")
print(df_matches.head(3))

# Understanding different data formats
print("\n🔍 DATA FORMATS EXPLANATION:")
print("1. CSV (Comma Separated Values) - Most common for tabular data")
print("2. JSON (JavaScript Object Notation) - Good for nested/hierarchical data")
print("3. DataFrame - Pandas structure for data analysis in Python")

# Data types in our dataset
print("\n📋 Data Types in our dataset:")
print(df_matches.dtypes)

🏏 Welcome to IPL Data Analytics - Day 3!

📊 MODULE 1: DATA FORMATS & PYTHON BASICS
----------------------------------------
✅ Created IPL matches DataFrame
Shape of our data: (8, 10)

First 3 rows of our dataset:
   match_id  season                        team1                        team2  \
0         1    2024               Mumbai Indians          Chennai Super Kings   
1         2    2024          Chennai Super Kings  Royal Challengers Bangalore   
2         3    2024  Royal Challengers Bangalore               Mumbai Indians   

                winner  runs_team1  runs_team2                      venue  \
0       Mumbai Indians         185         182           Wankhede Stadium   
1  Chennai Super Kings         178         175  M. A. Chidambaram Stadium   
2       Mumbai Indians         192         189     M. Chinnaswamy Stadium   

   margin_runs  margin_wickets  
0            3               0  
1            3               0  
2            3               0  

🔍 DATA FORMATS EXPLA

In [5]:
# ==========================================
# MODULE 2: DATA MODELLING
# ==========================================

print("\n\n🏗️ MODULE 2: DATA MODELLING")
print("-" * 40)

# Data modelling means organizing our data in a logical structure
# Let's create additional related tables (like a proper database)

# Player performance data
player_stats = {
    'player_name': ['Virat Kohli', 'MS Dhoni', 'Rohit Sharma', 'KL Rahul', 
                   'Rishabh Pant', 'Mayank Agarwal', 'Jos Buttler', 'Kane Williamson'],
    'team': ['Royal Challengers Bangalore', 'Chennai Super Kings', 'Mumbai Indians', 
             'Punjab Kings', 'Delhi Capitals', 'Punjab Kings', 'Rajasthan Royals', 
             'Sunrisers Hyderabad'],
    'matches_played': [8, 8, 8, 8, 8, 8, 8, 8],
    'runs_scored': [342, 289, 367, 421, 308, 198, 456, 234],
    'average': [42.75, 36.12, 45.87, 52.62, 38.50, 24.75, 57.00, 29.25],
    'strike_rate': [138.5, 142.3, 145.2, 135.8, 149.1, 128.4, 152.3, 115.2],
    'centuries': [1, 0, 1, 2, 0, 0, 2, 0],
    'fifties': [2, 3, 2, 2, 3, 1, 2, 2]
}

df_players = pd.DataFrame(player_stats)

print("✅ Created player statistics DataFrame")
print("Player data shape:", df_players.shape)
print("\nTop 3 players by runs:")
print(df_players.head(3))

# Team statistics
team_stats = {
    'team_name': ['Mumbai Indians', 'Chennai Super Kings', 'Royal Challengers Bangalore', 
                  'Kolkata Knight Riders', 'Delhi Capitals', 'Punjab Kings', 
                  'Rajasthan Royals', 'Sunrisers Hyderabad'],
    'matches_played': [8, 8, 8, 8, 8, 8, 8, 8],
    'wins': [4, 3, 2, 3, 4, 2, 5, 3],
    'losses': [4, 5, 6, 5, 4, 6, 3, 5],
    'points': [8, 6, 4, 6, 8, 4, 10, 6],
    'net_run_rate': [0.45, -0.23, -0.67, 0.12, 0.78, -0.89, 1.23, 0.34]
}

df_teams = pd.DataFrame(team_stats)

print("\n✅ Created team statistics DataFrame")
print("Team data shape:", df_teams.shape)

# Data Modelling Principles Applied:
print("\n🎯 DATA MODELLING PRINCIPLES APPLIED:")
print("1. Normalization: Split data into logical tables (matches, players, teams)")
print("2. Relationships: Teams appear in both matches and player tables")
print("3. Data Integrity: Consistent team names across tables")
print("4. Scalability: Easy to add more matches, players, or teams")



🏗️ MODULE 2: DATA MODELLING
----------------------------------------
✅ Created player statistics DataFrame
Player data shape: (8, 8)

Top 3 players by runs:
    player_name                         team  matches_played  runs_scored  \
0   Virat Kohli  Royal Challengers Bangalore               8          342   
1      MS Dhoni          Chennai Super Kings               8          289   
2  Rohit Sharma               Mumbai Indians               8          367   

   average  strike_rate  centuries  fifties  
0    42.75        138.5          1        2  
1    36.12        142.3          0        3  
2    45.87        145.2          1        2  

✅ Created team statistics DataFrame
Team data shape: (8, 6)

🎯 DATA MODELLING PRINCIPLES APPLIED:
1. Normalization: Split data into logical tables (matches, players, teams)
2. Relationships: Teams appear in both matches and player tables
3. Data Integrity: Consistent team names across tables
4. Scalability: Easy to add more matches, players, or 

In [7]:
# ==========================================
# MODULE 3: DESCRIPTIVE STATISTICS
# ==========================================

print("\n\n📈 MODULE 3: DESCRIPTIVE STATISTICS")
print("-" * 40)

# Descriptive statistics help us understand our data through numbers
print("🔢 BASIC STATISTICS FOR PLAYER RUNS:")

# Basic statistical measures
runs_stats = df_players['runs_scored'].describe()
print(runs_stats)

print("\n📊 WHAT THESE STATISTICS MEAN:")
print(f"• Mean (Average): {runs_stats['mean']:.1f} runs per player")
print(f"• Median (50th percentile): {runs_stats['50%']:.1f} runs")
print(f"• Standard Deviation: {runs_stats['std']:.1f} (shows spread)")
print(f"• Minimum: {runs_stats['min']:.0f} runs")
print(f"• Maximum: {runs_stats['max']:.0f} runs")

# More detailed analysis
print("\n🎯 ADDITIONAL INSIGHTS:")
total_runs = df_players['runs_scored'].sum()
avg_strike_rate = df_players['strike_rate'].mean()
print(f"• Total runs by all players: {total_runs}")
print(f"• Average strike rate: {avg_strike_rate:.1f}")

# Finding top performers
top_scorer = df_players.loc[df_players['runs_scored'].idxmax()]
print(f"• Top scorer: {top_scorer['player_name']} with {top_scorer['runs_scored']} runs")

highest_avg = df_players.loc[df_players['average'].idxmax()]
print(f"• Highest average: {highest_avg['player_name']} with {highest_avg['average']}")

# Team statistics
print("\n🏆 TEAM PERFORMANCE STATISTICS:")
team_wins_stats = df_teams['wins'].describe()
print(f"• Average wins per team: {team_wins_stats['mean']:.1f}")
print(f"• Most wins: {df_teams['wins'].max()}")
print(f"• Least wins: {df_teams['wins'].min()}")

# Correlation analysis (relationship between variables)
correlation = df_players['runs_scored'].corr(df_players['average'])
print(f"\n🔗 CORRELATION INSIGHT:")
print(f"• Runs scored vs Average correlation: {correlation:.3f}")
print("  (Values close to 1 show strong positive relationship)")

# ==========================================
# MODULE 4: QUERIES (Data Filtering)
# ==========================================

print("\n\n🔍 MODULE 4: QUERIES (Data Filtering)")
print("-" * 40)

# Queries help us filter and extract specific information from our data
print("BASIC QUERIES ON IPL DATA:")

# Query 1: Players with more than 300 runs
high_scorers = df_players[df_players['runs_scored'] > 300]
print(f"\n1️⃣ Players with 300+ runs: {len(high_scorers)}")
print(high_scorers[['player_name', 'runs_scored', 'team']].to_string(index=False))

# Query 2: Teams with positive net run rate
positive_nrr = df_teams[df_teams['net_run_rate'] > 0]
print(f"\n2️⃣ Teams with positive Net Run Rate: {len(positive_nrr)}")
print(positive_nrr[['team_name', 'wins', 'net_run_rate']].to_string(index=False))

# Query 3: Players from specific teams
mi_csk_players = df_players[df_players['team'].isin(['Mumbai Indians', 'Chennai Super Kings'])]
print(f"\n3️⃣ Players from MI and CSK: {len(mi_csk_players)}")
print(mi_csk_players[['player_name', 'team', 'runs_scored']].to_string(index=False))

# Query 4: Complex query - Players with high strike rate AND good average
power_players = df_players[(df_players['strike_rate'] > 140) & (df_players['average'] > 35)]
print(f"\n4️⃣ Power players (SR>140 & Avg>35): {len(power_players)}")
print(power_players[['player_name', 'strike_rate', 'average']].to_string(index=False))

# Query 5: Matches won by small margins
close_matches = df_matches[(df_matches['margin_runs'] <= 5) & (df_matches['margin_runs'] > 0)]
print(f"\n5️⃣ Close matches (won by ≤5 runs): {len(close_matches)}")
print(close_matches[['team1', 'team2', 'winner', 'margin_runs']].to_string(index=False))

# ==========================================
# PRACTICAL EXERCISE
# ==========================================

print("\n\n🎯 PRACTICAL EXERCISE")
print("-" * 40)
print("Try these queries yourself:")
print("1. Find players with strike rate > 150")
print("2. Find teams with more than 6 wins")
print("3. Calculate average runs per match")
print("4. Find the team with best net run rate")

# Solutions (you can uncomment to see results):
# exercise1 = df_players[df_players['strike_rate'] > 150]
# exercise2 = df_teams[df_teams['wins'] > 6]
# exercise3 = df_matches[['runs_team1', 'runs_team2']].mean().mean()
# exercise4 = df_teams.loc[df_teams['net_run_rate'].idxmax()]

print("\n🎉 Congratulations! You've completed Day 3 of IPL Data Analytics")
print("Next session: We'll explore Data Visualization and Machine Learning!")

# ==========================================
# HOMEWORK/PRACTICE
# ==========================================

print("\n\n📝 HOMEWORK FOR NEXT SESSION:")
print("-" * 40)
print("1. Download real IPL dataset from Kaggle")
print("2. Practice loading CSV files using pd.read_csv()")
print("3. Try creating 5 different queries on the data")
print("4. Calculate basic statistics for different columns")
print("5. Think about what patterns you'd like to discover in IPL data")

print("\n" + "="*50)
print("🏏 END OF DAY 3 TUTORIAL 🏏")
print("="*50)



📈 MODULE 3: DESCRIPTIVE STATISTICS
----------------------------------------
🔢 BASIC STATISTICS FOR PLAYER RUNS:
count      8.000000
mean     326.875000
std       88.153174
min      198.000000
25%      275.250000
50%      325.000000
75%      380.500000
max      456.000000
Name: runs_scored, dtype: float64

📊 WHAT THESE STATISTICS MEAN:
• Mean (Average): 326.9 runs per player
• Median (50th percentile): 325.0 runs
• Standard Deviation: 88.2 (shows spread)
• Minimum: 198 runs
• Maximum: 456 runs

🎯 ADDITIONAL INSIGHTS:
• Total runs by all players: 2615
• Average strike rate: 138.3
• Top scorer: Jos Buttler with 456 runs
• Highest average: Jos Buttler with 57.0

🏆 TEAM PERFORMANCE STATISTICS:
• Average wins per team: 3.2
• Most wins: 5
• Least wins: 2

🔗 CORRELATION INSIGHT:
• Runs scored vs Average correlation: 1.000
  (Values close to 1 show strong positive relationship)


🔍 MODULE 4: QUERIES (Data Filtering)
----------------------------------------
BASIC QUERIES ON IPL DATA:

1️⃣ Play