In [30]:
import json
import gzip
from pathlib import Path
from collections import defaultdict, Counter
from typing import Any, Dict, List, Set
import pandas as pd

# Configuration
FOTMOB_DATA_DIR = Path("data/fotmob/matches")

print("FotMob Field Inspector Initialized ‚úì")


FotMob Field Inspector Initialized ‚úì


In [31]:
def load_fotmob_json(file_path: Path) -> Dict[str, Any]:
    """Load FotMob JSON file (handles both .json and .gz files)."""
    if file_path.suffix == '.gz':
        with gzip.open(file_path, 'rt', encoding='utf-8') as f:
            return json.load(f)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

def get_all_fotmob_files(limit: int = None) -> List[Path]:
    """Get all FotMob match files."""
    files = []
    for date_dir in sorted(FOTMOB_DATA_DIR.glob("*")):
        if date_dir.is_dir():
            for file in sorted(date_dir.glob("match_*.json")) + sorted(date_dir.glob("match_*.gz")):
                files.append(file)
                if limit and len(files) >= limit:
                    return files
    return files

# Load sample files
all_files = get_all_fotmob_files()
print(f"Total FotMob files found: {len(all_files)}")
print(f"\nSample files:")
for f in all_files[:5]:
    print(f"  - {f}")


Total FotMob files found: 150

Sample files:
  - data\fotmob\matches\20251001\match_1000002056.json.gz
  - data\fotmob\matches\20251001\match_1000003608.json.gz
  - data\fotmob\matches\20251001\match_1000003609.json.gz
  - data\fotmob\matches\20251001\match_4692335.json.gz
  - data\fotmob\matches\20251001\match_4693111.json.gz


In [32]:
def extract_all_fields(obj: Any, prefix: str = "", field_info: Dict = None) -> Dict:
    """
    Recursively extract all fields from a nested JSON structure.
    Returns a dictionary with field paths as keys and metadata as values.
    """
    if field_info is None:
        field_info = defaultdict(lambda: {
            'types': set(),
            'sample_values': [],
            'null_count': 0,
            'total_count': 0,
            'is_list': False,
            'list_lengths': []
        })
    
    if obj is None:
        field_info[prefix]['null_count'] += 1
        field_info[prefix]['total_count'] += 1
        field_info[prefix]['types'].add('null')
        return field_info
    
    field_info[prefix]['total_count'] += 1
    
    if isinstance(obj, dict):
        field_info[prefix]['types'].add('dict')
        for key, value in obj.items():
            new_prefix = f"{prefix}.{key}" if prefix else key
            extract_all_fields(value, new_prefix, field_info)
    
    elif isinstance(obj, list):
        field_info[prefix]['is_list'] = True
        field_info[prefix]['list_lengths'].append(len(obj))
        field_info[prefix]['types'].add('list')
        
        # Sample first few items to understand list structure
        for i, item in enumerate(obj[:3]):  # Sample first 3 items
            new_prefix = f"{prefix}[{i}]"
            extract_all_fields(item, new_prefix, field_info)
    
    else:
        # Primitive type
        field_info[prefix]['types'].add(type(obj).__name__)
        if len(field_info[prefix]['sample_values']) < 5:
            field_info[prefix]['sample_values'].append(obj)
    
    return field_info

print("Field extraction functions defined ‚úì")


Field extraction functions defined ‚úì


In [33]:
# Analyze fields across multiple files
num_files_to_analyze = min(10, len(all_files))
print(f"Analyzing {num_files_to_analyze} FotMob files...\n")

all_field_data = defaultdict(lambda: {
    'types': Counter(),
    'sample_values': [],
    'null_count': 0,
    'total_count': 0,
    'is_list': False,
    'list_lengths': [],
    'files_present': 0
})

for i, file_path in enumerate(all_files[:num_files_to_analyze], 1):
    print(f"Processing {i}/{num_files_to_analyze}: {file_path.name}...", end='\r')
    try:
        data = load_fotmob_json(file_path)
        field_info = extract_all_fields(data)
        
        # Aggregate field information
        for field_path, info in field_info.items():
            all_field_data[field_path]['files_present'] += 1
            all_field_data[field_path]['total_count'] += info['total_count']
            all_field_data[field_path]['null_count'] += info['null_count']
            all_field_data[field_path]['is_list'] = info['is_list']
            all_field_data[field_path]['list_lengths'].extend(info['list_lengths'])
            
            for type_name in info['types']:
                all_field_data[field_path]['types'][type_name] += 1
            
            if len(all_field_data[field_path]['sample_values']) < 10:
                all_field_data[field_path]['sample_values'].extend(
                    [v for v in info['sample_values'] 
                     if v not in all_field_data[field_path]['sample_values']][:10]
                )
    except Exception as e:
        print(f"\nError processing {file_path}: {e}")

print(f"\n\n‚úì Analysis complete! Found {len(all_field_data)} unique field paths")


Analyzing 10 FotMob files...

Processing 10/10: match_4727857.json.gz.....

‚úì Analysis complete! Found 37399 unique field paths


In [34]:
# Create a comprehensive DataFrame with all field information
field_records = []

for field_path, info in sorted(all_field_data.items()):
    types_str = ', '.join(f"{k}({v})" for k, v in info['types'].most_common())
    sample_str = str(info['sample_values'][:3])[:100] if info['sample_values'] else "N/A"
    
    null_percentage = (info['null_count'] / info['total_count'] * 100) if info['total_count'] > 0 else 0
    
    list_info = ""
    if info['is_list'] and info['list_lengths']:
        avg_len = sum(info['list_lengths']) / len(info['list_lengths'])
        min_len = min(info['list_lengths'])
        max_len = max(info['list_lengths'])
        list_info = f"min:{min_len}, max:{max_len}, avg:{avg_len:.1f}"
    
    field_records.append({
        'Field Path': field_path,
        'Type(s)': types_str,
        'Files Present': info['files_present'],
        'Total Occurrences': info['total_count'],
        'Null Count': info['null_count'],
        'Null %': f"{null_percentage:.1f}%",
        'Is List': 'Yes' if info['is_list'] else 'No',
        'List Info': list_info,
        'Sample Values': sample_str
    })

df_fields = pd.DataFrame(field_records)
print(f"Created field analysis DataFrame with {len(df_fields)} rows\n")
print(df_fields.head(20))


Created field analysis DataFrame with 37399 rows

                                    Field Path           Type(s)  \
0                                                       dict(10)   
1                                         data          dict(10)   
2                                 data.content          dict(10)   
3                            data.content.buzz          null(10)   
4                             data.content.h2h  dict(9), bool(1)   
5                     data.content.h2h.matches           list(9)   
6                  data.content.h2h.matches[0]           dict(9)   
7             data.content.h2h.matches[0].away           dict(9)   
8          data.content.h2h.matches[0].away.id            str(9)   
9        data.content.h2h.matches[0].away.name            str(9)   
10        data.content.h2h.matches[0].finished           bool(9)   
11            data.content.h2h.matches[0].home           dict(9)   
12         data.content.h2h.matches[0].home.id            str(9)  

In [35]:
# Show top-level structure
print("=" * 80)
print("TOP-LEVEL STRUCTURE")
print("=" * 80)
top_level = df_fields[~df_fields['Field Path'].str.contains(r'\.|\[')]
print(top_level.to_string(index=False))
print(f"\nTotal top-level fields: {len(top_level)}")


TOP-LEVEL STRUCTURE
Field Path  Type(s)  Files Present  Total Occurrences  Null Count Null % Is List List Info                                                                              Sample Values
           dict(10)             10                 10           0   0.0%      No                                                                                                  N/A
      data dict(10)             10                 10           0   0.0%      No                                                                                                  N/A
      date  str(10)             10                 10           0   0.0%      No                                                                                         ['20251001']
  match_id  str(10)             10                 10           0   0.0%      No                                                           ['1000002056', '1000003608', '1000003609']
scraped_at  str(10)             10                 10           0   0.

In [36]:
# Explore specific sections (you can change these)
sections_to_explore = ['data.general', 'data.header', 'data.content', 'data.nav']

for section in sections_to_explore:
    print("\n" + "=" * 80)
    print(f"SECTION: {section}")
    print("=" * 80)
    section_fields = df_fields[df_fields['Field Path'].str.startswith(section)]
    
    if len(section_fields) > 0:
        # Show direct children only (not nested)
        depth = section.count('.') + 1
        direct_children = section_fields[
            section_fields['Field Path'].apply(lambda x: x.count('.') == depth and '[' not in x.split('.')[-1])
        ]
        print(f"\nDirect fields under {section}:")
        print(direct_children[['Field Path', 'Type(s)', 'Null %', 'Is List', 'Sample Values']].to_string(index=False))
        print(f"\nTotal fields in this section (including nested): {len(section_fields)}")
    else:
        print(f"No fields found for {section}")



SECTION: data.general

Direct fields under data.general:
                   Field Path  Type(s) Null % Is List                                                                                        Sample Values
        data.general.awayTeam dict(10)   0.0%      No                                                                                                  N/A
     data.general.countryCode  str(10)   0.0%      No                                                                                ['SCO', 'INT', 'IRL']
   data.general.coverageLevel  str(10)   0.0%      No                                                                           ['lower', 'xG', 'ratings']
        data.general.finished bool(10)   0.0%      No                                                                                               [True]
        data.general.homeTeam dict(10)   0.0%      No                                                                                                  N/A
        data

In [37]:
# Find fields with high null rates (potential issues)
print("\n" + "=" * 80)
print("FIELDS WITH HIGH NULL RATES (>50%)")
print("=" * 80)

high_null_fields = df_fields[
    df_fields['Null %'].str.rstrip('%').astype(float) > 50
].sort_values('Null %', ascending=False)

if len(high_null_fields) > 0:
    print(high_null_fields[['Field Path', 'Null %', 'Files Present', 'Sample Values']].head(30).to_string(index=False))
else:
    print("No fields with high null rates found!")



FIELDS WITH HIGH NULL RATES (>50%)
                                                        Field Path Null %  Files Present                                                                                        Sample Values
             data.content.matchFacts.events.events[0].overloadTime  90.0%             10                                                                                                  [0]
                      data.content.matchFacts.insights[2].playerId  85.7%              7                                                                                             [630365]
            data.content.matchFacts.insights[2].statValues[0].name  85.7%              7                                                                                    ['Simen Hestnes']
             data.content.matchFacts.events.events[0].shotmapEvent  85.7%              7                                                                                                  N/A
            da

In [38]:
# Search for specific fields by keyword
def search_fields(keyword: str, case_sensitive: bool = False):
    """Search for fields containing a keyword."""
    if case_sensitive:
        matches = df_fields[df_fields['Field Path'].str.contains(keyword)]
    else:
        matches = df_fields[df_fields['Field Path'].str.lower().str.contains(keyword.lower())]
    
    print(f"\n{'=' * 80}")
    print(f"SEARCH RESULTS FOR: '{keyword}' (Found {len(matches)} matches)")
    print('=' * 80)
    
    if len(matches) > 0:
        print(matches[['Field Path', 'Type(s)', 'Null %', 'Is List', 'Sample Values']].to_string(index=False))
    else:
        print(f"No fields found containing '{keyword}'")
    
    return matches

# Example searches - uncomment and modify as needed
# search_fields('player')
# search_fields('goal')
# search_fields('xg')
# search_fields('lineup')

print("\nUse search_fields('keyword') to find specific fields!")



Use search_fields('keyword') to find specific fields!


In [39]:
# Load a single file for detailed inspection
sample_file = all_files[0] if all_files else None

if sample_file:
    print(f"Loading detailed view of: {sample_file.name}\n")
    sample_data = load_fotmob_json(sample_file)
    
    print("=" * 80)
    print("SAMPLE FILE STRUCTURE")
    print("=" * 80)
    
    def print_structure(obj, indent=0, max_depth=3, current_depth=0):
        """Print nested structure with indentation."""
        if current_depth >= max_depth:
            return
        
        prefix = "  " * indent
        
        if isinstance(obj, dict):
            for key, value in list(obj.items())[:20]:  # Limit to first 20 keys
                if isinstance(value, dict):
                    print(f"{prefix}{key}: {{dict with {len(value)} keys}}")
                    print_structure(value, indent+1, max_depth, current_depth+1)
                elif isinstance(value, list):
                    print(f"{prefix}{key}: [list with {len(value)} items]")
                    if len(value) > 0:
                        print(f"{prefix}  Item type: {type(value[0]).__name__}")
                        if isinstance(value[0], dict):
                            print_structure(value[0], indent+2, max_depth, current_depth+1)
                else:
                    val_str = str(value)[:50]
                    print(f"{prefix}{key}: {type(value).__name__} = {val_str}")
        elif isinstance(obj, list) and len(obj) > 0:
            print(f"{prefix}[0]: {type(obj[0]).__name__}")
            print_structure(obj[0], indent+1, max_depth, current_depth+1)
    
    print_structure(sample_data)
else:
    print("No sample file available")


Loading detailed view of: match_1000002056.json.gz

SAMPLE FILE STRUCTURE
match_id: str = 1000002056
scraped_at: str = 2025-12-08T08:40:21.563836
date: str = 20251001
data: {dict with 7 keys}
  general: {dict with 16 keys}
    matchId: str = 1000002056
    matchName: str = Hibernian LFC-vs-Glasgow City_Wed, Oct 1, 2025, 18
    matchRound: str = 3
    teamColors: {dict with 4 keys}
    leagueId: int = 10791
    leagueName: str = Scottish SWPL 1
    leagueRoundName: str = 3
    parentLeagueId: int = 10791
    countryCode: str = SCO
    homeTeam: {dict with 2 keys}
    awayTeam: {dict with 2 keys}
    coverageLevel: str = lower
    matchTimeUTC: str = Wed, Oct 1, 2025, 18:35 UTC
    matchTimeUTCDate: str = 2025-10-01T18:35:00.000Z
    started: bool = True
    finished: bool = True
  header: {dict with 3 keys}
    teams: [list with 2 items]
      Item type: dict
    status: {dict with 12 keys}
    events: {dict with 4 keys}
  nav: [list with 5 items]
    Item type: str
  ongoing: bool = Fa

In [40]:
# Export full field list to CSV for easier review
output_file = "fotmob_field_analysis.csv"
df_fields.to_csv(output_file, index=False)
print(f"\n{'=' * 80}")
print(f"‚úì Full field analysis exported to: {output_file}")
print(f"{'=' * 80}")

# Summary statistics
print(f"\nüìä SUMMARY STATISTICS:")
print(f"  Total unique fields analyzed: {len(df_fields)}")
print(f"  Files analyzed: {num_files_to_analyze}")
print(f"  Fields that are lists: {len(df_fields[df_fields['Is List'] == 'Yes'])}")
print(f"  Fields with >50% null rate: {len(high_null_fields)}")
print(f"  Most common data types: {df_fields['Type(s)'].value_counts().head(5).to_dict()}")



‚úì Full field analysis exported to: fotmob_field_analysis.csv

üìä SUMMARY STATISTICS:
  Total unique fields analyzed: 37399
  Files analyzed: 10
  Fields that are lists: 870
  Fields with >50% null rate: 622
  Most common data types: {'str(1)': 13618, 'dict(1)': 12536, 'int(1)': 7011, 'list(1)': 771, 'bool(1)': 662}


In [41]:
# Deep dive into specific important sections
print("\n" + "=" * 80)
print("IMPORTANT SECTIONS BREAKDOWN")
print("=" * 80)

important_sections = {
    'Match Metadata': 'data.general',
    'Match Header': 'data.header',
    'Teams': 'data.header.teams',
    'Events': 'data.header.events',
    'Content Stats': 'data.content.stats',
    'Player Stats': 'data.content.stats.players',
    'Lineup': 'data.content.lineup',
    'Shotmap': 'data.content.shotmap',
    'Momentum': 'data.content.momentum'
}

for section_name, section_path in important_sections.items():
    section_data = df_fields[df_fields['Field Path'].str.startswith(section_path)]
    print(f"\nüìÅ {section_name} ({section_path})")
    print(f"   Total fields: {len(section_data)}")
    
    if len(section_data) > 0:
        # Count field types
        field_types = section_data['Type(s)'].value_counts().head(3)
        print(f"   Top types: {dict(field_types)}")
        
        # Show sample of direct children
        depth = section_path.count('.') + 1
        direct = section_data[
            section_data['Field Path'].apply(lambda x: x.count('.') == depth and '[' not in x.split('.')[-1])
        ]
        if len(direct) > 0:
            print(f"   Direct fields: {', '.join(direct['Field Path'].str.split('.').str[-1].head(10).tolist())}")




IMPORTANT SECTIONS BREAKDOWN

üìÅ Match Metadata (data.general)
   Total fields: 33
   Top types: {'str(10)': 19, 'dict(10)': 8, 'int(10)': 4}
   Direct fields: awayTeam, countryCode, coverageLevel, finished, homeTeam, leagueId, leagueName, leagueRoundName, matchId, matchName

üìÅ Match Header (data.header)
   Total fields: 1040
   Top types: {'str(1)': 317, 'int(1)': 255, 'null(1)': 212}
   Direct fields: events, status, teams

üìÅ Teams (data.header.teams)
   Total fields: 15
   Top types: {'str(10)': 6, 'int(10)': 4, 'dict(10)': 2}
   Direct fields: fifaRank, id, imageUrl, name, pageUrl, score, fifaRank, id, imageUrl, name

üìÅ Events (data.header.events)
   Total fields: 1000
   Top types: {'str(1)': 317, 'int(1)': 255, 'null(1)': 212}
   Direct fields: awayTeamGoals, awayTeamRedCards, homeTeamGoals, homeTeamRedCards

üìÅ Content Stats (data.content.stats)
   Total fields: 320
   Top types: {'str(8)': 152, 'dict(8)': 44, 'list(8)': 34}
   Direct fields: Periods

üìÅ Player S

# üîç FotMob Field Inspector - Interactive Guide

This notebook provides comprehensive analysis of all FotMob JSON fields. Use it to:

1. **Discover all fields** - See every field path across all match files
2. **Find missing/null data** - Identify fields with high null rates
3. **Search for specific fields** - Use `search_fields('keyword')` function
4. **Understand data types** - See what types each field contains
5. **Inspect sample values** - View example data for each field
6. **Export to CSV** - Full analysis saved to `fotmob_field_analysis.csv`

## Common Search Examples:
```python
# Search for player-related fields
search_fields('player')

# Search for goal/scoring data
search_fields('goal')

# Search for xG (expected goals) data
search_fields('xg')

# Search for lineup information
search_fields('lineup')

# Search for stats
search_fields('stats')
```


In [42]:
# Compare fields across different matches to find inconsistencies
print("\n" + "=" * 80)
print("FIELD CONSISTENCY CHECK")
print("=" * 80)

# Fields that don't appear in all files
inconsistent_fields = df_fields[df_fields['Files Present'] < num_files_to_analyze]
print(f"\nFields not present in ALL {num_files_to_analyze} files:")
print(f"Total inconsistent fields: {len(inconsistent_fields)}\n")

if len(inconsistent_fields) > 0:
    # Sort by how many files are missing this field
    inconsistent_fields_sorted = inconsistent_fields.sort_values('Files Present')
    print(inconsistent_fields_sorted[['Field Path', 'Files Present', 'Type(s)', 'Null %']].head(20).to_string(index=False))
    
    print(f"\n‚ö†Ô∏è Warning: {len(inconsistent_fields)} fields are not present in all files")
    print(f"   This could indicate:")
    print(f"   - Optional fields")
    print(f"   - Fields that depend on match type/league")
    print(f"   - Missing data that should be investigated")
else:
    print("‚úì All fields are present in all analyzed files!")



FIELD CONSISTENCY CHECK

Fields not present in ALL 10 files:
Total inconsistent fields: 36903

                                                                        Field Path  Files Present Type(s) Null %
                                    data.content.playerStats.1624169.usualPosition              1  int(1)   0.0%
        data.content.playerStats.1781532.stats[1].stats.Shots on target.stat.total              1  int(1)   0.0%
              data.content.playerStats.1781532.stats[1].stats.Shots on target.stat              1 dict(1)   0.0%
               data.content.playerStats.1781532.stats[1].stats.Shots on target.key              1  str(1)   0.0%
                   data.content.playerStats.1781532.stats[1].stats.Shots on target              1 dict(1)   0.0%
data.content.playerStats.1781532.stats[1].stats.Passes into final third.stat.value              1  int(1)   0.0%
 data.content.playerStats.1781532.stats[1].stats.Passes into final third.stat.type              1  str(1)   0.0%


In [43]:
# Interactive field browser - modify the path to explore different sections
def browse_section(path: str, max_children: int = 50):
    """Browse a specific section of the JSON structure."""
    section_fields = df_fields[df_fields['Field Path'].str.startswith(path)]
    
    print(f"\n{'=' * 80}")
    print(f"BROWSING: {path}")
    print(f"{'=' * 80}")
    
    if len(section_fields) == 0:
        print(f"‚ùå No fields found for path: {path}")
        print("\nDid you mean one of these?")
        # Find similar paths
        similar = df_fields[df_fields['Field Path'].str.contains(path.split('.')[-1], case=False)]
        if len(similar) > 0:
            print(similar['Field Path'].head(10).tolist())
        return
    
    # Show direct children
    depth = path.count('.') + 1
    direct_children = section_fields[
        section_fields['Field Path'].apply(lambda x: x.count('.') == depth and '[' not in x.split('.')[-1])
    ]
    
    print(f"\nüìä Section Overview:")
    print(f"   Total fields (including nested): {len(section_fields)}")
    print(f"   Direct children: {len(direct_children)}")
    
    if len(direct_children) > 0:
        print(f"\nüìã Direct Children ({min(len(direct_children), max_children)} shown):")
        display_cols = ['Field Path', 'Type(s)', 'Null %', 'Is List', 'List Info', 'Sample Values']
        print(direct_children[display_cols].head(max_children).to_string(index=False))
    
    # Show nested structures
    nested = section_fields[
        section_fields['Field Path'].apply(lambda x: x.count('.') > depth)
    ]
    if len(nested) > 0:
        print(f"\nüå≥ Nested Structures: {len(nested)} additional nested fields")
        # Group by parent path
        nested_parents = nested['Field Path'].apply(
            lambda x: '.'.join(x.split('.')[:depth+1])
        ).unique()
        print(f"   Nested under: {list(nested_parents[:10])}")
    
    return direct_children

# Example: Browse different sections (uncomment to use)
# browse_section('data')
# browse_section('data.general')
# browse_section('data.content.lineup')
# browse_section('data.content.shotmap')
# browse_section('data.content.stats')

print("\nüí° TIP: Use browse_section('path.to.section') to explore any section!")



üí° TIP: Use browse_section('path.to.section') to explore any section!


In [44]:
# Inspect specific match for debugging
def inspect_match(match_id: str = None, file_index: int = 0):
    """Load and inspect a specific match in detail."""
    if match_id:
        # Find file with this match_id
        target_file = None
        for f in all_files:
            if f"match_{match_id}" in f.name:
                target_file = f
                break
        if not target_file:
            print(f"‚ùå Match {match_id} not found")
            return None
    else:
        if file_index >= len(all_files):
            print(f"‚ùå File index {file_index} out of range (max: {len(all_files)-1})")
            return None
        target_file = all_files[file_index]
    
    print(f"\n{'=' * 80}")
    print(f"INSPECTING: {target_file.name}")
    print(f"{'=' * 80}")
    
    data = load_fotmob_json(target_file)
    
    # Show basic match info
    if 'data' in data and 'general' in data['data']:
        gen = data['data']['general']
        print(f"\nüìã Match Info:")
        print(f"   Match ID: {gen.get('matchId', 'N/A')}")
        print(f"   Match: {gen.get('matchName', 'N/A')}")
        print(f"   League: {gen.get('leagueName', 'N/A')}")
        print(f"   Home: {gen.get('homeTeam', {}).get('name', 'N/A')}")
        print(f"   Away: {gen.get('awayTeam', {}).get('name', 'N/A')}")
        print(f"   Status: {'Finished' if gen.get('finished') else 'Not Finished'}")
    
    # Show available top-level sections
    if 'data' in data:
        sections = list(data['data'].keys())
        print(f"\nüìÇ Available Sections: {len(sections)}")
        for section in sections:
            section_data = data['data'][section]
            if isinstance(section_data, dict):
                print(f"   ‚Ä¢ {section}: dict with {len(section_data)} keys")
            elif isinstance(section_data, list):
                print(f"   ‚Ä¢ {section}: list with {len(section_data)} items")
            else:
                print(f"   ‚Ä¢ {section}: {type(section_data).__name__}")
    
    return data

# Example usage:
# match_data = inspect_match(file_index=0)  # Inspect first file
# match_data = inspect_match(match_id='4947772')  # Inspect specific match

print("\nüí° TIP: Use inspect_match(file_index=N) or inspect_match(match_id='ID') to inspect specific matches!")



üí° TIP: Use inspect_match(file_index=N) or inspect_match(match_id='ID') to inspect specific matches!


In [45]:
# Quick troubleshooting - common issues and checks
print("\n" + "=" * 80)
print("üîß QUICK TROUBLESHOOTING CHECKS")
print("=" * 80)

# Check 1: Required fields presence
required_fields = [
    'data.general.matchId',
    'data.general.homeTeam',
    'data.general.awayTeam',
    'data.header.teams',
    'data.header.status'
]

print("\n1Ô∏è‚É£ Required Fields Check:")
for field in required_fields:
    field_data = df_fields[df_fields['Field Path'] == field]
    if len(field_data) > 0:
        null_pct = field_data.iloc[0]['Null %']
        files_present = field_data.iloc[0]['Files Present']
        print(f"   ‚úì {field}: {files_present}/{num_files_to_analyze} files, {null_pct} null")
    else:
        print(f"   ‚ùå {field}: NOT FOUND")

# Check 2: List fields with variable lengths
print("\n2Ô∏è‚É£ Variable Length Lists (potential data quality issues):")
variable_lists = df_fields[
    (df_fields['Is List'] == 'Yes') & 
    (df_fields['List Info'] != '')
]
if len(variable_lists) > 0:
    print(variable_lists[['Field Path', 'List Info', 'Files Present']].head(10).to_string(index=False))
else:
    print("   No variable length lists found")

# Check 3: Fields with multiple types (inconsistent data)
print("\n3Ô∏è‚É£ Fields with Multiple Types (data consistency check):")
multi_type_fields = df_fields[df_fields['Type(s)'].str.contains(',')]
if len(multi_type_fields) > 0:
    print(f"   Found {len(multi_type_fields)} fields with multiple types")
    print(multi_type_fields[['Field Path', 'Type(s)', 'Files Present']].head(10).to_string(index=False))
else:
    print("   ‚úì All fields have consistent types")

# Check 4: Empty lists
print("\n4Ô∏è‚É£ Frequently Empty Lists:")
empty_lists = variable_lists[variable_lists['List Info'].str.contains('min:0')]
if len(empty_lists) > 0:
    print(empty_lists[['Field Path', 'List Info', 'Files Present']].head(10).to_string(index=False))
else:
    print("   No frequently empty lists found")

print("\n" + "=" * 80)



üîß QUICK TROUBLESHOOTING CHECKS

1Ô∏è‚É£ Required Fields Check:
   ‚úì data.general.matchId: 10/10 files, 0.0% null
   ‚úì data.general.homeTeam: 10/10 files, 0.0% null
   ‚úì data.general.awayTeam: 10/10 files, 0.0% null
   ‚úì data.header.teams: 10/10 files, 0.0% null
   ‚úì data.header.status: 10/10 files, 0.0% null

2Ô∏è‚É£ Variable Length Lists (potential data quality issues):


                                                             Field Path              List Info  Files Present
                                               data.content.h2h.matches min:1, max:19, avg:8.6              9
                    data.content.h2h.matches[0].status.reason.penalties  min:2, max:2, avg:2.0              1
                    data.content.h2h.matches[1].status.reason.penalties  min:2, max:2, avg:2.0              1
                    data.content.h2h.matches[2].status.reason.penalties  min:2, max:2, avg:2.0              1
                                               data.content.h2h.summary  min:3, max:3, avg:3.0              9
                                  data.content.highlightStories.stories  min:1, max:1, avg:1.0              1
                       data.content.highlightStories.stories[0].content  min:1, max:1, avg:1.0              1
data.content.highlightStories.stories[0].content[0].restriction.allowed  min:0, max:0, avg:0.0              1
data.conte

## üìù Summary & Next Steps

This notebook has analyzed the FotMob JSON structure. Here's what you can do:

### Immediate Actions:
1. **Review the CSV export** - Open `fotmob_field_analysis.csv` for a complete field list
2. **Check high null rate fields** - Review fields with >50% null to see if this is expected
3. **Verify required fields** - Make sure critical fields exist in all files
4. **Investigate inconsistencies** - Look at fields not present in all files

### Common Use Cases:

#### Find all player-related fields:
```python
search_fields('player')
```

#### Browse lineup structure:
```python
browse_section('data.content.lineup')
```

#### Inspect a specific match:
```python
match_data = inspect_match(match_id='4947772')
# Then explore: match_data['data']['content']['shotmap']
```

#### Check for missing data:
```python
# Fields with >80% null rate
critical_nulls = df_fields[df_fields['Null %'].str.rstrip('%').astype(float) > 80]
print(critical_nulls[['Field Path', 'Null %']])
```

### Data Quality Checks:
- ‚úÖ All files load successfully
- ‚úÖ Required metadata fields present
- ‚ö†Ô∏è Check fields not present in all files
- ‚ö†Ô∏è Review fields with high null rates


In [46]:
# OPTIONAL: Analyze more files for better coverage
# Uncomment and run this cell to analyze ALL FotMob files (will take longer)

# print(f"Total files available: {len(all_files)}")
# num_to_analyze = int(input(f"How many files to analyze? (1-{len(all_files)}): "))
# Then re-run cells 3-17 with the new num_files_to_analyze value

print(f"\n{'=' * 80}")
print("üìä CURRENT ANALYSIS SCOPE")
print(f"{'=' * 80}")
print(f"Files analyzed: {num_files_to_analyze}")
print(f"Total files available: {len(all_files)}")
print(f"Coverage: {num_files_to_analyze/len(all_files)*100:.1f}%")
print(f"\nTo analyze more files, adjust 'num_files_to_analyze' in cell 3 and re-run cells 3-17")
print(f"{'=' * 80}")



üìä CURRENT ANALYSIS SCOPE
Files analyzed: 10
Total files available: 150
Coverage: 6.7%

To analyze more files, adjust 'num_files_to_analyze' in cell 3 and re-run cells 3-17


---

# üîß Field Mapping & Validation Fixes

This section identifies missing fields, naming mismatches, and provides fixes for the match processor.


In [47]:
# Define expected field mappings based on match processor
EXPECTED_FIELD_MAPPINGS = {
    'general': {
        'match_id': 'data.general.matchId',
        'match_name': 'data.general.matchName',
        'league_id': 'data.general.leagueId',
        'league_name': 'data.general.leagueName',
        'home_team_id': 'data.general.homeTeam.id',
        'home_team_name': 'data.general.homeTeam.name',
        'away_team_id': 'data.general.awayTeam.id',
        'away_team_name': 'data.general.awayTeam.name',
        'match_time_utc': 'data.general.matchTimeUTC',
        'started': 'data.general.started',
        'finished': 'data.general.finished',
    },
    'header_status': {
        'utc_time': 'data.header.status.utcTime',
        'finished': 'data.header.status.finished',
        'started': 'data.header.status.started',
        'cancelled': 'data.header.status.cancelled',
        'score_str': 'data.header.status.scoreStr',
    },
    'timeline': {
        'first_half_started': 'data.header.status.halfs.firstHalfStarted',
        'first_half_ended': 'data.header.status.halfs.firstHalfEnded',
        'second_half_started': 'data.header.status.halfs.secondHalfStarted',
        'second_half_ended': 'data.header.status.halfs.secondHalfEnded',
        'game_ended': 'data.header.status.halfs.gameEnded',
    },
    'goals': {
        'home_team_goals': 'data.header.events.homeTeamGoals',
        'away_team_goals': 'data.header.events.awayTeamGoals',
    },
    'content': {
        'match_facts': 'data.content.matchFacts',
        'stats': 'data.content.stats',
        'lineup': 'data.content.lineup',
        'shotmap': 'data.content.shotmap',
        'momentum': 'data.content.momentum',
        'player_stats': 'data.content.playerStats',
    }
}

print("‚úì Expected field mappings loaded")


‚úì Expected field mappings loaded


In [48]:
# Check which expected fields are present in actual data
def validate_field_mappings(expected_mappings: dict, actual_fields_df: pd.DataFrame) -> pd.DataFrame:
    """Check if expected fields exist in actual data."""
    results = []
    
    for category, fields in expected_mappings.items():
        for field_name, field_path in fields.items():
            # Check if field exists in actual data
            matching = actual_fields_df[actual_fields_df['Field Path'] == field_path]
            
            if len(matching) > 0:
                row = matching.iloc[0]
                status = "‚úì Present"
                null_pct = row['Null %']
                types = row['Type(s)']
                sample = str(row['Sample Values'])[:50]
            else:
                # Try to find similar fields
                similar = actual_fields_df[
                    actual_fields_df['Field Path'].str.contains(
                        field_path.split('.')[-1], case=False, na=False
                    )
                ]
                if len(similar) > 0:
                    status = "‚ö†Ô∏è Similar found"
                    null_pct = f"See: {similar.iloc[0]['Field Path']}"
                    types = ""
                    sample = ""
                else:
                    status = "‚ùå Missing"
                    null_pct = "N/A"
                    types = ""
                    sample = ""
            
            results.append({
                'Category': category,
                'Field Name': field_name,
                'Expected Path': field_path,
                'Status': status,
                'Null %': null_pct,
                'Type(s)': types,
                'Sample': sample
            })
    
    return pd.DataFrame(results)

# Run validation
validation_df = validate_field_mappings(EXPECTED_FIELD_MAPPINGS, df_fields)

print("=" * 80)
print("FIELD MAPPING VALIDATION RESULTS")
print("=" * 80)
print(f"\nTotal expected fields: {len(validation_df)}")
print(f"Present: {len(validation_df[validation_df['Status'] == '‚úì Present'])}")
print(f"Missing: {len(validation_df[validation_df['Status'] == '‚ùå Missing'])}")
print(f"Similar: {len(validation_df[validation_df['Status'].str.contains('Similar', na=False)])}")

# Show missing fields
missing = validation_df[validation_df['Status'] == '‚ùå Missing']
if len(missing) > 0:
    print(f"\n‚ö†Ô∏è MISSING FIELDS ({len(missing)}):")
    print(missing[['Category', 'Field Name', 'Expected Path']].to_string(index=False))

# Show fields with high nulls
high_null_expected = validation_df[
    validation_df['Status'] == '‚úì Present'
]
if len(high_null_expected) > 0:
    high_null_expected = high_null_expected[
        high_null_expected['Null %'].str.rstrip('%').str.replace('N/A', '0').astype(float) > 50
    ]
    if len(high_null_expected) > 0:
        print(f"\n‚ö†Ô∏è EXPECTED FIELDS WITH HIGH NULL RATES ({len(high_null_expected)}):")
        print(high_null_expected[['Category', 'Field Name', 'Null %']].to_string(index=False))


FIELD MAPPING VALIDATION RESULTS

Total expected fields: 29
Present: 29
Missing: 0
Similar: 0


In [49]:
# Generate field access helper with null-safe navigation
def generate_safe_field_accessor(field_path: str, default_value="None") -> str:
    """Generate Python code for safe field access with null handling."""
    parts = field_path.split('.')
    code = "data"
    
    for part in parts:
        if '[' in part:  # Handle list access
            base, idx = part.split('[')
            idx = idx.rstrip(']')
            code = f"{code}.get('{base}', [])"
            code = f"({code}[{idx}] if len({code}) > {idx} else None)"
        else:
            code = f"{code}.get('{part}', {{}})"
    
    # Add final default value
    return f"{code} or {default_value}"

print("=" * 80)
print("SAFE FIELD ACCESSOR EXAMPLES")
print("=" * 80)
print("\nUse these patterns for null-safe field extraction:\n")

sample_fields = [
    'data.general.matchId',
    'data.header.status.utcTime',
    'data.header.teams[0].name',
    'data.content.shotmap.shots[0].expectedGoals'
]

for field in sample_fields:
    code = generate_safe_field_accessor(field)
    print(f"# {field}")
    print(f"{code}\n")


SAFE FIELD ACCESSOR EXAMPLES

Use these patterns for null-safe field extraction:

# data.general.matchId
data.get('data', {}).get('general', {}).get('matchId', {}) or None

# data.header.status.utcTime
data.get('data', {}).get('header', {}).get('status', {}).get('utcTime', {}) or None

# data.header.teams[0].name
(data.get('data', {}).get('header', {}).get('teams', [])[0] if len(data.get('data', {}).get('header', {}).get('teams', [])) > 0 else None).get('name', {}) or None

# data.content.shotmap.shots[0].expectedGoals
(data.get('data', {}).get('content', {}).get('shotmap', {}).get('shots', [])[0] if len(data.get('data', {}).get('content', {}).get('shotmap', {}).get('shots', [])) > 0 else None).get('expectedGoals', {}) or None



In [50]:
# Detect naming mismatches and suggest corrections
def find_naming_mismatches(actual_fields_df: pd.DataFrame) -> pd.DataFrame:
    """Find common naming patterns and mismatches."""
    issues = []
    
    # Common naming issues
    naming_patterns = {
        'camelCase_to_snake_case': [
            ('matchId', 'match_id'),
            ('teamId', 'team_id'),
            ('playerId', 'player_id'),
            ('homeTeam', 'home_team'),
            ('awayTeam', 'away_team'),
            ('shotType', 'shot_type'),
            ('expectedGoals', 'expected_goals'),
        ],
        'inconsistent_abbreviations': [
            ('xG', 'expected_goals'),
            ('xGOT', 'expected_goals_on_target'),
            ('mins', 'minutes'),
        ],
        'missing_prefixes': [
            ('data.header.teams', 'Should check if it contains home/away'),
            ('data.content.stats.Periods', 'May need period prefix'),
        ]
    }
    
    for pattern_type, patterns in naming_patterns.items():
        for old_name, suggested_name in patterns:
            # Find fields containing the old pattern
            matches = actual_fields_df[
                actual_fields_df['Field Path'].str.contains(old_name, case=True, na=False)
            ]
            
            if len(matches) > 0:
                for _, row in matches.iterrows():
                    issues.append({
                        'Pattern Type': pattern_type,
                        'Field Path': row['Field Path'],
                        'Contains': old_name,
                        'Suggestion': suggested_name,
                        'Current Type': row['Type(s)'],
                        'Null %': row['Null %']
                    })
    
    return pd.DataFrame(issues)

naming_issues = find_naming_mismatches(df_fields)

print("=" * 80)
print("NAMING PATTERN ANALYSIS")
print("=" * 80)

if len(naming_issues) > 0:
    print(f"\nFound {len(naming_issues)} fields with naming patterns to review:\n")
    
    # Group by pattern type
    for pattern_type in naming_issues['Pattern Type'].unique():
        subset = naming_issues[naming_issues['Pattern Type'] == pattern_type]
        print(f"\nüìã {pattern_type} ({len(subset)} fields):")
        print(subset[['Field Path', 'Contains', 'Suggestion']].head(10).to_string(index=False))
else:
    print("\n‚úì No common naming pattern issues found")


NAMING PATTERN ANALYSIS

Found 2565 fields with naming patterns to review:


üìã camelCase_to_snake_case (1860 fields):
                                                  Field Path Contains Suggestion
                                 data.content.lineup.matchId  matchId   match_id
                             data.content.matchFacts.matchId  matchId   match_id
                                        data.general.matchId  matchId   match_id
data.content.matchFacts.events.events[0].shotmapEvent.teamId   teamId    team_id
data.content.matchFacts.events.events[1].shotmapEvent.teamId   teamId    team_id
                  data.content.matchFacts.insights[0].teamId   teamId    team_id
                  data.content.matchFacts.insights[1].teamId   teamId    team_id
                  data.content.matchFacts.insights[2].teamId   teamId    team_id
             data.content.matchFacts.playerOfTheMatch.teamId   teamId    team_id
 data.content.matchFacts.topPlayers.awayTopPlayers[0].teamId   teamId

In [51]:
# Create validation helper class
validation_helper_code = '''
"""
FotMob Field Validation Helper
Auto-generated based on actual API structure
"""

from typing import Any, Dict, Optional, List
from pydantic import BaseModel, Field, field_validator, model_validator


class SafeFieldExtractor:
    """Helper class for safe field extraction from FotMob API responses."""
    
    @staticmethod
    def safe_get(data: Dict, path: str, default: Any = None) -> Any:
        """
        Safely extract nested field from dictionary using dot notation.
        
        Args:
            data: Source dictionary
            path: Dot-separated path (e.g., 'general.matchId')
            default: Default value if path not found
        
        Returns:
            Value at path or default
        """
        keys = path.split('.')
        current = data
        
        for key in keys:
            if isinstance(current, dict):
                current = current.get(key)
                if current is None:
                    return default
            elif isinstance(current, list) and key.isdigit():
                idx = int(key)
                current = current[idx] if idx < len(current) else None
                if current is None:
                    return default
            else:
                return default
        
        return current if current is not None else default
    
    @staticmethod
    def safe_get_nested(data: Dict, *keys, default: Any = None) -> Any:
        """
        Safely extract nested field using multiple keys.
        
        Example:
            safe_get_nested(data, 'general', 'homeTeam', 'id', default=0)
        """
        current = data
        for key in keys:
            if isinstance(current, dict):
                current = current.get(key)
                if current is None:
                    return default
            else:
                return default
        return current if current is not None else default


class FieldValidator:
    """Validates FotMob API responses for required fields."""
    
    REQUIRED_FIELDS = {
        'general.matchId': int,
        'general.homeTeam.id': int,
        'general.awayTeam.id': int,
        'header.status.finished': bool,
    }
    
    OPTIONAL_FIELDS = {
        'content.shotmap': dict,
        'content.lineup': dict,
        'content.playerStats': dict,
    }
    
    @classmethod
    def validate_response(cls, data: Dict[str, Any]) -> tuple[bool, List[str]]:
        """
        Validate API response has required fields.
        
        Returns:
            (is_valid, list_of_errors)
        """
        errors = []
        extractor = SafeFieldExtractor()
        
        for field_path, expected_type in cls.REQUIRED_FIELDS.items():
            value = extractor.safe_get(data, field_path)
            
            if value is None:
                errors.append(f"Missing required field: {field_path}")
            elif not isinstance(value, expected_type):
                errors.append(
                    f"Invalid type for {field_path}: "
                    f"expected {expected_type.__name__}, got {type(value).__name__}"
                )
        
        return len(errors) == 0, errors
    
    @classmethod
    def validate_and_report(cls, data: Dict[str, Any]) -> None:
        """Validate and print report."""
        is_valid, errors = cls.validate_response(data)
        
        if is_valid:
            print("‚úì Validation passed: All required fields present")
        else:
            print(f"‚ùå Validation failed with {len(errors)} errors:")
            for error in errors:
                print(f"  - {error}")


# Example usage:
"""
from fotmob_validation_helper import SafeFieldExtractor, FieldValidator

# Load match data
with open('match_data.json') as f:
    data = json.load(f)

# Validate
FieldValidator.validate_and_report(data)

# Safe extraction
extractor = SafeFieldExtractor()
match_id = extractor.safe_get(data, 'general.matchId', default=0)
home_team = extractor.safe_get_nested(data, 'general', 'homeTeam', 'name', default='Unknown')
"""
'''

# Write to file
with open('fotmob_validation_helper.py', 'w', encoding='utf-8') as f:
    f.write(validation_helper_code)

print("=" * 80)
print("‚úì Generated: fotmob_validation_helper.py")
print("=" * 80)
print("\nThis file contains:")
print("  - SafeFieldExtractor: For null-safe field access")
print("  - FieldValidator: For validating API responses")
print("\nYou can import and use in your match processor!")
print("=" * 80)


‚úì Generated: fotmob_validation_helper.py

This file contains:
  - SafeFieldExtractor: For null-safe field access
  - FieldValidator: For validating API responses

You can import and use in your match processor!


In [52]:
# Test the validation helper on actual data
print("=" * 80)
print("TESTING VALIDATION HELPER ON ACTUAL DATA")
print("=" * 80)

# Load helper
exec(validation_helper_code)

# Test on sample files
test_results = []
for i, file_path in enumerate(all_files[:5], 1):
    print(f"\nTesting file {i}: {file_path.name}")
    data = load_fotmob_json(file_path)
    
    # Extract data section
    if 'data' in data:
        test_data = data['data']
    else:
        test_data = data
    
    # Validate
    is_valid, errors = FieldValidator.validate_response(test_data)
    
    test_results.append({
        'file': file_path.name,
        'valid': is_valid,
        'errors': len(errors),
        'error_list': errors
    })
    
    if is_valid:
        print(f"  ‚úì Valid")
    else:
        print(f"  ‚ùå Invalid ({len(errors)} errors):")
        for error in errors[:3]:
            print(f"    - {error}")

print(f"\n{'=' * 80}")
print("VALIDATION SUMMARY")
print(f"{'=' * 80}")
print(f"Files tested: {len(test_results)}")
print(f"Valid: {sum(1 for r in test_results if r['valid'])}")
print(f"Invalid: {sum(1 for r in test_results if not r['valid'])}")

# Show common errors
all_errors = []
for result in test_results:
    all_errors.extend(result['error_list'])

if all_errors:
    print(f"\nMost common errors:")
    from collections import Counter
    error_counts = Counter(all_errors)
    for error, count in error_counts.most_common(5):
        print(f"  {count}x: {error}")


TESTING VALIDATION HELPER ON ACTUAL DATA

Testing file 1: match_1000002056.json.gz
  ‚ùå Invalid (1 errors):
    - Invalid type for general.matchId: expected int, got str

Testing file 2: match_1000003608.json.gz
  ‚ùå Invalid (1 errors):
    - Invalid type for general.matchId: expected int, got str

Testing file 3: match_1000003609.json.gz
  ‚ùå Invalid (1 errors):
    - Invalid type for general.matchId: expected int, got str

Testing file 4: match_4692335.json.gz
  ‚ùå Invalid (1 errors):
    - Invalid type for general.matchId: expected int, got str

Testing file 5: match_4693111.json.gz
  ‚ùå Invalid (1 errors):
    - Invalid type for general.matchId: expected int, got str

VALIDATION SUMMARY
Files tested: 5
Valid: 0
Invalid: 5

Most common errors:
  5x: Invalid type for general.matchId: expected int, got str


In [53]:
# Generate field mapping corrections for match processor
def generate_processor_fixes(validation_df: pd.DataFrame, actual_fields_df: pd.DataFrame) -> str:
    """Generate code fixes for the match processor based on validation results."""
    
    fixes = []
    fixes.append("# Suggested fixes for match_processor.py\n")
    fixes.append("# Generated based on actual FotMob API structure\n\n")
    
    # Missing fields - suggest alternatives or defaults
    missing = validation_df[validation_df['Status'] == '‚ùå Missing']
    if len(missing) > 0:
        fixes.append("# === MISSING FIELDS - ADD DEFAULT HANDLING ===\n")
        for _, row in missing.iterrows():
            field_name = row['Field Name']
            expected_path = row['Expected Path']
            
            # Suggest fix
            fixes.append(f"# Missing: {expected_path}")
            fixes.append(f'# Add: {field_name} = response_data.get("{expected_path}", None)\n')
    
    # High null fields - suggest optional handling
    high_null = validation_df[
        validation_df['Status'] == '‚úì Present'
    ]
    
    if len(high_null) > 0:
        try:
            high_null = high_null[
                pd.to_numeric(high_null['Null %'].str.rstrip('%').str.replace('N/A', '0'), errors='coerce') > 50
            ]
            
            if len(high_null) > 0:
                fixes.append("\n# === HIGH NULL RATE FIELDS - MAKE OPTIONAL ===\n")
                for _, row in high_null.iterrows():
                    field_name = row['Field Name']
                    null_pct = row['Null %']
                    fixes.append(f"# {field_name}: {null_pct} null - consider Optional type\n")
        except:
            pass
    
    # Generate safe extraction patterns
    fixes.append("\n# === SAFE EXTRACTION PATTERNS ===\n")
    fixes.append("""
def safe_extract_nested(data: dict, *keys, default=None):
    '''Safely extract nested dictionary values.'''
    current = data
    for key in keys:
        if isinstance(current, dict):
            current = current.get(key)
            if current is None:
                return default
        else:
            return default
    return current if current is not None else default

# Usage examples:
# match_id = safe_extract_nested(response_data, 'general', 'matchId', default=None)
# home_team = safe_extract_nested(response_data, 'general', 'homeTeam', 'name', default='Unknown')
""")
    
    return '\n'.join(fixes)

# Generate fixes
processor_fixes = generate_processor_fixes(validation_df, df_fields)

# Save to file
with open('match_processor_fixes.py', 'w', encoding='utf-8') as f:
    f.write(processor_fixes)

print("=" * 80)
print("‚úì Generated: match_processor_fixes.py")
print("=" * 80)
print("\nPreview of suggested fixes:\n")
print(processor_fixes[:1000])
print("\n... (see full file for all suggestions)")
print("=" * 80)


‚úì Generated: match_processor_fixes.py

Preview of suggested fixes:

# Suggested fixes for match_processor.py

# Generated based on actual FotMob API structure



# === SAFE EXTRACTION PATTERNS ===


def safe_extract_nested(data: dict, *keys, default=None):
    '''Safely extract nested dictionary values.'''
    current = data
    for key in keys:
        if isinstance(current, dict):
            current = current.get(key)
            if current is None:
                return default
        else:
            return default
    return current if current is not None else default

# Usage examples:
# match_id = safe_extract_nested(response_data, 'general', 'matchId', default=None)
# home_team = safe_extract_nested(response_data, 'general', 'homeTeam', 'name', default='Unknown')


... (see full file for all suggestions)


In [54]:
# Export comprehensive field report
comprehensive_report = {
    'validation_results': validation_df,
    'all_fields': df_fields,
    'naming_issues': naming_issues if len(naming_issues) > 0 else pd.DataFrame(),
    'test_results': pd.DataFrame(test_results) if test_results else pd.DataFrame()
}

# Save all reports to Excel
with pd.ExcelWriter('fotmob_field_report.xlsx', engine='openpyxl') as writer:
    comprehensive_report['validation_results'].to_excel(writer, sheet_name='Validation', index=False)
    comprehensive_report['all_fields'].to_excel(writer, sheet_name='All Fields', index=False)
    if len(comprehensive_report['naming_issues']) > 0:
        comprehensive_report['naming_issues'].to_excel(writer, sheet_name='Naming Issues', index=False)
    if len(comprehensive_report['test_results']) > 0:
        comprehensive_report['test_results'].to_excel(writer, sheet_name='Test Results', index=False)

print("=" * 80)
print("‚úì COMPREHENSIVE REPORT GENERATED")
print("=" * 80)
print("\nGenerated files:")
print("  1. fotmob_field_analysis.csv - All field details")
print("  2. fotmob_field_report.xlsx - Multi-sheet Excel report")
print("  3. fotmob_validation_helper.py - Python validation utilities")
print("  4. match_processor_fixes.py - Suggested code fixes")
print("\n" + "=" * 80)
print("SUMMARY OF ISSUES FOUND")
print("=" * 80)

# Summary statistics
total_expected = len(validation_df)
present = len(validation_df[validation_df['Status'] == '‚úì Present'])
missing = len(validation_df[validation_df['Status'] == '‚ùå Missing'])
similar = len(validation_df[validation_df['Status'].str.contains('Similar', na=False)])

print(f"\nüìä Field Coverage:")
print(f"  Expected fields: {total_expected}")
print(f"  Present: {present} ({present/total_expected*100:.1f}%)")
print(f"  Missing: {missing} ({missing/total_expected*100:.1f}%)")
print(f"  Similar found: {similar}")

print(f"\nüìã Naming Issues:")
print(f"  Potential naming mismatches: {len(naming_issues)}")

print(f"\n‚úÖ Validation Tests:")
if test_results:
    valid_count = sum(1 for r in test_results if r['valid'])
    print(f"  Files passed: {valid_count}/{len(test_results)}")
    print(f"  Files failed: {len(test_results) - valid_count}/{len(test_results)}")
else:
    print(f"  No tests run")

print("\n" + "=" * 80)
print("Next Steps:")
print("  1. Review fotmob_field_report.xlsx for detailed analysis")
print("  2. Apply fixes from match_processor_fixes.py")
print("  3. Use fotmob_validation_helper.py in your code")
print("  4. Run validation on all files before processing")
print("=" * 80)


‚úì COMPREHENSIVE REPORT GENERATED

Generated files:
  1. fotmob_field_analysis.csv - All field details
  2. fotmob_field_report.xlsx - Multi-sheet Excel report
  3. fotmob_validation_helper.py - Python validation utilities
  4. match_processor_fixes.py - Suggested code fixes

SUMMARY OF ISSUES FOUND

üìä Field Coverage:
  Expected fields: 29
  Present: 29 (100.0%)
  Missing: 0 (0.0%)
  Similar found: 0

üìã Naming Issues:
  Potential naming mismatches: 2565

‚úÖ Validation Tests:
  Files passed: 0/5
  Files failed: 5/5

Next Steps:
  1. Review fotmob_field_report.xlsx for detailed analysis
  2. Apply fixes from match_processor_fixes.py
  3. Use fotmob_validation_helper.py in your code
  4. Run validation on all files before processing
