# Tennis Game

## Data Reading 

### Step 1
Manually extract 202405.zip.

### Step 2

Extract the 31 zip files found inside the main directory into a separate location.

In [None]:
import zipfile
import os

# Directory that is scanned for zip archives
zip_files_directory = '../data/rawTest'

# Directory that receives one sub-directory per extracted archive
extract_to_directory = '../data/raw'

# Unpack every zip archive into its own sub-directory named after the archive
for filename in os.listdir(zip_files_directory):
    if not filename.endswith('.zip'):
        continue  # anything that is not a zip archive is ignored
    zip_file_path = os.path.join(zip_files_directory, filename)
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        # "name.zip" is extracted into "<extract_to_directory>/name"
        sub_directory = os.path.join(extract_to_directory, os.path.splitext(filename)[0])
        # exist_ok=True creates the folder (and missing parents) without a
        # separate existence check, so re-running the cell is safe
        os.makedirs(sub_directory, exist_ok=True)
        zip_ref.extractall(sub_directory)

print("All files have been extracted.")

### Step 3
Create a dictionary of empty DataFrames,
import the required libraries, and
read the Parquet files into the DataFrames.

In [None]:
import os
import pandas as pd
import pyarrow.parquet as pq

# Root folder under which the extracted archive contents are searched
extracted_files_directory = '../data/raw'

# One empty DataFrame per table; each key gets its own DataFrame object and
# the keys keep the order in which they are listed here
data_frames = {
    table_name: pd.DataFrame()
    for table_name in (
        'MatchEventInfo',
        'PeriodInfo',
        'MatchVotesInfo',
        'MatchTournamentInfo',
        'MatchSeasonInfo',
        'MatchRoundInfo',
        'MatchVenueInfo',
        'MatchHomeTeamInfo',
        'MatchAwayTeamInfo',
        'MatchHomeScoreInfo',
        'MatchAwayScoreInfo',
        'MatchTimeInfo',
        'GameInfo',
        'OddsInfo',
        'PowerInfo',
    )
}

# Route one Parquet file to its table and append the file's rows to that table
def process_parquet_file(parquet_file_path, data_frames):
    """Append the rows of one Parquet file to the matching DataFrame.

    The target table is selected from the prefix of the file's base name.
    A file whose name matches none of the known prefixes is reported on
    stdout and skipped without being read.

    Args:
        parquet_file_path: Path of the Parquet file to load.
        data_frames: Dictionary of DataFrames keyed by table name. The entry
            of the matching table is replaced by the concatenation of its
            current content and the newly read rows (index is renumbered).

    Returns:
        None. The result is stored in ``data_frames``.
    """
    # (file-name prefix, table key) pairs, checked from top to bottom
    prefix_table = (
        ('event', 'MatchEventInfo'),
        ('pbp', 'PeriodInfo'),
        ('votes', 'MatchVotesInfo'),
        ('tournament', 'MatchTournamentInfo'),
        ('season', 'MatchSeasonInfo'),
        ('round', 'MatchRoundInfo'),
        ('venue', 'MatchVenueInfo'),
        ('home_team_1', 'MatchHomeTeamInfo'),
        ('away_team_1', 'MatchAwayTeamInfo'),
        ('home_team_score', 'MatchHomeScoreInfo'),
        ('away_team_score', 'MatchAwayScoreInfo'),
        ('time', 'MatchTimeInfo'),
        ('statistics', 'GameInfo'),
        ('odds', 'OddsInfo'),
        ('power', 'PowerInfo'),
    )

    # Only the base name decides the table, not the folders above it
    file_name = os.path.basename(parquet_file_path)

    for prefix, df_key in prefix_table:
        if file_name.startswith(prefix):
            break
    else:
        # No prefix matched: leave data_frames untouched
        print(f"Unknown file prefix for {file_name}, skipping file.")
        return

    loaded_rows = pd.read_parquet(parquet_file_path)

    data_frames[df_key] = pd.concat([data_frames[df_key], loaded_rows], ignore_index=True)
    print(f"Appended data from {file_name} to {df_key}")

# Load every ".parquet" file located below a folder whose name ends in
# "_parquet", searching the whole tree under the extraction root
for root, dirs, files in os.walk(extracted_files_directory):
    parquet_dirs = [dir_name for dir_name in dirs if dir_name.endswith('_parquet')]
    for dir_name in parquet_dirs:
        parquet_folder_path = os.path.join(root, dir_name)
        # The inner walk covers the complete sub-tree of this "_parquet" folder
        for root2, dirs2, files2 in os.walk(parquet_folder_path):
            for file_name in files2:
                if file_name.endswith('.parquet'):
                    parquet_file_path = os.path.join(root2, file_name)
                    process_parquet_file(parquet_file_path, data_frames)
    # Prune the folders handled above so the outer walk does not descend into
    # them again: that second visit is wasted work and would append the files
    # of any nested "*_parquet" folder a second time
    dirs[:] = [dir_name for dir_name in dirs if not dir_name.endswith('_parquet')]

# Verify the data
for key, df in data_frames.items():
    print(f"{key} DataFrame shape: {df.shape}")


### Step 4
Save all DataFrames in CSV format.

In [None]:
# Write every collected table to "<output_directory>/<table name>.csv"
output_directory='../data/processed'
# Create the target folder first; to_csv does not create missing directories
os.makedirs(output_directory, exist_ok=True)
for key, unique_data in data_frames.items():
    file_path = os.path.join(output_directory, f"{key}.csv")
    # index=False keeps the DataFrame's row index out of the file
    unique_data.to_csv(file_path, index=False)
    print(f"Saved {key} to {file_path}")