# Adding paths and files

In [22]:
import os
import glob
import polars as pl

# Specify the directory where your CSV files are located
directory = r'D:\github\localt20\data\t20s_csv2'

# Use glob to find all CSV files in the specified directory
info_files = glob.glob(os.path.join(directory, '*_info.csv'))
all_files = glob.glob(os.path.join(directory,'*.csv'))
delivery_files = [file for file in all_files if '_info' not in file]

matches=[]
deliveries=[]
# Print the list of CSV files
for info_file in info_files:
    matches.append(info_file.split('\\')[-1])
for delivery in delivery_files:
    if '_info' not in delivery:
        deliveries.append(delivery.split('\\')[-1])

# Extracting player ids

In [23]:
import polars as pl
import os

# Initialize an empty list to hold the DataFrames and another for injured matches
dataframes = []
injured_matches = []

# Loop through the matches list
for match in matches:
    # Extract the match_id from the filename (e.g., '1001349_info.csv' -> '1001349')
    match_id = match.split('_')[0]
    
    # Construct the full file path
    file_path = os.path.join(directory, match)
    
    try:
        # Try to load the CSV and rename the columns
        df = pl.read_csv(file_path, skip_rows=43, has_header=False)
        df = df.rename({
            'column_1': 'info',
            'column_2': 'registry',
            'column_3': 'people',
            'column_4': 'player',
            'column_5': 'player_id'
        })
        
        # Add the match_id as a new column to the DataFrame
        df = df.with_columns(pl.lit(match_id).alias('match_id').cast(pl.Int64))
        
        # Append the DataFrame to the list
        dataframes.append(df)
    
    except Exception as e:
        # If an error occurs, print the error and append the match to injured_matches
        injured_matches.append(match)

# Combine all DataFrames into one DataFrame if there are any successful matches
if dataframes:
    playerid_df = pl.concat(dataframes)
    print("Combined DataFrame:")
else:
    print("No valid dataframes to combine.")

# Show the list of files that couldn't be processed
print(f"Injured matches (failed to process): {len(injured_matches),injured_matches}")
playerid_df

Combined DataFrame:
Injured matches (failed to process): (73, ['1040487_info.csv', '1043991_info.csv', '1072316_info.csv', '1089777_info.csv', '1119501_info.csv', '1144174_info.csv', '1144990_info.csv', '1146723_info.csv', '1146725_info.csv', '1150539_info.csv', '1188622_info.csv', '1216416_info.csv', '1223952_info.csv', '1233954_info.csv', '1233980_info.csv', '1244848_info.csv', '1249207_info.csv', '1260097_info.csv', '1263471_info.csv', '1298154_info.csv', '1298163_info.csv', '1298169_info.csv', '1298170_info.csv', '1322004_info.csv', '1335788_info.csv', '1335790_info.csv', '1335802_info.csv', '1336076_info.csv', '1336080_info.csv', '1338057_info.csv', '1355720_info.csv', '1362817_info.csv', '1377016_info.csv', '1382164_info.csv', '1384634_info.csv', '1387598_info.csv', '1388198_info.csv', '1388204_info.csv', '1388215_info.csv', '1388225_info.csv', '1392803_info.csv', '1398278_info.csv', '1415734_info.csv', '1415744_info.csv', '1415750_info.csv', '1415752_info.csv', '1416079_info.csv

info,registry,people,player,player_id,match_id
str,str,str,str,str,i64
"""info""","""registry""","""people""","""A Zampa""","""14f96089""",1001349
"""info""","""registry""","""people""","""AJ Finch""","""b8d490fd""",1001349
"""info""","""registry""","""people""","""AJ Turner""","""ff1e12a0""",1001349
"""info""","""registry""","""people""","""AJ Tye""","""7c7d63a2""",1001349
"""info""","""registry""","""people""","""B Stanlake""","""6834d1f2""",1001349
…,…,…,…,…,…
"""info""","""registry""","""people""","""SMSM Senanayake""","""4c4fa80b""",995469
"""info""","""registry""","""people""","""SS Pathirana""","""753c95b9""",995469
"""info""","""registry""","""people""","""TM Dilshan""","""5bdcdb72""",995469
"""info""","""registry""","""people""","""TM Head""","""12b610c2""",995469


# Extracting player country

In [25]:
import polars as pl
import os


# Initialize a list to hold DataFrames
dataframes = []
non_injured_matches = [match for match in matches if match not in injured_matches]

# Loop through the match files
for match in non_injured_matches:
    # Extract the match_id from the filename (e.g., '1001349_info.csv' -> '1001349')
    match_id = match.split('_')[0]
    
    # Construct the full file path
    file_path = os.path.join(directory, match)
    
    try:
        # Read the CSV, skip 21 rows, and take the first 22 rows
        df = pl.read_csv(file_path, truncate_ragged_lines=True, skip_rows=21, has_header=False)[:22]
        
        # Rename the columns (adjust based on your schema)
        df = df.rename({
            'column_1': 'info',
            'column_2': 'registry',
            'column_3': 'country',
            'column_4': 'player',
            # You mentioned 'match_id' should be added, not present in file
        })
        
        # Add the match_id as a new column to the DataFrame
        df = df.with_columns(pl.lit(match_id).alias('match_id').cast(pl.Int64))
        
        # Select only the relevant columns (modify as needed)
        df = df.select(['player', 'country', 'match_id'])
        
        # Append the processed DataFrame to the list
        dataframes.append(df)
    
    except Exception as e:
        # Handle errors during processing
        print(f"Error processing file {match}: {e}")

# Combine all DataFrames if there are any valid ones
if dataframes:
    final_df = pl.concat(dataframes)
    print("Combined DataFrame:")
    print(final_df)
else:
    print("No valid dataframes to combine.")

Combined DataFrame:
shape: (81_950, 3)
┌─────────────────┬───────────┬──────────┐
│ player          ┆ country   ┆ match_id │
│ ---             ┆ ---       ┆ ---      │
│ str             ┆ str       ┆ i64      │
╞═════════════════╪═══════════╪══════════╡
│ AJ Finch        ┆ Australia ┆ 1001349  │
│ M Klinger       ┆ Australia ┆ 1001349  │
│ TM Head         ┆ Australia ┆ 1001349  │
│ MC Henriques    ┆ Australia ┆ 1001349  │
│ AJ Turner       ┆ Australia ┆ 1001349  │
│ …               ┆ …         ┆ …        │
│ NLTC Perera     ┆ Sri Lanka ┆ 995469   │
│ SS Pathirana    ┆ Sri Lanka ┆ 995469   │
│ S Prasanna      ┆ Sri Lanka ┆ 995469   │
│ SMSM Senanayake ┆ Sri Lanka ┆ 995469   │
│ RAS Lakmal      ┆ Sri Lanka ┆ 995469   │
└─────────────────┴───────────┴──────────┘


# Combining players name, Id, Country

In [26]:
df = final_df.join(playerid_df, on=['match_id','player'], how='inner').drop('people_right','info','registry').select(['match_id','player','country','player_id'])
df

match_id,player,country,player_id
i64,str,str,str
1001349,"""A Zampa""","""Australia""","""14f96089"""
1001349,"""AJ Finch""","""Australia""","""b8d490fd"""
1001349,"""AJ Turner""","""Australia""","""ff1e12a0"""
1001349,"""AJ Tye""","""Australia""","""7c7d63a2"""
1001349,"""B Stanlake""","""Australia""","""6834d1f2"""
…,…,…,…
995469,"""SMSM Senanayake""","""Sri Lanka""","""4c4fa80b"""
995469,"""SS Pathirana""","""Sri Lanka""","""753c95b9"""
995469,"""TM Dilshan""","""Sri Lanka""","""5bdcdb72"""
995469,"""TM Head""","""Australia""","""12b610c2"""


In [27]:
df.write_csv("../../processedData/Matchplayers.csv")

# Individual player's data

In [29]:
players = df.drop('match_id').select('player','country','player_id').unique()
players

player,country,player_id
str,str,str
"""E Frimpong""","""Ghana""","""3c13fc3b"""
"""L Rika""","""Fiji""","""fe12944e"""
"""B George""","""Malta""","""1bdbf53b"""
"""WB Rankin""","""Ireland""","""29b89ae8"""
"""V Phiri""","""Malawi""","""8ffa1b3c"""
…,…,…
"""B Frank""","""Nigeria""","""040206a2"""
"""XM Marshall""","""West Indies""","""ffb504b1"""
"""T Vanuarua""","""Cook Islands""","""1f8b4fec"""
"""Nary Thapa""","""Nepal""","""d24e69f5"""


In [30]:
players.write_csv("../../processedData/Players.csv")