# Adding paths and files

In [13]:
import os
import sys

sys.path.append(os.path.join(os.getcwd(), '..', '..'))
import config,utils

# HDFS directory that holds the raw per-match CSV files
directory = os.path.join(config.RAW_DATA_DIR, 't20s_csv2')
print(directory)
from hdfs import InsecureClient
client = InsecureClient(config.HDFS_URL, user=config.HDFS_USER)

# From the directory listing keep only entries whose name contains 'info'
all_files = client.list(directory)
info_files = [os.path.join(directory, name) for name in all_files if 'info' in name]

# Bare file name (last '/'-separated component) of every info file
matches = [path.split('/')[-1] for path in info_files]

# The text before the first underscore of each file name is used as the match ID
match_ids = [name.split('_')[0] for name in matches]

/usr/ravi/t20/data/1_rawData/t20s_csv2
[[34m2024-11-19T00:52:54.399+0530[0m] {[34mclient.py:[0m192} INFO[0m - Instantiated <InsecureClient(url='http://192.168.245.142:9870')>.[0m
[[34m2024-11-19T00:52:54.400+0530[0m] {[34mclient.py:[0m1116} INFO[0m - Listing '/usr/ravi/t20/data/1_rawData/t20s_csv2'.[0m


In [16]:
import pandas as pd
import io
def process_single_match(client, match_id: str) -> "pd.DataFrame | None":
    """
    Read one match's info CSV from HDFS and return its key attributes as a one-row DataFrame.

    The file is parsed as headerless rows named ``col1, attributes, values,
    players, code``; the ``attributes``/``values`` pairs are then transposed so
    that every attribute becomes a column.

    Args:
        client: HDFS client. Only ``client.read(path)`` is used, as a context
            manager yielding an object whose ``read()`` returns bytes.
        match_id: Unique ID of the match; the file read is
            ``<config.RAW_DATA_DIR>/t20s_csv2/<match_id>_info.csv``.

    Returns:
        A single-row DataFrame with the columns ``match_id``, ``team``
        (repeated, see the note in the body), ``gender``, ``season`` and
        ``winner``, or None if anything fails. A file without a ``winner``
        attribute counts as a failure. Errors are reported on stdout and are
        never raised to the caller.
    """
    try:
        info_path = os.path.join(config.RAW_DATA_DIR, 't20s_csv2', f'{match_id}_info.csv')
        with client.read(info_path) as reader:
            data = reader.read()
        match_df = pd.read_csv(
            io.StringIO(data.decode('utf-8')),
            header=None,
            names=['col1', 'attributes', 'values', 'players', 'code'],
        )
        # Keep only the attribute/value pairs and transpose: the 'attributes'
        # row becomes the header, the 'values' row is the data.
        match_df = match_df.drop(columns=['col1', 'players', 'code']).T
        match_df.columns = match_df.iloc[0]
        match_df['match_id'] = match_id
        # NOTE: 'team' occurs twice in the header (one row per side), and it is
        # requested twice here, so the result carries FOUR 'team' columns
        # (team1, team2, team1, team2). Callers that rename columns by position
        # rely on this layout, so it is kept as is.
        # A missing attribute (e.g. no 'winner') raises KeyError here and is
        # reported through the handler below.
        match_df = match_df[['match_id', 'team', 'team', 'gender', 'season', 'winner']].drop('attributes')
        return match_df.reset_index(drop=True)
    except Exception as e:
        # This function runs in worker threads. Emitting the message and its
        # newline in one write keeps lines from different threads from being
        # glued together in the output.
        print(f"Error processing match {match_id}: {e}\n", end="")
        return None

In [20]:
from tqdm import tqdm
import concurrent.futures
import logging
logging.basicConfig(level=logging.ERROR)

# Only let CRITICAL records from the HDFS client logger through
logging.getLogger('hdfs.client').setLevel(logging.CRITICAL)

recalculated_matches = []  # one-row DataFrames of matches that were processed
injured_matches = []       # IDs of matches that could not be processed

# Fan the per-match work out over a thread pool
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Map each submitted future back to the match ID it is working on
    futures = {}
    for match_id in match_ids:
        futures[executor.submit(process_single_match, client, match_id)] = match_id

    # Collect results in completion order, with a tqdm progress bar
    completed = concurrent.futures.as_completed(futures)
    for future in tqdm(completed, total=len(futures), desc="Processing Matches"):
        match = futures[future]
        try:
            result = future.result()
        except Exception as e:
            print(f"Error in future for match {match}: {e}")
            injured_matches.append(match)
            continue

        # The worker signals a handled failure by returning None
        if result is None:
            injured_matches.append(match)
        else:
            recalculated_matches.append(result)

# Nothing to concatenate: stop here with an explicit error
if len(recalculated_matches) == 0:
    print('No matches were successfully processed.')
    raise ValueError('No matches were successfully processed.')

matches_data = pd.concat(recalculated_matches, ignore_index=True)
matches_data

Processing Matches:   1%|          | 46/3800 [00:00<00:18, 207.31it/s]

Error processing match 1041617: "['winner'] not in index"
Error processing match 1115799: "['winner'] not in index"


Processing Matches:   3%|▎         | 106/3800 [00:00<00:20, 179.52it/s]

Error processing match 1123209: "['winner'] not in index"


Processing Matches:   4%|▍         | 162/3800 [00:00<00:22, 163.79it/s]

Error processing match 1141835: "['winner'] not in index"
Error processing match 1142504: "['winner'] not in index"
Error processing match 1144172: "['winner'] not in index"
Error processing match 1144991: "['winner'] not in index"


Processing Matches:   6%|▌         | 217/3800 [00:01<00:21, 165.92it/s]

Error processing match 1147733: "['winner'] not in index"


Processing Matches:   7%|▋         | 269/3800 [00:01<00:22, 155.87it/s]

Error processing match 1157710: "['winner'] not in index"


Processing Matches:   8%|▊         | 319/3800 [00:01<00:22, 152.99it/s]

Error processing match 1172470: "['winner'] not in index"


Processing Matches:   9%|▉         | 355/3800 [00:02<00:21, 161.65it/s]

Error processing match 1173066: "['winner'] not in index"
Error processing match 1176795: "['winner'] not in index"
Error processing match 1177484: "['winner'] not in index"


Processing Matches:  10%|▉         | 372/3800 [00:02<00:21, 161.62it/s]



Processing Matches:  11%|█         | 406/3800 [00:02<00:21, 154.39it/s]

Error processing match 1183527: "['winner'] not in index"Error processing match 1183544: "['winner'] not in index"



Processing Matches:  11%|█         | 424/3800 [00:02<00:21, 158.49it/s]

Error processing match 1185187: "['winner'] not in index"


Processing Matches:  12%|█▏        | 440/3800 [00:02<00:22, 148.98it/s]

Error processing match 1186492: "['winner'] not in index"
Error processing match 1187669: "['winner'] not in index"
Error processing match 1187680: "['winner'] not in index"
Error processing match 1187679: "['winner'] not in index"
Error processing match 1188380: "['winner'] not in index"


Processing Matches:  12%|█▏        | 458/3800 [00:02<00:22, 151.79it/s]

Error processing match 1190607: "['winner'] not in index"


Processing Matches:  13%|█▎        | 493/3800 [00:03<00:21, 156.00it/s]

Error processing match 1190751: "['winner'] not in index"


Processing Matches:  14%|█▍        | 545/3800 [00:03<00:20, 159.15it/s]

Error processing match 1197398: "['winner'] not in index"


Processing Matches:  15%|█▌        | 578/3800 [00:03<00:21, 151.62it/s]

Error processing match 1198244: "['winner'] not in index"


Processing Matches:  18%|█▊        | 668/3800 [00:04<00:19, 163.92it/s]

Error processing match 1203677: "['winner'] not in index"


Processing Matches:  19%|█▉        | 740/3800 [00:04<00:17, 171.89it/s]

Error processing match 1233956: "['winner'] not in index"


Processing Matches:  20%|██        | 776/3800 [00:04<00:18, 165.78it/s]

Error processing match 1249240: "['winner'] not in index"
Error processing match 1257949: "['winner'] not in index"


Processing Matches:  22%|██▏       | 832/3800 [00:05<00:17, 171.41it/s]

Error processing match 1263164: "['winner'] not in index"
Error processing match 1263166: "['winner'] not in index"
Error processing match 1263573: "['winner'] not in index"
Error processing match 1263167: "['winner'] not in index"
Error processing match 1263621: "['winner'] not in index"
Error processing match 1263472: "['winner'] not in index"
Error processing match 1267311: "['winner'] not in index"


Processing Matches:  23%|██▎       | 888/3800 [00:05<00:16, 175.97it/s]

Error processing match 1273415: "['winner'] not in index"
Error processing match 1278691: "['winner'] not in index"
Error processing match 1286674: "['winner'] not in index"
Error processing match 1298152: "['winner'] not in index"


Processing Matches:  36%|███▌      | 1374/3800 [00:06<00:08, 277.83it/s] 

Error processing match 1317149: "['winner'] not in index"
Error processing match 1317635: "['winner'] not in index"
Error processing match 1317488: "['winner'] not in index"
Error processing match 1317639: "['winner'] not in index"
Error processing match 1320193: "['winner'] not in index"
Error processing match 1320192: "['winner'] not in index"
Error processing match 1320210: "['winner'] not in index"


Processing Matches:  41%|████      | 1542/3800 [00:07<00:11, 204.73it/s]

Error processing match 1322277: "['winner'] not in index"
Error processing match 1322362: "['winner'] not in index"


Processing Matches:  43%|████▎     | 1634/3800 [00:08<00:12, 174.45it/s]

Error processing match 1334418: "['winner'] not in index"


Processing Matches:  44%|████▍     | 1689/3800 [00:08<00:12, 173.64it/s]

Error processing match 1336982: "['winner'] not in index"
Error processing match 1339617: "['winner'] not in index"


Processing Matches:  45%|████▌     | 1727/3800 [00:08<00:12, 166.77it/s]

Error processing match 1343742: "['winner'] not in index"
Error processing match 1343746: "['winner'] not in index"
Error processing match 1343743: "['winner'] not in index"
Error processing match 1343747: "['winner'] not in index"
Error processing match 1343758: "['winner'] not in index"
Error processing match 1343767: "['winner'] not in index"


Processing Matches:  46%|████▋     | 1763/3800 [00:09<00:12, 164.53it/s]

Error processing match 1343790: "['winner'] not in index"
Error processing match 1344515: "['winner'] not in index"
Error processing match 1345425: "['winner'] not in index"


Processing Matches:  48%|████▊     | 1837/3800 [00:09<00:11, 170.42it/s]

Error processing match 1349127: "['winner'] not in index"
Error processing match 1349387: "['winner'] not in index"
Error processing match 1349389: "['winner'] not in index"


Processing Matches:  49%|████▉     | 1872/3800 [00:09<00:11, 161.99it/s]

Error processing match 1354799: "['winner'] not in index"


Processing Matches:  50%|█████     | 1909/3800 [00:09<00:11, 166.75it/s]

Error processing match 1370791: "['winner'] not in index"


Processing Matches:  51%|█████     | 1943/3800 [00:10<00:11, 165.20it/s]



Processing Matches:  52%|█████▏    | 1960/3800 [00:10<00:11, 160.28it/s]

Error processing match 1380586: "['winner'] not in index"


Processing Matches:  55%|█████▌    | 2101/3800 [00:11<00:10, 166.35it/s]

Error processing match 1388214: "['winner'] not in index"


Processing Matches:  56%|█████▌    | 2134/3800 [00:11<00:10, 153.36it/s]

Error processing match 1391709: "['winner'] not in index"Error processing match 1392352: "['winner'] not in index"



Processing Matches:  58%|█████▊    | 2186/3800 [00:11<00:09, 161.77it/s]

Error processing match 1394770: "['winner'] not in index"


Processing Matches:  59%|█████▉    | 2255/3800 [00:12<00:09, 158.37it/s]

Error processing match 1398255: "['winner'] not in index"
Error processing match 1399055: "['winner'] not in index"


Processing Matches:  61%|██████    | 2309/3800 [00:12<00:09, 164.60it/s]

Error processing match 1400975: "['winner'] not in index"


Processing Matches:  62%|██████▏   | 2362/3800 [00:12<00:08, 163.73it/s]

Error processing match 1405327: "['winner'] not in index"
Error processing match 1407094: "['winner'] not in index"


Processing Matches:  64%|██████▍   | 2439/3800 [00:13<00:07, 177.34it/s]

Error processing match 1412534: "['winner'] not in index"
Error processing match 1415703: "['winner'] not in index"
Error processing match 1415711: "['winner'] not in index"
Error processing match 1415706: "['winner'] not in index"


Processing Matches:  68%|██████▊   | 2566/3800 [00:13<00:07, 169.30it/s]

Error processing match 1422042: "['winner'] not in index"
Error processing match 1422804: "['winner'] not in index"
Error processing match 1423460: "['winner'] not in index"
Error processing match 1423440: "['winner'] not in index"


Processing Matches:  68%|██████▊   | 2602/3800 [00:14<00:07, 165.39it/s]

Error processing match 1423474: "['winner'] not in index"
Error processing match 1424829: "['winner'] not in index"


Processing Matches:  70%|███████   | 2675/3800 [00:14<00:06, 165.05it/s]

Error processing match 1426049: "['winner'] not in index"


Processing Matches:  73%|███████▎  | 2769/3800 [00:15<00:06, 168.90it/s]

Error processing match 1431122: "['winner'] not in index"
Error processing match 1432196: "['winner'] not in index"


Processing Matches:  74%|███████▍  | 2821/3800 [00:15<00:06, 162.44it/s]

Error processing match 1434292: "['winner'] not in index"


Processing Matches:  77%|███████▋  | 2943/3800 [00:16<00:05, 165.68it/s]

Error processing match 1442989: "['winner'] not in index"


Processing Matches:  78%|███████▊  | 2977/3800 [00:16<00:05, 158.52it/s]

Error processing match 1444549: "['winner'] not in index"
Error processing match 1446762: "['winner'] not in index"


Processing Matches:  80%|███████▉  | 3027/3800 [00:16<00:04, 157.38it/s]

Error processing match 1447497: "['winner'] not in index"


Processing Matches:  82%|████████▏ | 3133/3800 [00:17<00:04, 161.58it/s]

Error processing match 1453519: "['winner'] not in index"


Processing Matches:  84%|████████▍ | 3183/3800 [00:17<00:04, 153.27it/s]

Error processing match 237242: "['winner'] not in index"
Error processing match 287862: "['winner'] not in index"


Processing Matches:  86%|████████▌ | 3259/3800 [00:18<00:03, 174.32it/s]

Error processing match 350050: "['winner'] not in index"


Processing Matches:  87%|████████▋ | 3296/3800 [00:18<00:02, 173.76it/s]

Error processing match 366707: "['winner'] not in index"
Error processing match 412681: "['winner'] not in index"


Processing Matches:  88%|████████▊ | 3331/3800 [00:18<00:02, 163.88it/s]

Error processing match 423788: "['winner'] not in index"


Processing Matches:  89%|████████▉ | 3380/3800 [00:19<00:02, 145.60it/s]

Error processing match 527683: "['winner'] not in index"
Error processing match 533284: "['winner'] not in index"
Error processing match 533282: "['winner'] not in index"
Error processing match 533292: "['winner'] not in index"


Processing Matches:  90%|█████████ | 3437/3800 [00:19<00:02, 163.33it/s]

Error processing match 534234: "['winner'] not in index"
Error processing match 566927: "['winner'] not in index"


Processing Matches:  92%|█████████▏| 3491/3800 [00:19<00:01, 163.63it/s]

Error processing match 571149: "['winner'] not in index"
Error processing match 582186: "['winner'] not in index"


Processing Matches:  93%|█████████▎| 3524/3800 [00:19<00:01, 149.83it/s]

Error processing match 640955: "['winner'] not in index"


Processing Matches:  96%|█████████▌| 3630/3800 [00:20<00:01, 162.84it/s]

Error processing match 730293: "['winner'] not in index"


Processing Matches:  98%|█████████▊| 3714/3800 [00:21<00:00, 156.45it/s]

Error processing match 902653: "['winner'] not in index"
Error processing match 951319: "['winner'] not in index"


Processing Matches: 100%|██████████| 3800/3800 [00:21<00:00, 175.87it/s]


attributes,match_id,team,team.1,team.2,team.3,gender,season,winner
0,1001349,Australia,Sri Lanka,Australia,Sri Lanka,male,2016/17,Sri Lanka
1,1007657,Zimbabwe,India,Zimbabwe,India,male,2016,India
2,1001351,Australia,Sri Lanka,Australia,Sri Lanka,male,2016/17,Sri Lanka
3,1001353,Australia,Sri Lanka,Australia,Sri Lanka,male,2016/17,Australia
4,1004729,Ireland,Hong Kong,Ireland,Hong Kong,male,2016,Hong Kong
...,...,...,...,...,...,...,...,...
3682,966739,Hong Kong,United Arab Emirates,Hong Kong,United Arab Emirates,male,2015/16,United Arab Emirates
3683,966753,Bangladesh,Sri Lanka,Bangladesh,Sri Lanka,male,2015/16,Bangladesh
3684,966763,Pakistan,Sri Lanka,Pakistan,Sri Lanka,male,2015/16,Pakistan
3685,966765,Bangladesh,India,Bangladesh,India,male,2015/16,India


In [21]:
# Guard: nothing to consolidate. ValueError matches the identical check made
# right after the concurrent processing step.
if not recalculated_matches:
    print('No matches were successfully processed.')
    raise ValueError('No matches were successfully processed.')

# Concatenate all match DataFrames
matches_data = pd.concat(recalculated_matches, ignore_index=True)

# process_single_match returns the two 'team' columns twice (four in total), so
# the columns are renamed by position and the repeated pair is dropped.
# Assigning eight names raises if the frame does not have exactly eight columns.
matches_data.columns = ['match_id', 'team1', 'team2', 'team1_duplicate', 'team2_duplicate', 'gender', 'season', 'winner']
matches_data = matches_data.drop(columns=['team1_duplicate', 'team2_duplicate'])

# Data quality checks
if matches_data.empty:
    print("No match data consolidated.")
    raise ValueError("Consolidated match data is empty.")
required_columns = ["match_id", "team1", "team2", "gender", "season", "winner"]
missing_columns = [col for col in required_columns if col not in matches_data.columns]
if missing_columns:
    print(f"Missing columns in matches data: {missing_columns}")
    raise ValueError(f"Missing columns in matches data: {missing_columns}")

# Save matches_data directly to HDFS
utils.ensure_hdfs_directory(client, config.PROCESSED_DATA_DIR)
matches_csv_path = os.path.join(config.PROCESSED_DATA_DIR, 'matches.csv')
csv_data = matches_data.to_csv(index=False)
# csv_data is a str: name the encoding explicitly so the bytes written do not
# depend on how the underlying HTTP stack encodes text bodies.
client.write(matches_csv_path, data=csv_data, overwrite=True, encoding='utf-8')
print('Matches data processing and saving completed successfully.')

print(f'Successfully processed matches: {len(recalculated_matches)}')
print(f'Failed matches: {len(injured_matches)}')

Matches data processing and saving completed successfully.
Successfully processed matches: 3687
Failed matches: 113
