# Adding paths and files

In [3]:
import os
import sys

sys.path.append(os.path.join(os.getcwd(), "..", "..", ".."))
from configs import spark_config as config
from utils import spark_utils as utils

# Specify the directory where your CSV files are located
directory = os.path.join(config.RAW_DATA_DIR, "t20s_csv2")
print(directory)

client = utils.get_hdfs_client()

all_files = client.list(directory)
info_files = [os.path.join(directory, file) for file in all_files if "info" in file]

matches = []
# Print the list of CSV files
for info_file in info_files:
    matches.append(info_file.split("/")[-1])

match_ids = []
for csv_file in matches:
    match_ids.append(csv_file.split("_")[0])

/usr/ravi/t20/data/1_rawData/t20s_csv2
[[34m2024-11-24T13:53:58.395+0530[0m] {[34mbase.py:[0m84} INFO[0m - Retrieving connection 'webhdfs_default'[0m
[[34m2024-11-24T13:53:58.397+0530[0m] {[34mwebhdfs.py:[0m82} INFO[0m - Trying to connect to 192.168.245.142:9870[0m
[[34m2024-11-24T13:53:58.398+0530[0m] {[34mwebhdfs.py:[0m86} INFO[0m - Trying namenode 192.168.245.142[0m
[[34m2024-11-24T13:53:58.401+0530[0m] {[34mclient.py:[0m192} INFO[0m - Instantiated <InsecureClient(url='http://192.168.245.142:9870/')>.[0m
[[34m2024-11-24T13:53:58.403+0530[0m] {[34mclient.py:[0m320} INFO[0m - Fetching status for '/'.[0m
[[34m2024-11-24T13:53:58.433+0530[0m] {[34mwebhdfs.py:[0m96} INFO[0m - Using namenode 192.168.245.142 for hook[0m
[[34m2024-11-24T13:53:58.435+0530[0m] {[34mclient.py:[0m1116} INFO[0m - Listing '/usr/ravi/t20/data/1_rawData/t20s_csv2'.[0m


In [4]:
import pandas as pd
import io


def process_single_match(client, match_id: str) -> pd.DataFrame:
    """
    Process a single match file and return the processed DataFrame or None on failure.

    Args:
        client: The HDFS client for file operations.
        match_id: Unique ID for each match.

    Returns:
        DataFrame of the processed match or None if an error occurs.
    """
    try:
        with client.read(
            os.path.join(config.RAW_DATA_DIR, "t20s_csv2", f"{match_id}_info.csv")
        ) as reader:
            data = reader.read()
        match_df = pd.read_csv(
            io.StringIO(data.decode("utf-8")),
            header=None,
            names=["col1", "attributes", "values", "players", "code"],
        )
        match_df = match_df.drop(columns=["col1", "players", "code"]).T
        match_df.columns = match_df.iloc[0]
        match_df["match_id"] = match_id
        match_df = match_df[
            ["match_id", "team", "team", "gender", "season", "winner"]
        ].drop("attributes")
        match_df = match_df.reset_index(drop=True)
        return match_df
    except Exception as e:
        print(f"Error processing match {match_id}: {e}")
        return None

In [5]:
from tqdm import tqdm
import concurrent.futures
import logging

logging.basicConfig(level=logging.ERROR)
recalculated_matches = []

# Set the logging level for the HDFS client to WARNING
logging.getLogger("hdfs.client").setLevel(logging.CRITICAL)
injured_matches = []

# Process each match concurrently using ThreadPoolExecutor
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {
        executor.submit(process_single_match, client, match_id): match_id
        for match_id in match_ids
    }

    # Gather results with tqdm for progress tracking
    for future in tqdm(
        concurrent.futures.as_completed(futures),
        total=len(futures),
        desc="Processing Matches",
    ):
        match = futures[future]
        try:
            result = future.result()
            if result is not None:
                recalculated_matches.append(result)
            else:
                injured_matches.append(match)
        except Exception as e:
            injured_matches.append(match)

if not recalculated_matches:
    print("No matches were successfully processed.")
    raise ValueError("No matches were successfully processed.")

matches_data = pd.concat(recalculated_matches, ignore_index=True)
matches_data



Processing Matches:   0%|          | 0/3825 [00:00<?, ?it/s]



Processing Matches:   0%|          | 16/3825 [00:00<01:24, 45.27it/s]

Error processing match 1041617: "['winner'] not in index"


Processing Matches:   1%|▏         | 53/3825 [00:00<01:00, 62.09it/s]

Error processing match 1115799: "['winner'] not in index"


Processing Matches:   2%|▏         | 91/3825 [00:01<00:56, 66.56it/s]

Error processing match 1123209: "['winner'] not in index"


Processing Matches:   4%|▎         | 143/3825 [00:02<00:53, 68.29it/s]

Error processing match 1141835: "['winner'] not in index"
Error processing match 1142504: "['winner'] not in index"


Processing Matches:   4%|▍         | 168/3825 [00:02<00:51, 71.65it/s]

Error processing match 1144172: "['winner'] not in index"
Error processing match 1144991: "['winner'] not in index"


Processing Matches:   5%|▌         | 195/3825 [00:03<00:52, 69.12it/s]

Error processing match 1147733: "['winner'] not in index"


Processing Matches:   7%|▋         | 257/3825 [00:03<00:48, 72.89it/s]

Error processing match 1157710: "['winner'] not in index"


Processing Matches:   8%|▊         | 299/3825 [00:04<00:48, 72.47it/s]

Error processing match 1172470: "['winner'] not in index"


Processing Matches:   9%|▉         | 351/3825 [00:05<00:44, 77.50it/s]

Error processing match 1173066: "['winner'] not in index"
Error processing match 1176795: "['winner'] not in index"
Error processing match 1177484: "['winner'] not in index"


Processing Matches:  10%|▉         | 368/3825 [00:05<00:47, 73.24it/s]



Processing Matches:  11%|█         | 404/3825 [00:05<00:46, 73.07it/s]

Error processing match 1183544: "['winner'] not in index"
Error processing match 1183527: "['winner'] not in index"


Processing Matches:  11%|█         | 429/3825 [00:06<00:52, 65.09it/s]

Error processing match 1185187: "['winner'] not in index"


Processing Matches:  12%|█▏        | 447/3825 [00:06<00:46, 72.69it/s]

Error processing match 1186492: "['winner'] not in index"
Error processing match 1187669: "['winner'] not in index"
Error processing match 1187679: "['winner'] not in index"
Error processing match 1187680: "['winner'] not in index"


Processing Matches:  12%|█▏        | 474/3825 [00:06<00:42, 78.21it/s]

Error processing match 1188380: "['winner'] not in index"


Processing Matches:  13%|█▎        | 490/3825 [00:07<00:44, 75.56it/s]

Error processing match 1190751: "['winner'] not in index"
Error processing match 1190607: "['winner'] not in index"


Processing Matches:  14%|█▍        | 532/3825 [00:07<00:43, 76.03it/s]

Error processing match 1197398: "['winner'] not in index"


Processing Matches:  15%|█▍        | 565/3825 [00:08<00:43, 74.78it/s]

Error processing match 1198244: "['winner'] not in index"


Processing Matches:  17%|█▋        | 660/3825 [00:09<00:41, 77.16it/s]

Error processing match 1203677: "['winner'] not in index"


Processing Matches:  19%|█▉        | 727/3825 [00:10<00:46, 67.27it/s]

Error processing match 1233956: "['winner'] not in index"


Processing Matches:  20%|█▉        | 763/3825 [00:11<00:47, 64.52it/s]

Error processing match 1249240: "['winner'] not in index"


Processing Matches:  21%|██        | 805/3825 [00:11<00:41, 73.45it/s]

Error processing match 1257949: "['winner'] not in index"


Processing Matches:  22%|██▏       | 823/3825 [00:11<00:43, 69.66it/s]

Error processing match 1263164: "['winner'] not in index"
Error processing match 1263472: "['winner'] not in index"
Error processing match 1263167: "['winner'] not in index"
Error processing match 1263166: "['winner'] not in index"
Error processing match 1263621: "['winner'] not in index"
Error processing match 1263573: "['winner'] not in index"


Processing Matches:  23%|██▎       | 865/3825 [00:12<00:40, 72.67it/s]

Error processing match 1267311: "['winner'] not in index"


Processing Matches:  24%|██▍       | 925/3825 [00:13<00:39, 74.08it/s]

Error processing match 1273415: "['winner'] not in index"


Processing Matches:  27%|██▋       | 1028/3825 [00:14<00:36, 75.99it/s]

Error processing match 1278691: "['winner'] not in index"


Processing Matches:  29%|██▉       | 1103/3825 [00:15<00:38, 70.45it/s]

Error processing match 1286674: "['winner'] not in index"


Processing Matches:  31%|███▏      | 1203/3825 [00:17<00:34, 76.67it/s]

Error processing match 1298152: "['winner'] not in index"


Processing Matches:  36%|███▌      | 1358/3825 [00:19<00:33, 74.62it/s]

Error processing match 1317149: "['winner'] not in index"
Error processing match 1317635: "['winner'] not in index"
Error processing match 1317488: "['winner'] not in index"
Error processing match 1317639: "['winner'] not in index"


Processing Matches:  37%|███▋      | 1402/3825 [00:19<00:33, 71.30it/s]

Error processing match 1320193: "['winner'] not in index"
Error processing match 1320192: "['winner'] not in index"


Processing Matches:  37%|███▋      | 1420/3825 [00:20<00:32, 75.01it/s]

Error processing match 1320210: "['winner'] not in index"


Processing Matches:  40%|███▉      | 1527/3825 [00:21<00:31, 73.66it/s]

Error processing match 1322277: "['winner'] not in index"
Error processing match 1322362: "['winner'] not in index"


Processing Matches:  43%|████▎     | 1630/3825 [00:22<00:28, 76.38it/s]

Error processing match 1334418: "['winner'] not in index"


Processing Matches:  44%|████▎     | 1673/3825 [00:23<00:34, 63.16it/s]

Error processing match 1336982: "['winner'] not in index"


Processing Matches:  45%|████▍     | 1721/3825 [00:24<00:28, 74.28it/s]

Error processing match 1343743: "['winner'] not in index"
Error processing match 1339617: "['winner'] not in index"
Error processing match 1343747: "['winner'] not in index"
Error processing match 1343742: "['winner'] not in index"
Error processing match 1343746: "['winner'] not in index"
Error processing match 1343758: "['winner'] not in index"


Processing Matches:  45%|████▌     | 1739/3825 [00:24<00:27, 75.80it/s]

Error processing match 1343767: "['winner'] not in index"


Processing Matches:  46%|████▌     | 1758/3825 [00:24<00:28, 72.02it/s]

Error processing match 1343790: "['winner'] not in index"


Processing Matches:  47%|████▋     | 1783/3825 [00:25<00:29, 69.88it/s]

Error processing match 1344515: "['winner'] not in index"
Error processing match 1345425: "['winner'] not in index"


Processing Matches:  47%|████▋     | 1816/3825 [00:25<00:29, 68.13it/s]

Error processing match 1349127: "['winner'] not in index"


Processing Matches:  48%|████▊     | 1834/3825 [00:25<00:29, 66.67it/s]

Error processing match 1349387: "['winner'] not in index"
Error processing match 1349389: "['winner'] not in index"
Error processing match 1354799: "['winner'] not in index"


Processing Matches:  49%|████▊     | 1863/3825 [00:26<00:25, 77.47it/s]



Processing Matches:  49%|████▉     | 1880/3825 [00:26<00:26, 74.78it/s]

Error processing match 1370791: "['winner'] not in index"


Processing Matches:  52%|█████▏    | 1980/3825 [00:27<00:27, 68.14it/s]

Error processing match 1380586: "['winner'] not in index"


Processing Matches:  55%|█████▍    | 2091/3825 [00:29<00:25, 68.33it/s]

Error processing match 1388214: "['winner'] not in index"


Processing Matches:  55%|█████▌    | 2117/3825 [00:29<00:22, 74.72it/s]

Error processing match 1391709: "['winner'] not in index"
Error processing match 1392352: "['winner'] not in index"


Processing Matches:  57%|█████▋    | 2178/3825 [00:30<00:21, 76.93it/s]

Error processing match 1394770: "['winner'] not in index"


Processing Matches:  59%|█████▊    | 2246/3825 [00:31<00:20, 75.91it/s]

Error processing match 1398255: "['winner'] not in index"


Processing Matches:  59%|█████▉    | 2268/3825 [00:31<00:19, 80.90it/s]

Error processing match 1399055: "['winner'] not in index"


Processing Matches:  60%|██████    | 2307/3825 [00:32<00:20, 74.91it/s]

Error processing match 1400975: "['winner'] not in index"


Processing Matches:  61%|██████▏   | 2350/3825 [00:33<00:35, 42.09it/s]

Error processing match 1405327: "['winner'] not in index"


Processing Matches:  62%|██████▏   | 2372/3825 [00:33<00:25, 57.55it/s]

Error processing match 1407094: "['winner'] not in index"


Processing Matches:  63%|██████▎   | 2418/3825 [00:34<00:19, 71.44it/s]

Error processing match 1412534: "['winner'] not in index"
Error processing match 1415703: "['winner'] not in index"
Error processing match 1415706: "['winner'] not in index"
Error processing match 1415711: "['winner'] not in index"


Processing Matches:  67%|██████▋   | 2555/3825 [00:36<00:17, 73.42it/s]

Error processing match 1422042: "['winner'] not in index"
Error processing match 1422804: "['winner'] not in index"
Error processing match 1423440: "['winner'] not in index"


Processing Matches:  68%|██████▊   | 2583/3825 [00:36<00:16, 74.41it/s]

Error processing match 1423460: "['winner'] not in index"
Error processing match 1423474: "['winner'] not in index"


Processing Matches:  68%|██████▊   | 2614/3825 [00:37<00:16, 74.18it/s]

Error processing match 1424829: "['winner'] not in index"


Processing Matches:  69%|██████▉   | 2630/3825 [00:37<00:15, 75.42it/s]

Error processing match 1426049: "['winner'] not in index"


Processing Matches:  72%|███████▏  | 2757/3825 [00:38<00:13, 77.38it/s]

Error processing match 1431122: "['winner'] not in index"
Error processing match 1432196: "['winner'] not in index"


Processing Matches:  73%|███████▎  | 2807/3825 [00:39<00:13, 73.90it/s]

Error processing match 1433377: "['winner'] not in index"
Error processing match 1434292: "['winner'] not in index"


Processing Matches:  77%|███████▋  | 2945/3825 [00:41<00:12, 68.38it/s]

Error processing match 1442989: "['winner'] not in index"


Processing Matches:  77%|███████▋  | 2961/3825 [00:42<00:13, 63.96it/s]

Error processing match 1444549: "['winner'] not in index"


Processing Matches:  79%|███████▊  | 3003/3825 [00:42<00:10, 77.13it/s]

Error processing match 1446762: "['winner'] not in index"


Processing Matches:  79%|███████▉  | 3027/3825 [00:42<00:11, 72.22it/s]

Error processing match 1447497: "['winner'] not in index"


Processing Matches:  81%|████████▏ | 3113/3825 [00:43<00:09, 78.94it/s]

Error processing match 1453519: "['winner'] not in index"


Processing Matches:  84%|████████▎ | 3202/3825 [00:45<00:08, 71.56it/s]

Error processing match 1459720: "['winner'] not in index"
Error processing match 237242: "['winner'] not in index"


Processing Matches:  84%|████████▍ | 3228/3825 [00:45<00:08, 72.96it/s]

Error processing match 287862: "['winner'] not in index"


Processing Matches:  85%|████████▌ | 3264/3825 [00:46<00:07, 73.95it/s]

Error processing match 350050: "['winner'] not in index"


Processing Matches:  87%|████████▋ | 3311/3825 [00:46<00:06, 76.67it/s]

Error processing match 366707: "['winner'] not in index"
Error processing match 412681: "['winner'] not in index"


Processing Matches:  87%|████████▋ | 3344/3825 [00:47<00:06, 73.79it/s]

Error processing match 423788: "['winner'] not in index"


Processing Matches:  89%|████████▊ | 3389/3825 [00:47<00:05, 74.90it/s]

Error processing match 527683: "['winner'] not in index"


Processing Matches:  89%|████████▉ | 3409/3825 [00:48<00:05, 75.23it/s]

Error processing match 533282: "['winner'] not in index"
Error processing match 533284: "['winner'] not in index"


Processing Matches:  89%|████████▉ | 3417/3825 [00:48<00:05, 74.44it/s]

Error processing match 533292: "['winner'] not in index"


Processing Matches:  90%|█████████ | 3460/3825 [00:48<00:04, 75.48it/s]

Error processing match 534234: "['winner'] not in index"


Processing Matches:  91%|█████████ | 3478/3825 [00:48<00:04, 80.02it/s]

Error processing match 566927: "['winner'] not in index"


Processing Matches:  91%|█████████▏| 3497/3825 [00:49<00:04, 78.26it/s]

Error processing match 571149: "['winner'] not in index"
Error processing match 582186: "['winner'] not in index"


Processing Matches:  93%|█████████▎| 3542/3825 [00:49<00:03, 80.53it/s]

Error processing match 640955: "['winner'] not in index"


Processing Matches:  95%|█████████▌| 3644/3825 [00:51<00:02, 76.17it/s]

Error processing match 730293: "['winner'] not in index"


Processing Matches:  98%|█████████▊| 3730/3825 [00:52<00:01, 73.16it/s]

Error processing match 902653: "['winner'] not in index"


Processing Matches:  98%|█████████▊| 3755/3825 [00:52<00:00, 73.47it/s]

Error processing match 951319: "['winner'] not in index"


Processing Matches: 100%|██████████| 3825/3825 [00:53<00:00, 71.53it/s]


attributes,match_id,team,team.1,team.2,team.3,gender,season,winner
0,1019981,New Zealand,Bangladesh,New Zealand,Bangladesh,male,2016/17,New Zealand
1,1031665,West Indies,England,West Indies,England,male,2017,West Indies
2,1001349,Australia,Sri Lanka,Australia,Sri Lanka,male,2016/17,Sri Lanka
3,1034825,India,England,India,England,male,2016/17,England
4,1019979,New Zealand,Bangladesh,New Zealand,Bangladesh,male,2016/17,New Zealand
...,...,...,...,...,...,...,...,...
3705,966761,India,United Arab Emirates,India,United Arab Emirates,male,2015/16,India
3706,966751,India,Pakistan,India,Pakistan,male,2015/16,India
3707,995469,Sri Lanka,Australia,Sri Lanka,Australia,male,2016,Australia
3708,995467,Sri Lanka,Australia,Sri Lanka,Australia,male,2016,Australia


In [8]:
if recalculated_matches:
    # Concatenate all match DataFrames
    matches_data = pd.concat(recalculated_matches, ignore_index=True)
    matches_data.columns = [
        "match_id",
        "team1",
        "team2",
        "team1_duplicate",
        "team2_duplicate",
        "gender",
        "season",
        "winner",
    ]
    matches_data = matches_data.drop(columns=["team1_duplicate", "team2_duplicate"])

    # Data quality checks
    if matches_data.empty:
        print("No match data consolidated.")
        raise ValueError("Consolidated match data is empty.")
    required_columns = ["match_id", "team1", "team2", "gender", "season", "winner"]
    missing_columns = [
        col for col in required_columns if col not in matches_data.columns
    ]
    if missing_columns:
        print(f"Missing columns in matches data: {missing_columns}")
        raise ValueError(f"Missing columns in matches data: {missing_columns}")

    # Save matches_data directly to HDFS
    utils.ensure_hdfs_directory(client, config.PROCESSED_DATA_DIR)
    matches_csv_path = os.path.join(config.PROCESSED_DATA_DIR, "matches.csv")
    csv_data = matches_data.to_csv(index=False)
    client.write(matches_csv_path, data=csv_data, overwrite=True)

    print(f"Successfully processed matches: {len(recalculated_matches)}")
    print(f"Failed matches: {len(injured_matches)}")
    print("Matches data processing and saving completed successfully.")
else:
    print("No matches were successfully processed.")
    raise Exception("No matches were successfully processed.")

Successfully processed matches: 3710
Failed matches: 115
Matches data processing and saving completed successfully.
