# Adding paths and files


In [None]:
import os
import sys

sys.path.append(os.path.join(os.getcwd(), "..", "..", ".."))
from configs import spark_config as config
from utils import spark_utils as utils

# Specify the directory where your CSV files are located
directory = os.path.join(config.RAW_DATA_DIR, "t20s_csv2")
print(directory)

client = utils.get_hdfs_client()

all_files = client.list(directory)
info_files = [os.path.join(directory, file) for file in all_files if "info" in file]

matches = []
# Print the list of CSV files
for info_file in info_files:
    matches.append(info_file.split("/")[-1])

match_ids = []
for csv_file in matches:
    match_ids.append(csv_file.split("_")[0])

/usr/ravi/t20/data/1_rawData/t20s_csv2
[[34m2024-11-24T13:58:03.457+0530[0m] {[34mbase.py:[0m84} INFO[0m - Retrieving connection 'webhdfs_default'[0m
[[34m2024-11-24T13:58:03.460+0530[0m] {[34mwebhdfs.py:[0m82} INFO[0m - Trying to connect to 192.168.245.142:9870[0m
[[34m2024-11-24T13:58:03.462+0530[0m] {[34mwebhdfs.py:[0m86} INFO[0m - Trying namenode 192.168.245.142[0m
[[34m2024-11-24T13:58:03.464+0530[0m] {[34mclient.py:[0m192} INFO[0m - Instantiated <InsecureClient(url='http://192.168.245.142:9870/')>.[0m
[[34m2024-11-24T13:58:03.466+0530[0m] {[34mclient.py:[0m320} INFO[0m - Fetching status for '/'.[0m
[[34m2024-11-24T13:58:03.479+0530[0m] {[34mwebhdfs.py:[0m96} INFO[0m - Using namenode 192.168.245.142 for hook[0m
[[34m2024-11-24T13:58:03.481+0530[0m] {[34mclient.py:[0m1116} INFO[0m - Listing '/usr/ravi/t20/data/1_rawData/t20s_csv2'.[0m


In [3]:
import pandas as pd
import io

with client.read(
    os.path.join(config.RAW_DATA_DIR, "t20s_csv2", f"{info_files[1]}")
) as reader:
    data = reader.read()
df = pd.read_csv(
    io.StringIO(data.decode("utf-8")),
    header=None,
    names=["type", "heading", "subkey", "players", "player_id"],
    skipinitialspace=True,
).drop("type", axis=1)
df.head(10)

[[34m2024-11-24T13:58:23.656+0530[0m] {[34mclient.py:[0m724} INFO[0m - Reading file '/usr/ravi/t20/data/1_rawData/t20s_csv2/1001351_info.csv'.[0m


Unnamed: 0,heading,subkey,players,player_id
0,2.1.0,,,
1,balls_per_over,6,,
2,team,Australia,,
3,team,Sri Lanka,,
4,gender,male,,
5,season,2016/17,,
6,date,2017/02/19,,
7,event,Sri Lanka in Australia T20I Series,,
8,match_number,2,,
9,venue,"Simonds Stadium, South Geelong",,


In [None]:
df["subkey"][5]

'2016/17'

In [None]:
match_id = pd.to_numeric(info_files[0].split("/")[-1].split("_")[0])
match_id

1001349

In [6]:
# Filter dataframes based on the heading
players_df = df[df["heading"] == "player"].drop(["heading", "player_id"], axis=1)
registry_df = df[df["heading"] == "registry"].drop("heading", axis=1)

# Join on the 'players' column with 'player_id' from the registry dataframe
merged_df = players_df.merge(
    registry_df[["players", "player_id"]], on="players", how="inner"
)

# Display the merged dataframe
merged_df.rename(columns={"players": "player", "subkey": "country"}, inplace=True)
merged_df["match_id"] = match_id
merged_df

Unnamed: 0,country,player,player_id,match_id
0,Australia,M Klinger,b970a03f,1001349
1,Australia,AJ Finch,b8d490fd,1001349
2,Australia,BR Dunk,272d796e,1001349
3,Australia,MC Henriques,32198ae0,1001349
4,Australia,TM Head,12b610c2,1001349
5,Australia,AJ Turner,ff1e12a0,1001349
6,Australia,JP Faulkner,808f425a,1001349
7,Australia,TD Paine,5748e866,1001349
8,Australia,PJ Cummins,ded9240e,1001349
9,Australia,AJ Tye,7c7d63a2,1001349


In [7]:
dataframes = pd.DataFrame(
    columns=["country", "player", "player_id", "season", "match_id"]
)
injured_matches = []
import logging

logging.getLogger("hdfs.client").setLevel(logging.WARNING)
from tqdm import tqdm

for info_file in tqdm(info_files):
    match_id = pd.to_numeric(info_file.split("/")[-1].split("_")[0])
    try:
        with client.read(
            os.path.join(config.RAW_DATA_DIR, "t20s_csv2", f"{match_id}_info.csv")
        ) as reader:
            data = reader.read()
        df = pd.read_csv(
            io.StringIO(data.decode("utf-8")),
            header=None,
            names=["type", "heading", "subkey", "players", "player_id"],
            skipinitialspace=True,
        ).drop("type", axis=1)
        players_df = df[df["heading"] == "player"].drop(
            ["heading", "player_id"], axis=1
        )
        registry_df = df[df["heading"] == "registry"].drop("heading", axis=1)
        merged_df = players_df.merge(
            registry_df[["players", "player_id"]], on="players", how="inner"
        )
        merged_df.rename(
            columns={"players": "player", "subkey": "country"}, inplace=True
        )
        season = df["subkey"][5]
        merged_df["match_id"] = match_id
        merged_df["season"] = season
        if len(merged_df) != 22:
            raise Exception("Injured Match")
        dataframes = pd.concat([dataframes, merged_df])
    except:
        injured_matches.append(match_id)
print(injured_matches)

  0%|          | 0/3825 [00:00<?, ?it/s]

100%|██████████| 3825/3825 [01:47<00:00, 35.64it/s]

[1173070, 1223952, 1251954, 1262758, 1262760, 1263164, 1263166, 1263167, 1267682, 1268757, 1270834, 1270835, 1273136, 1273138, 1273144, 1274596, 1274597, 1286970, 1289274, 1298163, 1382164, 1388204, 1393329, 1411261, 1425126, 1425660, 1432443, 1434291, 1443786, 1443789, 1444961, 1449012, 1452625, 222678]





In [8]:
dataframes

Unnamed: 0,country,player,player_id,season,match_id
0,Australia,AJ Finch,b8d490fd,2016/17,1001349
1,Australia,M Klinger,b970a03f,2016/17,1001349
2,Australia,TM Head,12b610c2,2016/17,1001349
3,Australia,MC Henriques,32198ae0,2016/17,1001349
4,Australia,AJ Turner,ff1e12a0,2016/17,1001349
...,...,...,...,...,...
17,Sri Lanka,NLTC Perera,0f12f9df,2016,995469
18,Sri Lanka,SS Pathirana,753c95b9,2016,995469
19,Sri Lanka,S Prasanna,f78e7113,2016,995469
20,Sri Lanka,SMSM Senanayake,4c4fa80b,2016,995469


In [9]:
len(dataframes) / 22, len(injured_matches)

(3791.0, 34)

In [10]:
client.write(
    os.path.join(config.PROCESSED_DATA_DIR, "match_players.csv"),
    dataframes.to_csv(index=False),
    overwrite=True,
)

# Individual player's data


In [11]:
import polars as pl

players = (
    pl.from_pandas(dataframes)
    .drop("match_id")
    .select("player", "country", "player_id")
    .unique()
)
players

player,country,player_id
str,str,str
"""G Bradley""","""Hong Kong""","""7c423f38"""
"""Bashir Ahmad""","""Switzerland""","""ffafcfce"""
"""Shoriful Islam""","""Bangladesh""","""bb34fd31"""
"""Nawaf Ahmed""","""Kuwait""","""bfe33045"""
"""DO Leicher""","""Namibia""","""e28bb0b9"""
…,…,…
"""C Chatphaisan""","""Thailand""","""85c56be8"""
"""E Walker""","""Isle of Man""","""b312a9c1"""
"""Simandeep Singh""","""Hong Kong""","""2a242d58"""
"""Noor Hayati Zakaria""","""Malaysia""","""31382975"""


In [12]:
client.write(
    os.path.join(config.PROCESSED_DATA_DIR, "players.csv"),
    players.write_csv(),
    overwrite=True,
)