In [1]:
from src.data_cleaning.data_cleaning import (
    clean_fighter_tott,
    clean_event_details,
    clean_fighter_details,
    clean_fight_stats,
    clean_fight_results
)
from pathlib import Path
import pandas as pd

## Read data

Read all raw data files except `ufc_fight_details.csv`, which doesn't add anything.

In [58]:
# Directory that holds the raw CSV files.
raw_data_path = Path('./data/raw')

# Load each raw table into its own DataFrame; `ufc_fight_details.csv` is
# intentionally not read.
df_fighter_detail = pd.read_csv(raw_data_path / 'ufc_fighter_details.csv')
df_event_details = pd.read_csv(raw_data_path / 'ufc_event_details.csv')
df_fighter_tott = pd.read_csv(raw_data_path / 'ufc_fighter_tott.csv')
df_fight_stats = pd.read_csv(raw_data_path / 'ufc_fight_stats.csv')
df_fight_results = pd.read_csv(raw_data_path / 'ufc_fight_results.csv')

## Clean Data

* Cleans fighter stats (`ufc_fighter_tott`) by adjusting column names, converting string-based measurements to numerical, formatting dates, extracting IDs, and removing unneeded columns.
* Processes event details to extract IDs, convert dates, and split location data.
* Standardizes fighter details by cleaning column names and extracting IDs.
* Refines fight stats by organizing columns, splitting key data for clarity, and dropping irrelevant percentage columns.
* Enhances fight results by extracting IDs, clarifying outcomes, detailing bouts, converting time formats, and eliminating superfluous columns.

In [3]:
# Run each raw DataFrame through its dedicated cleaning function from
# src.data_cleaning.data_cleaning; every result is kept under a `*_clean` name
# so the raw frames stay available under their original names.
# NOTE(review): whether the helpers mutate their input in place is not visible
# from this notebook — confirm before reusing the raw frames afterwards.

# Fighter details: clean column names and extract IDs.
df_fighter_detail_clean = clean_fighter_details(df_fighter_detail)
# Event details: extract IDs, convert dates, split location data.
df_event_details_clean = clean_event_details(df_event_details)
# Fighter tott: clean column names, convert measurements and dates, extract IDs.
df_fighter_tott_clean = clean_fighter_tott(df_fighter_tott)
# Fight stats: organize/split columns and drop percentage columns.
df_fight_stats_clean = clean_fight_stats(df_fight_stats)
# Fight results: extract IDs, clarify outcomes, convert time formats.
df_fight_results_clean = clean_fight_results(df_fight_results)

## Save

Save the clean data as CSV files in a separate folder (`./data/clean`).

In [6]:
# Destination directory for the cleaned CSV files.
clean_data_path = Path('./data/clean')

# `DataFrame.to_csv` does not create missing directories, so make sure the
# output folder exists first (no-op when it is already there).
clean_data_path.mkdir(parents=True, exist_ok=True)

# Write every cleaned table without the pandas index column.
df_fighter_detail_clean.to_csv(clean_data_path / 'ufc_fighter_details_clean.csv', index=False)
df_event_details_clean.to_csv(clean_data_path / 'ufc_event_details_clean.csv', index=False)
df_fighter_tott_clean.to_csv(clean_data_path / 'ufc_fighter_tott_clean.csv', index=False)
df_fight_stats_clean.to_csv(clean_data_path / 'ufc_fight_stats_clean.csv', index=False)
df_fight_results_clean.to_csv(clean_data_path / 'ufc_fight_results_clean.csv', index=False)

## Data dictionary

### ufc_fighter_details_clean
* first: First name of the fighter.
* last: Last name of the fighter.
* nickname: Fighter's nickname.
* fighter_id: Unique identifier for the fighter.

### ufc_event_details_clean
* event: Name of the UFC event.
* date: Date when the event took place.
* location: Location where the event was held.
* event_id: Unique identifier for the event.
* city: City where the event took place.
* state: State where the event took place (if applicable).
* country: Country where the event took place.

### ufc_fighter_tott_clean
* fighter: Name of the fighter.
* height: Height of the fighter.
* reach: Reach of the fighter.
* stance: Fighting stance of the fighter.
* dob: Date of birth of the fighter.
* fighter_id: Unique identifier for the fighter.

### ufc_fight_stats_clean
* event: Name of the UFC event.
* bout: Details about the fight bout.
* round: The round number.
* fighter: Name of the fighter.
* kd: Number of knockdowns.
* subatt: Number of submission attempts.
* rev: Number of reversals.
* ctrl: Control time in the fight.
* sigstr_throw: Significant strikes thrown.
* sigstr_land: Significant strikes landed.
* total_str_throw: Total strikes thrown.
* total_str_land: Total strikes landed.
* td_throw: Takedowns attempted.
* td_land: Takedowns succeeded.
* head_throw: Head strikes thrown.
* head_land: Head strikes landed.
* body_throw: Body strikes thrown.
* body_land: Body strikes landed.
* leg_throw: Leg strikes thrown.
* leg_land: Leg strikes landed.
* distance_throw: Strikes thrown at a distance.
* distance_land: Strikes landed at a distance.
* clinch_throw: Strikes thrown in a clinch.
* clinch_land: Strikes landed in a clinch.
* ground_throw: Ground strikes thrown.
* ground_land: Ground strikes landed.

### ufc_fight_results_clean
* event: Name of the UFC event.
* outcome: Outcome of the fight.
* weightclass: Weight class of the fight.
* method: Method of victory.
* round: Final round of the fight.
* time: Time the fight ended in the final round.
* fight_id: Unique identifier for the fight.
* fighter1: Name of fighter 1.
* fighter2: Name of fighter 2.