In [1]:
import pandas as pd
import numpy as np

# Load data
ufc_data_cleaned = pd.read_csv('../data/processed/ufc_data_final.csv')

# 1. Check for missing values
print("=== Basic Missing Values Check ===")
print(ufc_data_cleaned.isnull().sum())  # Number of missing values in each column
print(f"Total missing values: {ufc_data_cleaned.isnull().sum().sum()}")  # Total number of missing values

# 2. Check for duplicate rows
print("\n=== Duplicate Rows Check ===")
print(f"Total duplicate rows: {ufc_data_cleaned.duplicated().sum()}")

# 3. Check for empty or whitespace-only strings
print("\n=== Empty or Whitespace Strings Check ===")
for col in ufc_data_cleaned.select_dtypes(include=['object']).columns:
    empty_or_whitespace = ufc_data_cleaned[col].str.isspace().sum()
    print(f"{col}: {empty_or_whitespace} empty or whitespace strings")

# 4. Check for zero values
print("\n=== Zero Values Check ===")
zero_counts = (ufc_data_cleaned == 0).sum()
print(zero_counts[zero_counts > 0])  # Show only columns with zeros

# 5. Check for outliers
print("\n=== Outliers Check ===")
for col in ufc_data_cleaned.select_dtypes(include=[np.number]).columns:
    q1 = ufc_data_cleaned[col].quantile(0.25)
    q3 = ufc_data_cleaned[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers = ufc_data_cleaned[(ufc_data_cleaned[col] < lower_bound) | (ufc_data_cleaned[col] > upper_bound)]
    print(f"{col}: {len(outliers)} outliers")

# 6. Check data types
print("\n=== Data Types ===")
print(ufc_data_cleaned.dtypes)

# 7. Check unique values in each column
print("\n=== Unique Values Check ===")
for col in ufc_data_cleaned.columns:
    unique_values = ufc_data_cleaned[col].nunique()
    print(f"{col}: {unique_values} unique values")

# 8. Descriptive statistics
print("\n=== Descriptive Statistics ===")
print(ufc_data_cleaned.describe(include='all'))  # General info for numeric and categorical data

# 9. Check for negative values (if not expected)
print("\n=== Negative Values Check ===")
negative_counts = (ufc_data_cleaned.select_dtypes(include=[np.number]) < 0).sum()
print(negative_counts[negative_counts > 0])  # Show only columns with negative values

# 10. Check if data type matches content (e.g., numeric strings)
print("\n=== Data Consistency Check (Numeric Columns) ===")
for col in ufc_data_cleaned.select_dtypes(include=['object']).columns:
    try:
        # Try converting to numbers
        ufc_data_cleaned[col].astype(float)
        print(f"{col}: Can be converted to numeric")
    except ValueError:
        print(f"{col}: Cannot be converted to numeric")

# Summary
print("\n=== Final Check Summary ===")
print("1. Missing values handled? Check above.")
print("2. Duplicates resolved? Check above.")
print("3. Data distribution looks reasonable? Check outliers section.")

=== Basic Missing Values Check ===
event_name            0
date                  0
location              0
r_fighter             0
b_fighter             0
                     ..
td_acc_total_diff     0
str_def_total_diff    0
td_def_total_diff     0
sub_avg_diff          0
td_avg_diff           0
Length: 97, dtype: int64
Total missing values: 0

=== Duplicate Rows Check ===
Total duplicate rows: 0

=== Empty or Whitespace Strings Check ===
event_name: 0 empty or whitespace strings
date: 0 empty or whitespace strings
location: 0 empty or whitespace strings
r_fighter: 0 empty or whitespace strings
b_fighter: 0 empty or whitespace strings
winner: 0 empty or whitespace strings
weight_class: 0 empty or whitespace strings
gender: 0 empty or whitespace strings
method: 0 empty or whitespace strings
referee: 0 empty or whitespace strings
r_stance: 0 empty or whitespace strings
b_stance: 0 empty or whitespace strings

=== Zero Values Check ===
is_title_bout         7024
r_kd                  58

In [2]:
import numpy as np

# Select numeric columns
numeric_columns = ufc_data_cleaned.select_dtypes(include=[np.number])

# Create a DataFrame to store outliers with column and value info
outliers = pd.DataFrame(columns=["Column", "Value", "Index"])

for col in numeric_columns.columns:
    Q1 = numeric_columns[col].quantile(0.25)  # First quartile
    Q3 = numeric_columns[col].quantile(0.75)  # Third quartile
    IQR = Q3 - Q1  # Interquartile range

    # Outlier boundaries
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Select outliers
    outliers_in_col = ufc_data_cleaned[(numeric_columns[col] < lower_bound) | (numeric_columns[col] > upper_bound)]

    # Add outliers to the overall DataFrame
    for idx, value in outliers_in_col[col].items():
        outliers = pd.concat([outliers, pd.DataFrame({
            "Column": [col],
            "Value": [value],
            "Index": [idx]
        })])

# Remove duplicates (one row can be an outlier in several columns)
outliers = outliers.drop_duplicates(subset=["Index", "Column"])

# Show 5 examples of outliers
print(outliers.head(10))

          Column Value Index
0  is_title_bout     1    25
0  is_title_bout     1    62
0  is_title_bout     1    99
0  is_title_bout     1   100
0  is_title_bout     1   122
0  is_title_bout     1   123
0  is_title_bout     1   170
0  is_title_bout     1   171
0  is_title_bout     1   191
0  is_title_bout     1   242
