In [1]:
import pandas as pd
# Load the poker hands dataframe from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df = pd.read_pickle("../dataframe/poker_dataframe.pkl")

In [40]:
# Display the loaded dataframe (the notebook renders a truncated preview of the first/last rows).
df

Unnamed: 0,Game ID,File,Player SB,Player BB,Preflop actions,Flop actions,Turn actions,River actions,Flop,Turn,River,SB stack,BB stack,SB cards,BB cards,Folded pre
0,502874582,Arezzo-0.01-0.02-USD-NoLimitHoldem-Pacific-6-4...,Oracool1,sirstadiljus,"[(f, 0)]",,,,,,,1.22,2.01,,,True
1,603604223,Asmara-0.01-0.02-USD-NoLimitHoldem-Pacific-6-4...,ArcticWin,Ceaban,"[(c, 0.01), (k, 0)]","[(k, 0), (k, 0)]","[(k, 0), (k, 0)]","[(b, 0.04), (c, 0.04)]","[5c, 9c, 5s]",7h,2c,2,1,"[ Qh, Tc ]","[ 6h, 2h ]",False
2,603604231,Asmara-0.01-0.02-USD-NoLimitHoldem-Pacific-6-4...,Ceaban,ArcticWin,"[(f, 0)]",,,,,,,1.94,1.06,,,True
3,603604236,Asmara-0.01-0.02-USD-NoLimitHoldem-Pacific-6-4...,ArcticWin,Ceaban,"[(f, 0)]",,,,,,,1.95,1.05,,,True
4,603604241,Asmara-0.01-0.02-USD-NoLimitHoldem-Pacific-6-4...,Ceaban,ArcticWin,"[(f, 0)]",,,,,,,1.94,1.06,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
520806,553224385,Corrientes-0.01-0.02-USD-NoLimitHoldem-Pacific...,Sensei.Dbb,roghnov,"[(c, 0.01), (k, 0)]","[(k, 0), (b, 0.02), (f, 0)]",,,"[Kc, 5d, 5s]",,,1.02,0.98,,,False
520807,553224389,Corrientes-0.01-0.02-USD-NoLimitHoldem-Pacific...,roghnov,Sensei.Dbb,"[(r, 0.03), (f, 0)]",,,,,,,1,1,,,False
520808,553224821,Corrientes-0.01-0.02-USD-NoLimitHoldem-Pacific...,mikvell,John_Efapi,"[(c, 0.01), (k, 0)]","[(b, 0.04), (c, 0.04)]","[(k, 0), (b, 0.08), (f, 0)]",,"[9h, 6h, 7h]",Ks,,3.36,1.90,,,False
520809,553224834,Corrientes-0.01-0.02-USD-NoLimitHoldem-Pacific...,John_Efapi,mikvell,"[(r, 0.05), (c, 0.04)]","[(k, 0), (k, 0)]","[(k, 0), (k, 0)]","[(k, 0), (k, 0)]","[9d, Qc, 5d]",7d,Ac,3.42,1.84,"[ 3d, 3s ]","[ 4h, Ah ]",False


In [31]:
def check_dataframe(df):
    """
    Run quality-control checks on every row of the poker dataframe.

    Each row is validated against the expected cell formats listed below, and
    every row that fails at least one check is reported.

    Checks performed per row:
      * 'Preflop actions', 'Flop actions', 'Turn actions', 'River actions' and
        'Flop' must each be a list or None.
      * 'Turn' and 'River' must each be a string of length 2 or None.
      * 'SB cards' and 'BB cards' must each be a string that starts with '['
        and ends with ']', or None.
      * 'SB stack' and 'BB stack' must be convertible with float().

    Parameters:
    df (pd.DataFrame): The dataframe to test (e.g. loaded from a .pkl file).

    Returns:
    error_dict (dict): Maps the index label of each bugged row to a short
                message describing the problem(s). When a row fails several
                checks, the messages are joined with '; '. Rows that pass
                every check are not included.

    Raises:
    ValueError: If any of the required columns is missing from the dataframe.

    Example:
    The function is called once on the dataframe.

        >>> bugged_rows = check_dataframe(dataframe)
        >>> print(bugged_rows)

    Notes:
    For the list, street-card and hole-card columns only None counts as a
    missing value; any other placeholder (e.g. NaN) is flagged as an error.
    """
    # Column groups, one per kind of check. Their concatenation is the list of
    # required columns.
    list_columns = ['Preflop actions', 'Flop actions', 'Turn actions', 'River actions', 'Flop']
    street_card_columns = ['Turn', 'River']
    stack_columns = ['SB stack', 'BB stack']
    hole_card_columns = ['SB cards', 'BB cards']
    required_columns = list_columns + street_card_columns + stack_columns + hole_card_columns

    # Check if all required columns are in the dataframe
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing required columns: {', '.join(missing_columns)}")

    error_dict = {}
    for index, row in df.iterrows():
        errors = []

        # Check if columns are lists or None type
        for col in list_columns:
            value = row[col]
            if value is not None and not isinstance(value, list):
                errors.append(f"{col} is not a list or None.")

        # Check if turn and river are strings of length 2 or None
        for col in street_card_columns:
            value = row[col]
            if value is not None and not (isinstance(value, str) and len(value) == 2):
                errors.append(f"{col} is not a string of length 2 or None.")

        # Check if SB/BB cards start with "[" and end with "]" or are None
        for col in hole_card_columns:
            value = row[col]
            bracketed = (isinstance(value, str) and value.startswith('[')
                         and value.endswith(']'))
            if value is not None and not bracketed:
                errors.append(f"{col} do not start with '[' and end with ']' or are not None.")

        # Check if SB stack and BB stack can be converted to float.
        # float() raises ValueError for unparsable strings but TypeError for
        # None and other non-numeric objects; both must be caught so that a
        # single bad cell is flagged instead of aborting the whole scan.
        for col in stack_columns:
            try:
                float(row[col])
            except (TypeError, ValueError):
                errors.append(f"{col} can't be converted to float.")

        if errors:
            error_dict[index] = '; '.join(errors)

    return error_dict


In [32]:
# Run the quality checks, then report every flagged row together with its message(s).
bugged_rows = check_dataframe(df)
for index in bugged_rows:
    error_message = bugged_rows[index]
    print(f"Row {index}: {error_message}")


Row 280872: SB cards do not start with '[' and end with ']' or are not None.; BB cards do not start with '[' and end with ']' or are not None.
Row 280873: SB cards do not start with '[' and end with ']' or are not None.; BB cards do not start with '[' and end with ']' or are not None.
Row 280876: SB cards do not start with '[' and end with ']' or are not None.; BB cards do not start with '[' and end with ']' or are not None.
Row 280878: SB cards do not start with '[' and end with ']' or are not None.; BB cards do not start with '[' and end with ']' or are not None.


In [34]:
# Collect the index labels of the flagged rows.
keys = [*bugged_rows]
print("Keys:", keys)

Keys: [280872, 280873, 280876, 280878]


In [37]:
def drop_rows_from_df(df, error_dict):
    """
    Return a copy of a dataframe without the rows listed in an error dictionary.

    The keys of `error_dict` are index labels of rows considered faulty. Those
    rows are removed and the result gets a fresh 0..n-1 index. The input
    dataframe itself is left untouched.

    Parameters:
    df (pd.DataFrame): The dataframe from which rows will be dropped.
    error_dict (dict): A dictionary keyed by row index label. The values
                       (error messages) are ignored; only the keys decide
                       which rows are removed.

    Returns:
    pd.DataFrame: A new dataframe with the specified rows dropped and the index reset.

    Example:
    >>> error_dict = {280872: "Error message", 280873: "Another error"}
    >>> new_df = drop_rows_from_df(original_df, error_dict)
    """
    # Iterating a dict yields its keys, i.e. the labels of the rows to remove.
    flagged_labels = [label for label in error_dict]

    # Remove the flagged rows first, then renumber the remaining ones.
    remaining = df.drop(index=flagged_labels)
    return remaining.reset_index(drop=True)

# Example usage
# Assuming 'df' is your original dataframe and 'error_dict' is your dictionary of errors
# new_df = drop_rows_from_df(df, error_dict)


In [41]:
# Build the cleaned dataframe by removing every row flagged by the checks.
df_without_bugs = drop_rows_from_df(df, bugged_rows)

In [42]:
# Display the cleaned dataframe (the notebook renders a truncated preview of the first/last rows).
df_without_bugs

Unnamed: 0,Game ID,File,Player SB,Player BB,Preflop actions,Flop actions,Turn actions,River actions,Flop,Turn,River,SB stack,BB stack,SB cards,BB cards,Folded pre
0,502874582,Arezzo-0.01-0.02-USD-NoLimitHoldem-Pacific-6-4...,Oracool1,sirstadiljus,"[(f, 0)]",,,,,,,1.22,2.01,,,True
1,603604223,Asmara-0.01-0.02-USD-NoLimitHoldem-Pacific-6-4...,ArcticWin,Ceaban,"[(c, 0.01), (k, 0)]","[(k, 0), (k, 0)]","[(k, 0), (k, 0)]","[(b, 0.04), (c, 0.04)]","[5c, 9c, 5s]",7h,2c,2,1,"[ Qh, Tc ]","[ 6h, 2h ]",False
2,603604231,Asmara-0.01-0.02-USD-NoLimitHoldem-Pacific-6-4...,Ceaban,ArcticWin,"[(f, 0)]",,,,,,,1.94,1.06,,,True
3,603604236,Asmara-0.01-0.02-USD-NoLimitHoldem-Pacific-6-4...,ArcticWin,Ceaban,"[(f, 0)]",,,,,,,1.95,1.05,,,True
4,603604241,Asmara-0.01-0.02-USD-NoLimitHoldem-Pacific-6-4...,Ceaban,ArcticWin,"[(f, 0)]",,,,,,,1.94,1.06,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
520802,553224385,Corrientes-0.01-0.02-USD-NoLimitHoldem-Pacific...,Sensei.Dbb,roghnov,"[(c, 0.01), (k, 0)]","[(k, 0), (b, 0.02), (f, 0)]",,,"[Kc, 5d, 5s]",,,1.02,0.98,,,False
520803,553224389,Corrientes-0.01-0.02-USD-NoLimitHoldem-Pacific...,roghnov,Sensei.Dbb,"[(r, 0.03), (f, 0)]",,,,,,,1,1,,,False
520804,553224821,Corrientes-0.01-0.02-USD-NoLimitHoldem-Pacific...,mikvell,John_Efapi,"[(c, 0.01), (k, 0)]","[(b, 0.04), (c, 0.04)]","[(k, 0), (b, 0.08), (f, 0)]",,"[9h, 6h, 7h]",Ks,,3.36,1.90,,,False
520805,553224834,Corrientes-0.01-0.02-USD-NoLimitHoldem-Pacific...,John_Efapi,mikvell,"[(r, 0.05), (c, 0.04)]","[(k, 0), (k, 0)]","[(k, 0), (k, 0)]","[(k, 0), (k, 0)]","[9d, Qc, 5d]",7d,Ac,3.42,1.84,"[ 3d, 3s ]","[ 4h, Ah ]",False


In [43]:
# Persist the cleaned dataframe as a pickle file.
# NOTE(review): this relative path writes to the notebook's working directory, whereas the
# input was read from ../dataframe/ — confirm this output location is intended.
df_without_bugs.to_pickle('poker_dataframe_bugfixed.pkl')