### Cleaning Variable Names & Dropping Irrelevant Variables

In [None]:
import os

import pandas as pd

In [None]:
# Location of the wide-format CSV export. The path can be overridden by
# setting the CPRS_DATA_FILE environment variable, so the notebook also runs
# on machines where the original author's home directory does not exist;
# when the variable is unset the original absolute path is used unchanged.
file_path = os.environ.get(
    "CPRS_DATA_FILE",
    r"/Users/patriciasuratno/Documents/epp_project/ClimateRiskResourceBehavior/src/cprs/data/all_apps_wide_2024-01-18.csv",
)
df = pd.read_csv(file_path)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [None]:
# Replace every "." in the column names with "_" so later cells can refer to
# the underscore-separated names. One rename call with a callable handles all
# columns at once; the previous loop rebuilt the frame once per column.
df = df.rename(columns=lambda name: name.replace(".", "_"))

In [None]:
# PLAYER_NUM is a 1-based running number: the row index plus one.
df = df.assign(PLAYER_NUM=df.index + 1)

In [None]:
# Derive LAB_SESSION from PLAYER_NUM using right-closed bins:
# 1-24 -> 1, 25-48 -> 2, 49 and above -> 3.
session_edges = [0, 24, 48, float("inf")]
session_codes = [1, 2, 3]
df["LAB_SESSION"] = pd.cut(df["PLAYER_NUM"], bins=session_edges, labels=session_codes)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [None]:
# Print every column name currently present in the frame.
list_of_columns = list(df.columns)
print(list_of_columns)

In [None]:
# GROUPID_ALL starts as the group id stored in
# "CS2_Forest_5_group_id_in_subsession" and is then shifted per lab session:
# +8 for session 2 and +16 for session 3.
# NOTE(review): presumably each session has 8 groups, so the offsets make the
# ids unique across sessions — confirm.
df["GROUPID_ALL"] = df["CS2_Forest_5_group_id_in_subsession"]
for session_code, id_offset in ((2, 8), (3, 16)):
    in_session = df["LAB_SESSION"] == session_code
    df.loc[in_session, "GROUPID_ALL"] += id_offset

In [None]:
# Text labels for the numeric lab-session codes 1, 2 and 3.
lab_session_labels = {code: f"Lab Session {code}" for code in (1, 2, 3)}

# Text labels for the group ids 1 through 24.
groupid_all_labels = {code: f"Group {code}" for code in range(1, 25)}

In [None]:
# Swap the numeric codes in both columns for their text labels.
# After this cell LAB_SESSION no longer holds the numbers 1-3, so the cell
# that compares LAB_SESSION with 2 and 3 must have run before this one.
for column_name, label_map in (
    ("LAB_SESSION", lab_session_labels),
    ("GROUPID_ALL", groupid_all_labels),
):
    df[column_name] = df[column_name].replace(label_map)

In [None]:
# Give the three derived columns descriptive, human-readable names.
descriptive_names = {
    "PLAYER_NUM": "player's number in order of lab session",
    "LAB_SESSION": "Lab Session Number 1, 2 or 3",
    "GROUPID_ALL": "group id in all lab session",
}
df = df.rename(columns=descriptive_names)

In [None]:
# Move the three descriptive columns to the front; all other columns keep
# their current relative order.
leading_columns = [
    "player's number in order of lab session",
    "Lab Session Number 1, 2 or 3",
    "group id in all lab session",
]
remaining_columns = [name for name in df.columns if name not in leading_columns]
df = df[leading_columns + remaining_columns]

In [None]:
# Shorten participant-level column names: "participant_<name>" -> "p_<name>".
# The stripped prefix includes the trailing underscore (12 characters). Its
# length is taken with len() instead of a hard-coded slice index, and the
# match is anchored on the full prefix so that a column merely beginning with
# "participant" (no underscore) is left alone rather than losing a character.
participant_prefix = "participant_"
participant_vars = [col for col in df.columns if col.startswith(participant_prefix)]
# A single rename call handles all matches instead of one call per column.
df = df.rename(
    columns={var: "p_" + var[len(participant_prefix):] for var in participant_vars},
)

# Participant-level columns removed from the frame.
columns_to_drop = [
    "p__is_bot",
    "p__index_in_pages",
    "p__max_page_index",
    "p__current_app_name",
    "p__current_page_name",
    "p_mturk_worker_id",
    "p_mturk_assignment_id",
    "p_group_id",
    "p_covid_okay",
    "p_ceiling_group_take",
    "p_take_ceiling",
]
# errors="ignore": names that are not present in the frame are skipped
# silently, so a misspelled entry will not raise.
df = df.drop(columns=columns_to_drop, errors="ignore")

In [None]:
# Shorten session-level column names: "session_<name>" -> "s_<name>".
# The stripped prefix includes the trailing underscore (8 characters). Its
# length is taken with len() instead of a hard-coded slice index, and the
# match is anchored on the full prefix so that a column merely beginning with
# "session" (no underscore) is left alone rather than losing a character.
session_prefix = "session_"
session_vars = [col for col in df.columns if col.startswith(session_prefix)]
# A single rename call handles all matches instead of one call per column.
df = df.rename(
    columns={var: "s_" + var[len(session_prefix):] for var in session_vars},
)

# Session-level columns removed from the frame.
# NOTE(review): "s_configreal_world_currency" and "s_configparticipation_fee"
# have no underscore after "config". If the source columns are named
# "session_config_<name>", these two entries never match and are skipped
# silently because of errors="ignore" — confirm against df.columns.
columns_to_drop = [
    "s_label",
    "s_mturk_hitid",
    "s_mturk_hitgroupid",
    "s_comment",
    "s_is_demo",
    "s_configreal_world_currency",
    "s_configparticipation_fee",
    "s_is_shock_group",
    "s_high_probability",
    "s_shock_probability_high",
    "s_shock_probability_low",
    "s_group_id",
    "s_ceiling_group_take",
]

# errors="ignore": names that are not present in the frame are skipped.
df = df.drop(columns=columns_to_drop, errors="ignore")

In [None]:
# CS1 Intro: rename and drop columns.
# NOTE(review): every name in this cell is lower-case without separators
# (e.g. "cs1_intro1player..."), whereas an earlier cell reads the column
# "CS2_Forest_5_group_id_in_subsession". If the CS1 columns follow that same
# capitalised, underscore-separated pattern, nothing here matches and the
# whole cell silently does nothing — confirm against df.columns.

# 1) The player's id within the group becomes "player_cubicle".
#    pandas has no Stata-style variable labels; the intended label is
#    "Player's cubicle number in Lab Session 1".
df = df.rename(columns={"cs1_intro1playerid_in_group": "player_cubicle"})

# 2) Remove the listed CS1 intro columns; names not in the frame are skipped.
columns_to_drop = [
    "cs1_intro1playerrole",
    "cs1_intro1playercode",
    "cs1_intro1playerpayoff",
    "cs1_intro1groupid_in_subsession",
    "cs1_intro1subsessionround_number",
]
df = df.drop(columns=columns_to_drop, errors="ignore")

# 3) "cs1_intro1player<name>" -> "cs1_<name>" (the prefix is 16 characters).
cs1_intro1player_vars = [
    name for name in df.columns if name.startswith("cs1_intro1player")
]
df = df.rename(columns={name: "cs1_" + name[16:] for name in cs1_intro1player_vars})

# 4) "cs1_compr<rest>" -> "cs2_compr<rest>". This runs after step 3, so it
#    also picks up names that step 3 has just produced.
cs1_compr_vars = [name for name in df.columns if name.startswith("cs1_compr")]
df = df.rename(columns={name: "cs2_compr" + name[9:] for name in cs1_compr_vars})

# 5) Rename the failed-attempts counter.
df = df.rename(columns={"cs1_num_failed_attem": "failed_attem1"})

In [None]:
# CS2 Forest: "cs2_forest<j><rest>" -> "cs2_<j><rest>" for j = 1..5
# (only the 6 characters "forest" are removed; the number is kept).
# NOTE(review): an earlier cell reads "CS2_Forest_5_group_id_in_subsession".
# If all CS2 columns are capitalised and underscore-separated like that one,
# the lower-case prefix below matches nothing — confirm against df.columns.
for number in range(1, 6):
    prefix = f"cs2_forest{number}"
    renames = {
        name: "cs2_" + name[10:] for name in df.columns if name.startswith(prefix)
    }
    df = df.rename(columns=renames)

In [None]:
# For the numbers 1 to 4, remove the listed "cs2_<i>..." columns.
# All names are collected first and dropped in one call; names that are not
# present in the frame are skipped (errors="ignore").
unused_suffixes = (
    "groupid_in_subsession",
    "playerrole",
    "playerid_in_group",
    "playerpayoff",
    "playerstage_points",
    "playerpotential_payof",
)
drop_vars = [f"cs2_{i}{suffix}" for i in range(1, 5) for suffix in unused_suffixes]
df = df.drop(columns=drop_vars, errors="ignore")

In [None]:
# Two more columns to remove for number 5; absent names are skipped.
extra_drop_columns = ["cs2_5playerrole", "cs2_5playerpayoff"]
df = df.drop(columns=extra_drop_columns, errors="ignore")

In [None]:
# Rename the group id and the within-group player id for number 5.
id_renames = {
    "cs2_5groupid_in_subsession": "groupid1",
    "cs2_5playerid_in_group": "memberid1",
}
df = df.rename(columns=id_renames)

In [None]:
# Optional: variable labels are kept as comments only (documentation).

# Column order: the two id columns, then every column whose name starts with
# "cs2_1playertake".
# NOTE(review): this cell only builds the list; it does not reorder df.
take_columns = [name for name in df.columns if name.startswith("cs2_1playertake")]
order_columns = ["groupid1", "memberid1", *take_columns]

In [None]:
# Print every column name left after the renames and drops above.
list_of_columns = list(df.columns)
print(list_of_columns)

In [None]:
# Preview the first rows of the cleaned frame instead of rendering all of it.
df.head()