In [2]:
import numpy as np
import pandas as pd
import ast
from itertools import combinations
import matplotlib.pyplot as plt
import requests

#### Step 1: GitHub API and CSV Files:

In [2]:
# GitHub contents-API URL for the repository's Data directory
url = 'https://api.github.com/repos/Razelbaz1/Primacy-and-Recency-Bias/contents/Data'

# Query the API. The timeout keeps the cell from hanging on a stalled connection,
# and raise_for_status() fails fast on an HTTP error (4xx/5xx) instead of letting
# an error payload reach the list comprehension below.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.json()

# Keep the download link of every listed entry whose name ends in .csv
csv_files = [entry['download_url'] for entry in data if entry['name'].endswith('.csv')]


In [3]:
# Echo the response object so its HTTP status shows in the cell output
response

<Response [200]>

#### Step 2: Define Helper Functions

- `parse_list_rank(row)`: Converts the string representation of a list of tuples into an actual list of tuples.
- `extract_numeric(group_id)`: Extracts the numeric part from a group ID (e.g., `a3` -> `3`).
- `calculate_lag(group1, group2)`: Computes the absolute difference between the numeric parts of two group IDs.

In [3]:
# Function to parse list_rank column
def parse_list_rank(row):
    """Convert the text form of a ranking into the Python object it spells out.

    Parameters
    ----------
    row : str
        Python-literal text, e.g. "[('a1', 3), ('a2', 5)]".

    Returns
    -------
    object
        The value of the literal, as produced by ``ast.literal_eval``.
    """
    parsed = ast.literal_eval(row)
    return parsed

# Function to extract numeric part from group ID
def extract_numeric(group_id):
    """Return the integer that follows the first character of a group ID.

    Parameters
    ----------
    group_id : str
        Identifier such as ``'a3'``; everything after the first character
        is passed to ``int``.

    Returns
    -------
    int
        The parsed number, e.g. ``3`` for ``'a3'``.
    """
    digits = group_id[1:]
    return int(digits)

# Function to calculate lag values
def calculate_lag(group1, group2):
    """Return the absolute distance between the numbers of two group IDs.

    Parameters
    ----------
    group1, group2 : str
        Group identifiers accepted by ``extract_numeric`` (e.g. ``'a3'``).

    Returns
    -------
    int
        ``|n1 - n2|`` where ``n1`` and ``n2`` are the extracted numbers.
    """
    first = extract_numeric(group1)
    second = extract_numeric(group2)
    return abs(first - second)

#### Step 3: Process Each CSV File

Initialize an empty list `all_combined_dfs` to store the processed DataFrames.

For each CSV file:
1. **Load the CSV File:** Load the CSV file into a DataFrame.
2. **Parse the `list_rank` Column:** Convert the string representation of the list of tuples into an actual list of tuples.
3. **Extract Participant ID and Rankings:** Iterate through each row of the data, extract the `username` and `parsed_rank` information, and create a list of dictionaries with `username`, `group`, and `score`.
4. **Create a DataFrame from the Parsed Data:** Convert the list of dictionaries into a DataFrame `parsed_df`.

In [6]:
# Collects one aggregated DataFrame per session
all_combined_dfs = []

# NOTE(review): this loop body ends with the creation of parsed_df. The later
# cells (lag calculation, bias flags, aggregation, append to all_combined_dfs)
# are written at top level, so as laid out they run once, on the parsed_df and
# session_id left by the LAST file. If every session should be processed, those
# steps need to sit inside this loop - confirm.
for session_id, csv_url in enumerate(csv_files, start=1):
    # Read this session's CSV from its download link
    data = pd.read_csv(csv_url)

    # Turn each list_rank string into the list of (group, score) tuples it encodes
    data['parsed_rank'] = data['list_rank'].apply(parse_list_rank)

    # Flatten to long format: one record per (participant, group, score)
    parsed_data = [
        {'username': username, 'group': group, 'score': score}
        for username, ranks in zip(data['username'], data['parsed_rank'])
        for group, score in ranks
    ]

    # Tabulate the flattened records
    parsed_df = pd.DataFrame(parsed_data)

#### Step 4: Calculate Lag Values and Determine Bias

Iterate through each `username` and group the data by `score`. For each score group:

1. **Get All Combinations of Pairs:** Generate all possible pairs of groups with the same score.
2. **Calculate Lag Values:** Calculate the lag values between each pair.
3. **Determine Primacy or Recency Bias:** Label the pair `recency` when the first group's number is greater than the second's, and `primacy` otherwise.
4. **Append Results:** Append the results to the `lag_data` list.

Finally, create a DataFrame `lag_df` from the `lag_data` list.

In [7]:
# Find pairs of groups that one participant gave the same score, and record
# the lag between them together with a primacy/recency label
lag_data = []

for username, group_df in parsed_df.groupby('username'):
    for score, score_df in group_df.groupby('score'):
        # Every unordered pair of rows sharing this score, in row order
        for row1, row2 in combinations(score_df.itertuples(index=False), 2):
            # First group's number greater than the second's -> 'recency';
            # otherwise (including equal numbers) -> 'primacy'
            first_is_later = extract_numeric(row1.group) > extract_numeric(row2.group)

            lag_data.append({
                'username': username,
                'group1': row1.group,
                'group2': row2.group,
                'score': score,
                'lag': calculate_lag(row1.group, row2.group),
                'bias': 'recency' if first_is_later else 'primacy',
            })

# Create a DataFrame for the lag data. The columns are named explicitly so the
# frame keeps its schema when no same-score pair was found; without them an
# empty lag_data yields a column-less frame and the later lag_df['bias']
# lookup raises KeyError.
lag_df = pd.DataFrame(
    lag_data,
    columns=['username', 'group1', 'group2', 'score', 'lag', 'bias'],
)

#### Step 5: Split Bias Column and Add session_id

Split the `bias` column into two separate columns, `primacy` and `recency`, and set the values to 1 or 0 accordingly.

Create a final DataFrame `final_df` with the relevant columns: 

`username`, `score`, `lag`, `primacy`, and `recency`. 

Then add a `session_id` column to the DataFrame, identifying the session (CSV file) the rows belong to.

In [8]:
# Turn the bias label into two 0/1 indicator columns on lag_df
lag_df['primacy'] = (lag_df['bias'] == 'primacy').astype('int64')
lag_df['recency'] = (lag_df['bias'] == 'recency').astype('int64')

# Final table structure: a new frame with the reporting columns, every row
# tagged with the session.
# NOTE(review): session_id here is the value left by the CSV loop's last
# iteration - confirm that is the intended session.
final_df = lag_df[['username', 'score', 'lag', 'primacy', 'recency']].assign(
    session_id=session_id
)

#### Step 6: Group and Aggregate Data

Group the final DataFrame by `username`, `score`, `lag`, and `session_id`.

Aggregate the `primacy` and `recency` columns by summing their values. 

Then, reset the index of the resulting DataFrame and append it to the `all_combined_dfs` list.

In [9]:
# Sum the primacy and recency flags within each
# (username, score, lag, session_id) combination, then turn the group keys
# back into ordinary columns
combined_df = (
    final_df
    .groupby(['username', 'score', 'lag', 'session_id'])[['primacy', 'recency']]
    .sum()
    .reset_index()
)

# Store this aggregated table for the final concatenation
all_combined_dfs.append(combined_df)

#### Step 7: Concatenate All Combined DataFrames

Concatenate all DataFrames stored in the `all_combined_dfs` list into a single DataFrame `final_combined_df`,
ignoring the index. Finally, we check the shape of the final combined DataFrame.

In [10]:
# Stack every aggregated table row-wise into one frame with a fresh 0..n-1 index
final_combined_df = pd.concat(
    objs=all_combined_dfs,
    axis=0,
    ignore_index=True,
)

In [11]:
# Show the (rows, columns) shape of the concatenated result
final_combined_df.shape

(298, 6)