This notebook applies analyses from Antoniak et al. (2021) to our dataset. 

Antoniak, M., Walsh, M., & Mimno, D. (2021). Tags, Borders, and Catalogs: Social Re-Working of Genre on LibraryThing. Proc. ACM Hum.-Comput. Interact., 5(CSCW1), 29:1-29:29. https://doi.org/10.1145/3449103


1. Ask GPT-4o to select genre-related tags and combine the tags based on expression similarities;
   - Saved in "../Data/Tags/GPT-4_Based_Combined_Tags_DataFrame.csv"
2. Human inspection and further merge the tags **UNFINISHED**;
   - Saved in "../Data/Tags/tag_merged.csv"
   - "Tag" is the tag name that represent the row of tags, and "Combined_Tags" are the original tags that are merged together. All "Tag(s)" are picked from "Combined_Tags". 
3. Calculate frequency of each Tag based on tag frequency in tag_counter.csv
4. Calculate each Tag's mean rating and mean number of ratings
   - Mean rating: The mean rating of the reviews that tagged Combined_Tags;
   - Mean number of ratings: The mean of reviews that are tagged with Combined_Tags;

In [29]:
import pandas as pd
import os
import ast

In [72]:
mergedPath = "../Data/Tags/tag_merged.csv"
merged = pd.read_csv(mergedPath)
merged["Combined_Tags"] = merged["Combined_Tags"].str.split(", ")
merged

Unnamed: 0,Tag,Combined_Tags
0,graphic-novels,"[graphic-novels, graphic-novel, graphic-novels..."
1,graphic-novel-comics,"[graphic-novels-manga, graphic-novel-comics, f..."
2,adult-graphic-novels,"[adult-graphic-novels, graphic-novel-adult, gr..."
3,ya-or-adult-graphic-novels,"[adult-teen-graphic-novels, ya-or-adult-graphi..."
4,kids-graphic-novels,"[kids-graphic-novels, graphic-novel-kids, chil..."
...,...,...
75,format-graphic,"[format-graphic, 1format-graphic, form-graphic]"
76,cartoon-graphic-novels,"[cartoon-graphic-novel, cartoon-graphic-novels]"
77,autobiographic-novels,[autobiographic-novels]
78,literary-fiction,[literary-fiction]


In [45]:
tagCounterPath = "../Data/Tags/tag_counter.csv"
tagCounter = pd.read_csv(tagCounterPath)

In [73]:
# Remove duplicates in the Combined_Tags column
def remove_duplicates(tags_list):
    return list(set(tags_list))

# Apply the function to the Combined_Tags column
merged['Combined_Tags'] = merged['Combined_Tags'].apply(remove_duplicates)

In [74]:
# Count frequency of each Tag

# Initialize the Frequency column to 0
merged["Frequency"] = 0

# Iterate over each row in the merged DataFrame
for idx, row in merged.iterrows():
    freq = 0
    # Iterate over each tag in the Combined_Tags list
    for tag in row["Combined_Tags"]:
        # Sum the frequencies of the tags from the tagCounter DataFrame
        tag_freq = tagCounter[tagCounter["tag"] == tag]["frequency"]
        if not tag_freq.empty:
            freq += tag_freq.values[0]
    # Update the Frequency column for the current row
    merged.at[idx, "Frequency"] = freq

In [75]:
merged.sort_values(by="Frequency", ascending=False, inplace=True)
merged

Unnamed: 0,Tag,Combined_Tags,Frequency
0,graphic-novels,"[graphic-novel-2021, graphic-novel-collection,...",5692
10,non-fiction,"[2-non-fiction, mg-nonfiction, nonfiction-read...",2124
9,biography,"[biographical-fiction, nf-biography, biography...",1523
11,history,"[aa-history, r-nf-history, history-general, nf...",1076
1,graphic-novel-comics,"[comics-manga-and-graphic-novels, graphic-nove...",1016
...,...,...,...
30,lgbt-nonfiction,[glbt-nonfiction],1
8,great-graphic-novels,[great-graphic-novels],1
42,labor-history,[labor-history],1
50,graphic-art,[graphic-art],1


In [76]:
merged.to_csv("../Data/Tags/tag_merged.csv", index=False)

In [100]:
import os
import pandas as pd
from collections import defaultdict
import ast

merged = pd.read_csv("../Data/Tags/tag_merged.csv")

reviewsPath = "../Data/Reviews_Scraped/En/"
reviewFiles = os.listdir(reviewsPath)

# Preload all reviews into memory
all_reviews = []

for file in reviewFiles:
    df = pd.read_csv(os.path.join(reviewsPath, file))
    
    # Ensure rating is a numeric value, convert non-numeric ratings to NaN
    df["rating"] = pd.to_numeric(df["rating"], errors="coerce")
    
    # Ensure shelves column is a list
    df["shelves"] = df["shelves"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    
    all_reviews.append(df)

# Combine all reviews into a single DataFrame with new global indexing
all_reviews_df = pd.concat(all_reviews, ignore_index=True)

# Filter all_reviews_df to only include reviews with valid ratings to avoid multiple filtering operations
valid_reviews_df = all_reviews_df[all_reviews_df["rating"].notna()]

# Create a reverse index for tags using the new global index
tag_index = defaultdict(set)  # Key: tag, Value: set of indices of reviews containing the tag

for idx, row in all_reviews_df.iterrows():
    if isinstance(row["shelves"], list):
        for tag in row["shelves"]:
            tag_index[tag].add(idx)

# 确保 Combined_Tags 解析为列表类型
def parse_combined_tags(x):
    if isinstance(x, str):
        try:
            return ast.literal_eval(x)  # Safely parse the string representation of a list
        except (ValueError, SyntaxError):
            print(f"Error parsing Combined_Tags: {x}")
            return []
    return x if isinstance(x, list) else []

merged["Combined_Tags"] = merged["Combined_Tags"].apply(parse_combined_tags)

# Initiate the Number_of_Ratings and Mean_Rating columns
merged["Number_of_Ratings"] = 0
merged["Mean_Rating"] = 0

# Iterate over each row in the merged DataFrame
for idx, row in merged.iterrows():
    unique_reviews = set()  # Set to track unique reviews that have been counted
    num_ratings = 0
    total_rating = 0
    print(f"Processing row {idx} with tag {row['Tag']}")

    # Extract all tags from the Combined_Tags column
    combined_tags = row["Combined_Tags"]

    # make sure combined_tags is a list
    if not isinstance(combined_tags, list):
        print(f"Warning: Combined_Tags at row {idx} is not a list, skipping...")
        continue
    
    # Find all reviews that contain any of the tags in the combined_tags list by using the tag_index
    for tag in combined_tags:
        if tag in tag_index:
            # Get the indices of reviews that contain the tag by using the tag_index
            matching_indices = tag_index[tag]
            
            # Add these indices to the set of unique reviews
            unique_reviews.update(matching_indices)

    # Debug print: Check unique reviews set size and content
    print(f"Unique reviews for row {idx}: {len(unique_reviews)} reviews.")

    # Filter out the unique reviews from all_reviews_df, then filter for valid ratings
    filtered_reviews = all_reviews_df.loc[list(unique_reviews)]
    valid_filtered_reviews = filtered_reviews[filtered_reviews["rating"].notna()]

    # 计算评分和数量
    num_ratings = len(valid_filtered_reviews)
    total_rating = valid_filtered_reviews["rating"].sum()

    if num_ratings != 0:
        merged.at[idx, "Number_of_Ratings"] = num_ratings
        merged.at[idx, "Mean_Rating"] = total_rating / num_ratings
        print(f"  Number of Ratings: {merged.at[idx, 'Number_of_Ratings']}")
        print(f"  Mean Rating: {merged.at[idx, 'Mean_Rating']}")
    else:
        print(f"  No ratings found for row {idx}")

# Display the final merged DataFrame for verification
merged


Processing row 0 with tag graphic-novels
Unique reviews for row 0: 5685 reviews.
  Number of Ratings: 5558
  Mean Rating: 4.1214465635120545
Processing row 1 with tag non-fiction
Unique reviews for row 1: 2121 reviews.
  Number of Ratings: 2052
  Mean Rating: 4.110623781676413
Processing row 2 with tag biography
Unique reviews for row 2: 1514 reviews.
  Number of Ratings: 1492
  Mean Rating: 4.02680965147453
Processing row 3 with tag history
Unique reviews for row 3: 1076 reviews.
  Number of Ratings: 1057
  Mean Rating: 4.276253547776727
Processing row 4 with tag graphic-novel-comics
Unique reviews for row 4: 1014 reviews.
  Number of Ratings: 988
  Mean Rating: 4.075910931174089
Processing row 5 with tag memoir
Unique reviews for row 5: 638 reviews.
  Number of Ratings: 625
  Mean Rating: 4.408
Processing row 6 with tag biography-memoirs
Unique reviews for row 6: 527 reviews.
  Number of Ratings: 510
  Mean Rating: 4.1117647058823525
Processing row 7 with tag graphic
Unique reviews f

  merged.at[idx, "Mean_Rating"] = total_rating / num_ratings


Unnamed: 0,Tag,Combined_Tags,Frequency,Number_of_Ratings,Mean_Rating,Mean_Number_of_Ratings
0,graphic-novels,"[graphic-novel-2021, graphic-novel-collection,...",5692,5558,4.121447,26.46
1,non-fiction,"[2-non-fiction, mg-nonfiction, nonfiction-read...",2124,2052,4.110624,19.42
2,biography,"[biographical-fiction, nf-biography, biography...",1523,1492,4.026810,69.10
3,history,"[aa-history, r-nf-history, history-general, nf...",1076,1057,4.276254,53.21
4,graphic-novel-comics,"[comics-manga-and-graphic-novels, graphic-nove...",1016,988,4.075911,4.98
...,...,...,...,...,...,...
75,lgbt-nonfiction,[glbt-nonfiction],1,1,5.000000,1.00
76,great-graphic-novels,[great-graphic-novels],1,0,0.000000,1.00
77,labor-history,[labor-history],1,1,3.000000,1.00
78,graphic-art,[graphic-art],1,1,5.000000,1.00


In [101]:
merged["Mean_Number_of_Ratings"] = (merged["Number_of_Ratings"] / merged["Combined_Tags"].apply(len)).round(2)
merged["Mean_Rating"] = merged["Mean_Rating"].round(2)
merged

Unnamed: 0,Tag,Combined_Tags,Frequency,Number_of_Ratings,Mean_Rating,Mean_Number_of_Ratings
0,graphic-novels,"[graphic-novel-2021, graphic-novel-collection,...",5692,5558,4.12,61.76
1,non-fiction,"[2-non-fiction, mg-nonfiction, nonfiction-read...",2124,2052,4.11,31.57
2,biography,"[biographical-fiction, nf-biography, biography...",1523,1492,4.03,149.20
3,history,"[aa-history, r-nf-history, history-general, nf...",1076,1057,4.28,75.50
4,graphic-novel-comics,"[comics-manga-and-graphic-novels, graphic-nove...",1016,988,4.08,7.72
...,...,...,...,...,...,...
75,lgbt-nonfiction,[glbt-nonfiction],1,1,5.00,1.00
76,great-graphic-novels,[great-graphic-novels],1,0,0.00,0.00
77,labor-history,[labor-history],1,1,3.00,1.00
78,graphic-art,[graphic-art],1,1,5.00,1.00


In [102]:
merged.to_csv("../Data/Tags/tag_merged.csv", index=False)