In [1]:
!pip install datasets pandas

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting pandas
  Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-20.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux201

In [6]:
from datasets import load_dataset
import pandas as pd

# Load the dataset (a DatasetDict keyed by split name)
dataset = load_dataset("SALT-NLP/CultureBank")

# Let's examine both splits
print("Available splits:", dataset.keys())

# Let's look at the first row of each split
print("\nFirst row of TikTok data:")
print(dataset['tiktok'][0])
print("\nFirst row of Reddit data:")
print(dataset['reddit'][0])

# Maximum number of leading rows kept from each split
SUBSET_SIZE = 1000

# Create subsets of up to SUBSET_SIZE rows from each split.
# The row count is clamped to the split length so that a split shorter than
# SUBSET_SIZE yields all of its rows instead of raising IndexError.
tiktok_subset = dataset['tiktok'].select(range(min(SUBSET_SIZE, len(dataset['tiktok']))))
reddit_subset = dataset['reddit'].select(range(min(SUBSET_SIZE, len(dataset['reddit']))))

# Convert to pandas DataFrames using the dataset's own converter rather than
# iterating the Dataset row by row through the DataFrame constructor
tiktok_df = tiktok_subset.to_pandas()
reddit_df = reddit_subset.to_pandas()

# Save the subsets (written to the current working directory)
tiktok_df.to_csv('tiktok_subset.csv', index=False)
reddit_df.to_csv('reddit_subset.csv', index=False)

# Print some basic information
print("\nTikTok subset shape:", tiktok_df.shape)
print("Reddit subset shape:", reddit_df.shape)
print("\nTikTok columns:", tiktok_df.columns.tolist())
print("Reddit columns:", reddit_df.columns.tolist())

Available splits: dict_keys(['tiktok', 'reddit'])

First row of TikTok data:
{'cultural group': 'American', 'context': 'in public', 'goal': None, 'relation': None, 'actor': 'people', 'actor_behavior': 'dress casually, often in comfortable clothing, with a preference for sweatpants and following dress codes', 'recipient': None, 'recipient_behavior': None, 'other_descriptions': None, 'topic': 'Dress Codes', 'agreement': 0.9, 'num_support_bin': '[1060, 1070)', 'time_range': "{2021: '[180, 190)', 2022: '[600, 610)', 2023: '[270, 280)'}", 'eval_whole_desc': 'In public settings within American culture, it is common for people to dress casually, often opting for comfortable clothing such as sweatpants while still adhering to dress codes. This relaxed approach to attire is widely regarded as the norm by a significant portion of the sampled population. It reflects a preference for comfort and practicality in daily dress, showcasing a relaxed and informal attitude towards clothing choices in var

In [8]:
# Function to display data info in a cleaner way
def display_data_info(df, name):
    """Print a compact text summary of a DataFrame.

    Prints a banner containing ``name``, the frame's shape, one line per
    column name, and the values of the first row. String values longer than
    100 characters are cut to their first 100 characters followed by "...".

    Args:
        df: pandas DataFrame to summarise.
        name: Label shown in the banner line.

    Returns:
        None. All output goes to stdout.

    An empty frame (zero rows) prints a placeholder for the first-row section
    instead of raising IndexError.
    """
    max_chars = 100
    banner = '=' * 50

    print(f"\n{banner}")
    print(f"{name} DATASET INFO")
    print(f"{banner}")
    print(f"Shape: {df.shape}")

    print("\nColumns:")
    for col in df.columns:
        print(f"- {col}")

    print("\nFirst row sample:")
    if len(df) == 0:
        # Nothing to sample; df.iloc[0] would raise IndexError here.
        print("(no rows)")
        return

    # Materialise the first row once instead of once (or more) per column.
    first_row = df.iloc[0]
    for col, value in first_row.items():
        if isinstance(value, str) and len(value) > max_chars:
            print(f"{col}: {value[:max_chars]}...")
        else:
            print(f"{col}: {value}")

# Summarise each subset in turn, TikTok first and Reddit second
for label, frame in (("TIKTOK", tiktok_df), ("REDDIT", reddit_df)):
    display_data_info(frame, label)


TIKTOK DATASET INFO
Shape: (1000, 17)

Columns:
- cultural group
- context
- goal
- relation
- actor
- actor_behavior
- recipient
- recipient_behavior
- other_descriptions
- topic
- agreement
- num_support_bin
- time_range
- eval_whole_desc
- eval_scenario
- eval_persona
- eval_question

First row sample:
cultural group: American
context: in public
goal: None
relation: None
actor: people
actor_behavior: dress casually, often in comfortable clothing, with a preference for sweatpants and following dress ...
recipient: None
recipient_behavior: None
other_descriptions: None
topic: Dress Codes
agreement: 0.9
num_support_bin: [1060, 1070)
time_range: {2021: '[180, 190)', 2022: '[600, 610)', 2023: '[270, 280)'}
eval_whole_desc: In public settings within American culture, it is common for people to dress casually, often opting ...
eval_scenario: Travel Advising
eval_persona: A business professional from a formal corporate background, planning a first-time trip to the United...
eval_question: 

In [12]:
import pandas as pd

# Reload the TikTok subset that was saved to disk earlier
tiktok_df = pd.read_csv('tiktok_subset.csv')
row_count = len(tiktok_df)

# Start from the two source columns, renamed to the target schema:
#   eval_scenario   -> submission_title
#   eval_whole_desc -> comment_content
new_df = tiktok_df[['eval_scenario', 'eval_whole_desc']].rename(
    columns={
        'eval_scenario': 'submission_title',
        'eval_whole_desc': 'comment_content',
    }
)

# Prepend the synthetic columns: sequential IDs and a dummy (zero) timestamp
new_df.insert(0, 'vid', range(row_count))
new_df.insert(1, 'comment_utc', [0] * row_count)

# Write the reformatted frame next to the input file
new_df.to_csv('tiktok_formatted.csv', index=False)

print("New DataFrame structure:")
print(new_df.head())

New DataFrame structure:
   vid  comment_utc submission_title  \
0    0            0  Travel Advising   
1    1            0  Travel Advising   
2    2            0  Travel Advising   
3    3            0  Travel Advising   
4    4            0  Travel Advising   

                                     comment_content  
0  In public settings within American culture, it...  
1  When Americans live or move to Europe, it is c...  
2  In American restaurants, it is customary for c...  
3  In the United States, tipping is a deeply ingr...  
4  In American homes and some public settings, it...  
