<a href="https://colab.research.google.com/github/RyuichiSaito1/inflation-reddit-usa/blob/main/src/create_csv_for_mtruk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Attach Google Drive to the Colab runtime; the data paths used in the
# following cells all live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import re

# Step 1: Load 'mturk_qualification.tsv' and shuffle
file_path = '/content/drive/MyDrive/world-inflation/data/reddit/experiment/training-main-1020.tsv'
try:
    text_df = pd.read_csv(file_path, sep='\t')
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Please check the path.")
    exit()

text_df = text_df.sample(frac=1, random_state=42)

# Step 2: Remove excess whitespace from the 'body' column
print("Cleaning whitespace from the 'body' column...")
text_df['body'] = text_df['body'].astype(str).apply(lambda x: re.sub(r'\s+', ' ', x.strip()))
print("Whitespace cleaning complete.")


# Step 3: Save the cleaned and shuffled DataFrame
output_file_path = '/content/drive/MyDrive/world-inflation/data/reddit/experiment/mturk/training-main-1020_shuffled.tsv'
try:
    text_df.to_csv(output_file_path, sep='\t', index=False)
    print(f"DataFrame was successfully saved to '{output_file_path}'")
except Exception as e:
    print(f"Error saving the file: {e}")

# Step 4: Extract the 'body' column and rename it as 'text' for further processing
output_df = text_df[['body']].rename(columns={'body': 'text'})

# Step 5: Divide into the top 200 records and the rest
if len(output_df) < 200:
    print("Warning: The total number of records is less than 200.")
    print("All records will be saved to the 'top 200' file, and the 'rest' file will be empty.")
    top_200_df = output_df.copy()
    rest_df = pd.DataFrame(columns=output_df.columns) # Empty dataframe with same columns
else:
    top_200_df = output_df.head(200)
    rest_df = output_df.iloc[200:]

# Step 6: Store each file
top_200_file_path = '/content/drive/MyDrive/world-inflation/data/reddit/experiment/mturk/training-main-top200.csv'
rest_file_path = '/content/drive/MyDrive/world-inflation/data/reddit/experiment/mturk/training-main-rest.csv'

top_200_df.to_csv(top_200_file_path, index=False)
rest_df.to_csv(rest_file_path, index=False)

# Step 7: Print information about the created files
print("Cleaned Data Information:")
print(f"First 5 rows of the top 200 records (from '{top_200_file_path}'):")
print(top_200_df.head(5))
print(f"Total number of rows in top 200 records file: {len(top_200_df)}")

print("\nFirst 5 rows of the rest of the records (from '{rest_file_path}'):")
if not rest_df.empty:
    print(rest_df.head(5))
else:
    print("No remaining records.")
print(f"Total number of rows in the rest of the records file: {len(rest_df)}")

print(f"\nTotal number of rows processed: {len(output_df)}")
print(f" Processing completed. Files created: \n   1. '{top_200_file_path}' \n   2. '{rest_file_path}'")

🧹 Cleaning whitespace from the 'body' column...
✅ Whitespace cleaning complete.
✅ DataFrame was successfully saved to '/content/drive/MyDrive/world-inflation/data/reddit/experiment/mturk/training-main-1020_shuffled.tsv'

📊 Cleaned Data Information:
First 5 rows of the top 200 records (from '/content/drive/MyDrive/world-inflation/data/reddit/experiment/mturk/training-main-top200.csv'):
                                                  text
523  porsche 944. 24-36mpg, way less complex system...
602  The other day when I was parking my '97 Black ...
526  I was originally going to use the 2.8 and driv...
31   Definitely pay for the lawyer's services. Make...
616  Hahaha. It's the salt, butter, and preservativ...
Total number of rows in top 200 records file: 200

First 5 rows of the rest of the records (from '{rest_file_path}'):
                                                  text
449  I thought this would be helpful for anyone who...
783  Hey reddit, my friends and I were looking to t...

In [None]:
import pandas as pd
import re

# Purpose of this cell: shuffle the qualification TSV, normalise whitespace in
# the 'body' column, save the shuffled table, and export the text as a
# single-column CSV.

# Step 1: Load the TSV file and shuffle its contents.
file_path = '/content/drive/MyDrive/world-inflation/data/reddit/experiment/mturk_qualification.tsv'
try:
    text_df = pd.read_csv(file_path, sep='\t')
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Please check the path.")
    # Re-raise instead of calling exit(): exit() is an interactive-shell helper
    # and in a notebook it acts on the kernel rather than cleanly aborting this
    # cell. Continuing would be harmful here because a 'text_df' left in the
    # kernel by another cell would be written out as qualification data.
    raise

# frac=1 returns every row in random order; the fixed seed makes the order
# reproducible between runs.
text_df = text_df.sample(frac=1, random_state=42)

# Step 2: Clean the 'body' column.
# Leading/trailing whitespace is stripped and every internal run of whitespace
# collapses to a single space.
# NOTE(review): astype(str) converts missing values to the literal text 'nan';
# confirm the input has no empty 'body' cells.
print("Cleaning whitespace from the 'body' column...")
text_df['body'] = text_df['body'].astype(str).apply(lambda x: re.sub(r'\s+', ' ', x.strip()))
print("Whitespace cleaning complete.")


# Step 3: Save the cleaned and shuffled DataFrame.
# Saving stays best effort (the error is printed and the cell continues), but
# the outcome is recorded so the summary below does not claim a file that was
# never written.
output_shuffled_path = '/content/drive/MyDrive/world-inflation/data/reddit/experiment/mturk/mturk_qualification_shuffled.tsv'
shuffled_saved = False
try:
    text_df.to_csv(output_shuffled_path, sep='\t', index=False)
    shuffled_saved = True
    print(f"Full shuffled DataFrame was successfully saved to '{output_shuffled_path}'")
except Exception as e:
    print(f"Error saving the shuffled file: {e}")


# Step 4: Extract the 'body' column for the final output.
output_df = text_df[['body']].rename(columns={'body': 'text'})


# Step 5: Save the final single-column data to a single CSV file.
output_final_path = '/content/drive/MyDrive/world-inflation/data/reddit/experiment/mturk/mturk_qualification.csv'
final_saved = False
try:
    output_df.to_csv(output_final_path, index=False)
    final_saved = True
    print(f"Final single-column DataFrame was successfully saved to '{output_final_path}'")
except Exception as e:
    print(f"Error saving the final file: {e}")


# Step 6: Print information about the created files.
print("\n--- Processing Summary ---")
if shuffled_saved:
    print(f"1. Full shuffled data saved to: '{output_shuffled_path}'")
else:
    print(f"1. Full shuffled data was NOT saved to: '{output_shuffled_path}'")
if final_saved:
    print(f"2. Final processed data saved to: '{output_final_path}'")
else:
    print(f"2. Final processed data was NOT saved to: '{output_final_path}'")

print("\nFinal Processed Data Preview:")
print(f"First 5 rows of the final records (from '{output_final_path}'):")
print(output_df.head(5))
print(f"Total number of rows processed: {len(output_df)}")

print("\nProcessing completed.")

Cleaning whitespace from the 'body' column...
Whitespace cleaning complete.
Full shuffled DataFrame was successfully saved to '/content/drive/MyDrive/world-inflation/data/reddit/experiment/mturk/mturk_qualification_shuffled.tsv'
Final single-column DataFrame was successfully saved to '/content/drive/MyDrive/world-inflation/data/reddit/experiment/mturk/mturk_qualification.csv'

--- Processing Summary ---
1. Full shuffled data saved to: '/content/drive/MyDrive/world-inflation/data/reddit/experiment/mturk/mturk_qualification_shuffled.tsv'
2. Final processed data saved to: '/content/drive/MyDrive/world-inflation/data/reddit/experiment/mturk/mturk_qualification.csv'

Final Processed Data Preview:
First 5 rows of the final records (from '/content/drive/MyDrive/world-inflation/data/reddit/experiment/mturk/mturk_qualification.csv'):
                                                 text
9   $2000 is a great price... most of the RTW tick...
11  I've always gone from Boston to Dublin/Madrid ...
0