In [None]:
# Imports and Configuration
import pandas as pd
import os

# Configuration: file locations and test-run sizing
INPUT_FILE = "./data/movie_logs.csv"  # raw request log read by pd.read_csv
OUTPUT_FILE = "./data/matching_data_rate_test.csv"  # destination of the merged result
CHUNKSIZE = 1_000  # rows per chunk; kept small for testing
TEST_ROWS = 10_000  # total number of rows read during the test run


In [None]:
def process_chunk(chunk):
    """Split one chunk of request logs into Data and Rate entry frames.

    Requests starting with ``"GET /data/"`` become Data entries: the text after
    ``/data/`` is split on ``/``; the second segment is the movie title and the
    third segment, up to its first ``.``, is the minute value. Requests starting
    with ``"GET /rate/"`` become Rate entries: the text after ``/rate/`` is split
    on its last ``=`` into title and rating, so a title that itself contains
    ``=`` is kept rather than dropped. ``+`` in titles is replaced by a space.
    Every other request, and every malformed one, is skipped.

    Args:
        chunk: DataFrame with ``"timestamp"``, ``"request_id"`` and
            ``"request"`` columns.

    Returns:
        Tuple ``(data_df, rate_df)``. Rows carry the keys ``timestamp``,
        ``userID`` (taken from ``request_id``), ``logtype`` (``"Data"`` or
        ``"Rate"``), ``movie_title``, ``minutes`` and ``rating``. ``minutes``
        and ``rating`` stay as the strings parsed from the request (``None``
        where not applicable). A frame with no matching rows is empty and has
        no columns.

    Raises:
        KeyError: if ``chunk`` lacks one of the three expected columns.
    """
    data_entries = []
    rate_entries = []

    # Iterating the columns directly avoids building a Series per row, which
    # is what iterrows() does.
    rows = zip(chunk["timestamp"], chunk["request_id"], chunk["request"])

    for timestamp, request_id, req in rows:
        # A missing request field shows up as a non-string (e.g. NaN); skip it
        # explicitly instead of relying on an AttributeError being swallowed.
        if not isinstance(req, str):
            continue

        try:
            if req.startswith("GET /data/"):
                # Process Data entry
                parts = req.split("/data/")[1].split("/")
                if len(parts) < 3:
                    continue

                data_entries.append({
                    "timestamp": timestamp,
                    "userID": request_id,
                    "logtype": "Data",
                    "movie_title": parts[1].replace("+", " "),
                    "minutes": parts[2].split(".")[0],
                    "rating": None
                })

            elif req.startswith("GET /rate/"):
                # Process Rate entry
                rate_part = req.split("/rate/")[1]
                # Split on the LAST "=" so titles containing "=" survive.
                title, sep, rating = rate_part.rpartition("=")
                if not sep:
                    # No "=" at all: there is no rating to record.
                    continue

                rate_entries.append({
                    "timestamp": timestamp,
                    "userID": request_id,
                    "logtype": "Rate",
                    "movie_title": title.replace("+", " "),
                    "minutes": None,
                    "rating": rating
                })

        except Exception as e:
            # Best-effort: one bad row must not abort the whole chunk.
            print(f"Error processing row: {e}")
            continue

    return pd.DataFrame(data_entries), pd.DataFrame(rate_entries)


In [8]:
# Initialize accumulators (one list of per-chunk frames for each log type)
all_data = []
all_rates = []

# Process data in chunks, limiting to TEST_ROWS
for chunk in pd.read_csv(INPUT_FILE, header=None,
                        names=["timestamp", "request_id", "request"],
                        chunksize=CHUNKSIZE, nrows=TEST_ROWS):
    data_df, rate_df = process_chunk(chunk)
    # process_chunk returns a column-less empty frame when a chunk has no rows
    # of a given type; keep only frames that actually contain entries.
    if not data_df.empty:
        all_data.append(data_df)
    if not rate_df.empty:
        all_rates.append(rate_df)

# Combine all chunks. pd.concat raises ValueError on an empty list, and the
# merge below needs a "movie_title" column, so fall back to an empty frame
# carrying the expected columns when nothing of a type was found.
entry_columns = ["timestamp", "userID", "logtype", "movie_title", "minutes", "rating"]
final_data = (pd.concat(all_data, ignore_index=True)
              if all_data else pd.DataFrame(columns=entry_columns))
final_rates = (pd.concat(all_rates, ignore_index=True)
               if all_rates else pd.DataFrame(columns=entry_columns))

# Merge Data and Rate entries (inner join, the pd.merge default).
# NOTE(review): joining on movie_title alone pairs every Data row with every
# Rate row for the same title, regardless of user -- confirm whether userID
# should also be a join key before running this on the full log.
merged_df = pd.merge(final_data, final_rates,
                    how="inner",
                    on="movie_title",
                    suffixes=("_data", "_rate"))

# Preview the results
print("Preview of processed data:")
print(final_data.head())
print("\nPreview of processed rates:")
print(final_rates.head())
print("\nPreview of merged data:")
print(merged_df.head())


Preview of processed data:
             timestamp  userID logtype                    movie_title minutes  \
0  2025-02-28T03:36:47  122156    Data                 star wars 1977       6   
1  2025-02-28T03:36:47  242890    Data           the five senses 1999     100   
2  2025-02-28T03:36:47  235684    Data              clara and me 2004      64   
3  2025-02-28T03:36:47  159468    Data  the shawshank redemption 1994      64   
4  2025-02-28T03:36:47  103729    Data                   tangled 2010      66   

  rating  
0   None  
1   None  
2   None  
3   None  
4   None  

Preview of processed rates:
             timestamp  userID logtype                         movie_title  \
0  2025-02-28T03:36:48  265143    Rate  rare exports a christmas tale 2010   
1  2025-02-28T03:36:49  284982    Rate                       far away 2001   
2  2025-02-28T03:36:49  301905    Rate      the princess and the frog 2009   
3  2025-02-28T03:36:50  104416    Rate                  soul assassin 2001   
4

In [9]:
# Create output directory if needed. os.path.dirname() returns "" when
# OUTPUT_FILE is a bare filename, and os.makedirs("") raises
# FileNotFoundError, so only create a directory when the path names one.
output_dir = os.path.dirname(OUTPUT_FILE)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Save results (index=False keeps the positional index out of the CSV)
merged_df.to_csv(OUTPUT_FILE, index=False)
print(f"Processed {len(final_data)} Data entries and {len(final_rates)} Rate entries")
print(f"Matching records saved to: {OUTPUT_FILE}")


Processed 9926 Data entries and 74 Rate entries
Matching records saved to: ./data/matching_data_rate_test.csv
