In [None]:
import os

import numpy as np
import pandas as pd

# =============================================================================
# STEP 1: INITIAL DATA LOADING
# =============================================================================
# Read the Excel workbook into a DataFrame. The path is relative to the
# directory the notebook/script is run from.
df = pd.read_excel(io='../data/UserReviewsClean43LIWC.xlsx')

# Keep `df` as the untouched original; every later cleaning step works on
# this independent (deep) copy instead.
df_clean = df.copy(deep=True)


# =============================================================================
# STEP 2: CLEANING TEXT COLUMNS
# =============================================================================
def _clean_text_column(column):
    """Return a cleaned text version of ``column``.

    The values are converted to strings, every occurrence of " '" (space
    followed by a single quote) and of "'" is removed, and leading/trailing
    whitespace is stripped. Cells whose cleaned text is exactly 'nan', and
    cells that were already missing in ``column``, are returned as NaN.

    Parameters:
        column: the pandas Series to clean.

    Returns:
        A new Series; ``column`` itself is not modified.
    """
    # Remember which cells were missing *before* the string conversion.
    # astype(str) may turn missing markers into literal text (e.g. 'nan',
    # 'None', 'NaT'), and only the 'nan' spelling is undone by the replace
    # below, so the original mask is re-applied at the end.
    was_missing = column.isna()

    cleaned = (
        column.astype(str)
        # regex=False: both patterns are literal text, not regular expressions.
        .str.replace(" '", "", regex=False)
        .str.replace("'", "", regex=False)
        .str.strip()
        # Cells that read exactly 'nan' after cleaning are treated as missing.
        .replace('nan', np.nan)
    )
    return cleaned.mask(was_missing, np.nan)


# --- OPERATION: Clean the 'reviewer' column ---
df_clean['reviewer'] = _clean_text_column(df_clean['reviewer'])

# --- OPERATION: Clean the 'dateP' column ---
# Still text at this point; it is parsed into datetimes in a later step.
df_clean['dateP'] = _clean_text_column(df_clean['dateP'])

# --- OPERATION: Clean the 'Rev' (Review Text) column ---
# NOTE(review): this removes every apostrophe, including those inside words
# (e.g. "don't" -> "dont"). Kept as in the original; confirm this is intended.
df_clean['Rev'] = _clean_text_column(df_clean['Rev'])


# =============================================================================
# STEP 3: FEATURE EXTRACTION AND TYPE CONVERSION
# =============================================================================
# --- OPERATION: Extract movie title from URL ---
# The new 'movie_title' column is the last path segment of 'url' (the text
# after the final '/'). Trailing slashes are stripped first: without that, a
# URL such as '.../m/some_movie/' would produce an empty title, because the
# text after its last '/' is the empty string.
# rsplit(..., n=1) only splits off the final segment, which is all we need.
df_clean['movie_title'] = (
    df_clean['url']
    .str.rstrip('/')
    .str.rsplit('/', n=1)
    .str[-1]
)

# --- OPERATION: Convert date to datetime format ---
# Parse the 'dateP' column into pandas datetimes. errors='coerce' turns any
# value that cannot be parsed into NaT (Not a Time) instead of raising.
df_clean['dateP'] = pd.to_datetime(df_clean['dateP'], errors='coerce')


# =============================================================================
# STEP 4: SAVE CLEANED DATA
# =============================================================================
# Make sure the output directory exists before writing: to_csv does not
# create missing parent directories, so a fresh checkout without
# '../cleanedData' would otherwise fail here. exist_ok=True makes this a
# no-op when the directory is already present.
os.makedirs("../cleanedData", exist_ok=True)

# --- OPERATION: Save the full cleaned dataset ---
# Write the whole cleaned DataFrame as CSV; index=False leaves the row index
# out of the file.
df_clean.to_csv("../cleanedData/reviews_data_cleaned.csv", index=False)

# --- OPERATION: Save a small sample of the data ---
# Write the first 50 rows as a separate, smaller file for quick testing and
# verification.
df_clean.head(50).to_csv("../cleanedData/reviews_data_sample_50.csv", index=False)

print("Data cleaning complete.")

Data cleaning complete.
