In [3]:
import pandas as pd
from textblob import TextBlob

# Load the raw review export and report its size.
df = pd.read_csv("../data/raw/reviews_raw.csv")
print("Original:", len(df))

# Keep every row that has a Platform value. Take an explicit copy so the
# column assignment below operates on an independent frame instead of a
# slice of `df` (chained assignment raises SettingWithCopyWarning and may
# silently fail to write).
df_clean = df[df["Platform"].notna()].copy()

# Parse the review timestamp; values that cannot be parsed become NaT and
# are dropped on the next line.
df_clean["at"] = pd.to_datetime(df_clean["at"], errors="coerce")
df_clean = df_clean.dropna(subset=["at"])

df_clean.to_csv("../data/processed/reviews_cleaned.csv", index=False)
print("Cleaned (all rows kept):", len(df_clean))

# Now filter only text reviews for Sentiment
df_text = df_clean[df_clean["content"].notna()].copy()

def get_sentiment(text):
    """Return the TextBlob polarity of *text*, or 0.0 when it cannot be scored.

    Args:
        text: Review text to score. Non-string values (for example ``None``
            or a float NaN from a missing CSV cell) and blank strings are
            treated as having no sentiment.

    Returns:
        The value of ``TextBlob(text).sentiment.polarity``, or the neutral
        fallback ``0.0`` if the input is not usable text or scoring fails.
    """
    # Guard explicitly instead of relying on TextBlob raising for bad input.
    if not isinstance(text, str) or not text.strip():
        return 0.0
    try:
        return TextBlob(text).sentiment.polarity
    except Exception:
        # Best-effort scoring: one bad row must not abort the whole run.
        # `Exception` (rather than a bare `except:`) lets KeyboardInterrupt
        # and SystemExit propagate so a long-running apply can be stopped.
        return 0.0

# Score every review's text and attach the result as a new column.
polarity_scores = df_text["content"].apply(get_sentiment)
df_text["sentiment"] = polarity_scores

# Write the scored reviews to disk, report the row count, and preview.
sentiment_path = "../data/processed/reviews_sentiment.csv"
df_text.to_csv(sentiment_path, index=False)
print("Sentiment rows:", len(df_text))
df_text.head()


Original: 200000
Cleaned (all rows kept): 200000
Sentiment rows: 200000


Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,Platform,sentiment
0,ce504a3d-c0c8-4805-8056-57390226a883,Ayinke olamide Olamide,https://play-lh.googleusercontent.com/a/ACg8oc...,Good morning guy,5,0,436.0.0.35.101,2025-11-21 11:37:46,,,436.0.0.35.101,Facebook,0.7
1,3ce8ee38-28e9-4ba1-9abd-e56a7e72d752,Evelyn Rider,https://play-lh.googleusercontent.com/a-/ALV-U...,I love my cousin very very much so don't f*** ...,5,0,539.0.0.54.69,2025-11-21 11:37:06,,,539.0.0.54.69,Facebook,0.486667
2,220c5153-2641-4233-a551-b3a13b5a3bf0,Shubham Kaithwas,https://play-lh.googleusercontent.com/a/ACg8oc...,it is very good,3,0,538.0.0.53.70,2025-11-21 11:37:04,,,538.0.0.53.70,Facebook,0.91
3,a30eed13-a35f-4be7-a4d1-1cb9cdd117cd,Saleem Ullahgj,https://play-lh.googleusercontent.com/a/ACg8oc...,good,5,0,539.0.0.54.69,2025-11-21 11:36:06,,,539.0.0.54.69,Facebook,0.7
4,6be67d09-b640-4493-a1e0-4a5ccbae521b,Nilesh Patel,https://play-lh.googleusercontent.com/a-/ALV-U...,nice app 🌹👌👍,5,0,539.0.0.54.69,2025-11-21 11:35:35,,,539.0.0.54.69,Facebook,0.6
