In [37]:

# ================================================================
# ---------------------- TASK 4 ----------------------------------
# MOVIE REVIEWS
# ================================================================

# Announce the task first so the banner appears even if the load below fails.
print("\n==================== TASK 4: MOVIE REVIEWS CLEANING ====================")

# Load the raw reviews from the CSV file.
movie_df = pd.read_csv(filepath_or_buffer="/content/movie_reviews-1.csv")

# Preview the first rows before any cleaning is applied.
print("\n--- BEFORE ---", movie_df.head(), sep="\n")

# Clean HTML + lowercase
def clean_review(text):
    """Strip HTML markup from a review and lowercase the remaining text.

    Args:
        text: Raw review value. May be missing (None/NaN) or a non-string
            scalar, which is converted with ``str()`` before parsing.

    Returns:
        The lowercased visible text, or ``""`` when the value is missing.
    """
    if pd.isna(text):
        return ""
    soup = BeautifulSoup(str(text), "html.parser")
    # A space separator keeps words from adjacent tags apart
    # ("<p>Great</p><p>film</p>" -> "great film", not "greatfilm"), so
    # downstream tokenization sees real words; strip=True trims each piece.
    return soup.get_text(separator=" ", strip=True).lower()

# Add a cleaned-text column (HTML removed, lowercased) next to the raw text.
movie_df["clean_review"] = movie_df["review_text"].apply(clean_review)

# Tokenize + TF-IDF (English stop words dropped, vocabulary capped at 500 terms)
tfidf = TfidfVectorizer(stop_words="english", max_features=500)
tfidf_matrix = tfidf.fit_transform(movie_df["clean_review"])

# Rating → coerce to numeric, clamp to the 0–10 scale, fill missing
# Non-numeric entries become NaN; clamping happens before the median is taken
# so out-of-range values cannot skew the fill value.
RATING_MAX = 10
rating = pd.to_numeric(movie_df["rating"], errors="coerce").clip(lower=0, upper=RATING_MAX)
movie_df["rating"] = rating.fillna(rating.median())

# Normalize 0–10 → 0–1
movie_df["rating_norm"] = movie_df["rating"] / RATING_MAX

print("\n--- AFTER ---")
print(movie_df.head())

# TESTS
assert movie_df["clean_review"].isna().sum() == 0
# If every rating was missing the median is NaN and nothing gets filled.
assert movie_df["rating"].notna().all(), "rating column has no usable values"
# Check both ends of the normalized range, not just the upper bound.
assert movie_df["rating_norm"].between(0, 1).all()
assert tfidf_matrix.shape[0] == len(movie_df)

print("\nTask 4 Passed All Tests ✔")

print("\n==================== ALL TASKS COMPLETED SUCCESSFULLY ====================\n")



--- BEFORE ---
   review_id                review_text  rating
0          1      <p>Amazing movie!</p>     8.0
1          2  Terrible acting & plot!!!     2.0
2          3      <p>Amazing movie!</p>     NaN
3          4  Terrible acting & plot!!!     8.0
4          5      <p>Amazing movie!</p>     5.0

--- AFTER ---
   review_id                review_text  rating               clean_review  \
0          1      <p>Amazing movie!</p>     8.0             amazing movie!   
1          2  Terrible acting & plot!!!     2.0  terrible acting & plot!!!   
2          3      <p>Amazing movie!</p>     8.0             amazing movie!   
3          4  Terrible acting & plot!!!     8.0  terrible acting & plot!!!   
4          5      <p>Amazing movie!</p>     5.0             amazing movie!   

   rating_norm  
0          0.8  
1          0.2  
2          0.8  
3          0.8  
4          0.5  

Task 4 Passed All Tests ✔


