In [93]:
import pandas as pd
import re

In [94]:
df = pd.read_csv("comments_data.csv")

In [95]:
df = df[['Comment']].dropna()

In [96]:
def clean_text(text):
    text = str(text)

    # Remove broken unicode artifacts, control chars, replacement chars
    text = re.sub(r'[\x00-\x1F\x7Fï¿½]', '', text)

    # Remove URL
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove mentions, keep hashtag text but remove '#'
    text = re.sub(r'@[\w_]+', '', text)
    text = re.sub(r'#', '', text)
    
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text.strip()

def preprocess(text):
    text = clean_text(text)
    return text.lower()

def preprocess_csv(input_path, output_path, comment_column="Comment"):
    # Load CSV
    df = pd.read_csv(input_path, encoding="utf-8")
    print(f"Loaded {len(df)} rows from '{input_path}'")

    # Apply preprocessing
    df["Preprocessed_comment"] = df[comment_column].apply(preprocess)

    # Remove duplicate cleaned comments
    df = df.drop_duplicates(subset="Preprocessed_comment", keep="first")

    # Remove empty / NaN comments
    df['Preprocessed_comment'].replace('', pd.NA, inplace=True)
    df.dropna(subset=['Preprocessed_comment'], inplace=True)
    
    # drop unused column
    cols_to_drop = ["Comment", "Timestamp"]
    df = df.drop(columns=cols_to_drop)

    # Save result
    df.to_csv(output_path, index=False, encoding="utf-8")
    print(f"Saved preprocessed file as: {output_path}")
    print(f"Total after deduplication & cleaning: {len(df)}")

    return df


In [97]:
df_cleaned = preprocess_csv(
    input_path="comments_data.csv",
    output_path="comments_preprocessed.csv",
    comment_column="Comment"   # ganti jika nama kolom berbeda
)

Loaded 670 rows from 'comments_data.csv'
Saved preprocessed file as: comments_preprocessed.csv
Total after deduplication & cleaning: 667


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Preprocessed_comment'].replace('', pd.NA, inplace=True)
