In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import nltk

# Download the NLTK tokenizer models (only needed once per environment).
# Recent NLTK releases load the "punkt_tab" resource inside word_tokenize,
# while older releases load "punkt"; fetching both keeps the tokenization
# cell working on a fresh environment regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ikramalghiffari/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load the scraped articles from disk.
df = pd.read_csv(filepath_or_buffer="complete.csv")

# Report the raw dimensions and preview the first rows.
raw_shape = df.shape
print("Original Shape:", raw_shape)
first_rows = df.head()
display(first_rows)

# Dataset info: column dtypes and non-null counts.
df.info()

# Summary statistics for every column (numeric and non-numeric alike).
summary_stats = df.describe(include="all")
display(summary_stats)

Original Shape: (1980, 7)


Unnamed: 0,no,title,author,image_url,url,body,date
0,0,TNI AD Siapkan 12.000 Prajurit untuk Pasukan P...,Baharudin Al Farisi,https://asset.kompas.com/crops/in9S3T6WZntpUXz...,https://nasional.kompas.com/read/2025/11/26/09...,"JAKARTA, KOMPAS.com -Tentara Nasional Indonesi...","Kompas.com, 26 November 2025, 09:04 WIB"
1,1,3 Kapal Rumah Sakit TNI AL Siap Bertolak ke Gaza,Baharudin Al Farisi,https://asset.kompas.com/crops/plaL8r3pP55h4WJ...,https://nasional.kompas.com/read/2025/11/26/08...,"JAKARTA, KOMPAS.com- Kepala Dinas Penerangan A...","Kompas.com, 26 November 2025, 08:47 WIB"
2,2,Apa Itu Batalyon Bantuan dalam Brigade Komposi...,Baharudin Al Farisi,https://asset.kompas.com/crops/G3vPnR-R9vvh8KF...,https://nasional.kompas.com/read/2025/11/26/06...,"JAKARTA, KOMPAS.com- Panglima TNI Jenderal Agu...","Kompas.com, 26 November 2025, 06:17 WIB"
3,3,TNI Telah Siapkan Prajurit dan Alutsista untuk...,Novianti Setuningsih,https://asset.kompas.com/crops/KKh_s-0t6aRM2kk...,https://nasional.kompas.com/read/2025/11/25/22...,"JAKARTA, KOMPAS.com- Personel gabungan dari TN...","Kompas.com, 25 November 2025, 22:25 WIB"
4,4,TNI Kantongi Sejumlah Kandidat Komandan Pasuka...,Novianti Setuningsih,https://asset.kompas.com/crops/KKh_s-0t6aRM2kk...,https://nasional.kompas.com/read/2025/11/25/15...,"JAKARTA, KOMPAS.com- Markas Besar (Mabes) TNI ...","Kompas.com, 25 November 2025, 15:14 WIB"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1980 entries, 0 to 1979
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   no         1980 non-null   int64 
 1   title      1980 non-null   object
 2   author     1837 non-null   object
 3   image_url  1980 non-null   object
 4   url        1980 non-null   object
 5   body       1980 non-null   object
 6   date       1980 non-null   object
dtypes: int64(1), object(6)
memory usage: 108.4+ KB


Unnamed: 0,no,title,author,image_url,url,body,date
count,1980.0,1980,1837,1980,1980,1980,1980
unique,,1980,108,1274,1980,1980,1975
top,,TNI AD Siapkan 12.000 Prajurit untuk Pasukan P...,Albertus Adit,https://asset.kompas.com/crops/Mft698jXKkJxOii...,https://nasional.kompas.com/read/2025/11/26/09...,"JAKARTA, KOMPAS.com -Tentara Nasional Indonesi...","Kompas.com, 15 November 2025, 17:30 WIB"
freq,,1,618,17,1,1,2
mean,989.5,,,,,,
std,571.721086,,,,,,
min,0.0,,,,,,
25%,494.75,,,,,,
50%,989.5,,,,,,
75%,1484.25,,,,,,


In [3]:
# Check missing values before any cleaning
print("\nMissing values before cleaning:")
display(df.isnull().sum())

# Remove duplicate articles.
# "no" appears to be a per-row running counter (0..n-1 in the summary above),
# so comparing whole rows -- the drop_duplicates default -- can never match two
# rows. Compare only the article content columns instead. Restricting the
# comparison to these columns also keeps this cell re-runnable once list-valued
# columns such as "tokens" exist, which drop_duplicates cannot hash.
content_cols = ["title", "author", "image_url", "url", "body", "date"]
df = df.drop_duplicates(subset=content_cols)
print("\nShape after removing duplicates:", df.shape)

# Drop rows without a title or body (the essential columns)
df = df.dropna(subset=["title", "body"])

# Fill the non-critical columns with neutral placeholders
df = df.fillna({"author": "Unknown", "image_url": "", "url": "", "date": ""})

print("\nMissing values after cleaning:")
display(df.isnull().sum())


Missing values before cleaning:


no             0
title          0
author       143
image_url      0
url            0
body           0
date           0
dtype: int64


Shape after removing duplicates: (1980, 7)

Missing values after cleaning:


no           0
title        0
author       0
image_url    0
url          0
body         0
date         0
dtype: int64

In [4]:
# Month-name -> month-number lookup.
# The "date" strings come from an Indonesian news site, so month names may be
# Indonesian ("Januari", "Agustus", "Desember", ...). strptime's %B only
# understands the names of the active locale, and errors="coerce" would turn
# every other spelling into NaT without notice. Mapping names to numbers makes
# parsing locale-independent; English names are listed too so both parse.
MONTH_NUMBERS = {
    "januari": "01", "january": "01",
    "februari": "02", "february": "02",
    "maret": "03", "march": "03",
    "april": "04",
    "mei": "05", "may": "05",
    "juni": "06", "june": "06",
    "juli": "07", "july": "07",
    "agustus": "08", "august": "08",
    "september": "09",
    "oktober": "10", "october": "10",
    "november": "11",
    "desember": "12", "december": "12",
}
MONTH_RE = re.compile(r"\b(" + "|".join(MONTH_NUMBERS) + r")\b", flags=re.IGNORECASE)

# --- Split on commas, e.g. "Kompas.com, 26 November 2025, 09:04 WIB" ---
# Split once and reuse the pieces: [1] is the calendar date, [2] the time.
date_parts = df["date"].str.split(",")
df["tanggal"] = date_parts.str[1].str.strip()
df["waktu"] = date_parts.str[2].str.replace("WIB", "", regex=False).str.strip()

# Make sure the time is HH:MM (single-digit hours get a leading zero)
df["waktu"] = df["waktu"].str.replace(r"^(\d:\d{2})$", r"0\1", regex=True)

# --- Join date and time back together ---
df["datetime_str"] = df["tanggal"] + " " + df["waktu"]

# --- Parse date ---
# Parse from a copy in which the month name is replaced by its number, so
# "datetime_str" itself keeps the original, human-readable spelling.
numeric_datetime = df["datetime_str"].str.replace(
    MONTH_RE,
    lambda match: MONTH_NUMBERS[match.group(1).lower()],
    regex=True,
)
df["parsed_date"] = pd.to_datetime(
    numeric_datetime,
    format="%d %m %Y %H:%M",
    errors="coerce"
)

# errors="coerce" hides failures as NaT, so surface them explicitly.
n_unparsed = int(df["parsed_date"].isna().sum())
if n_unparsed:
    print(f"\nWarning: {n_unparsed} date value(s) could not be parsed (NaT).")

print("\nParsed date preview:")
display(df[["date", "tanggal", "waktu", "parsed_date"]].head())


Parsed date preview:


Unnamed: 0,date,tanggal,waktu,parsed_date
0,"Kompas.com, 26 November 2025, 09:04 WIB",26 November 2025,09:04,2025-11-26 09:04:00
1,"Kompas.com, 26 November 2025, 08:47 WIB",26 November 2025,08:47,2025-11-26 08:47:00
2,"Kompas.com, 26 November 2025, 06:17 WIB",26 November 2025,06:17,2025-11-26 06:17:00
3,"Kompas.com, 25 November 2025, 22:25 WIB",25 November 2025,22:25,2025-11-25 22:25:00
4,"Kompas.com, 25 November 2025, 15:14 WIB",25 November 2025,15:14,2025-11-25 15:14:00


In [5]:
def clean_text(text):
    """Normalize raw article text into lowercase, letters-only, single-spaced text.

    Steps, in order: lowercase, strip HTML markup, drop URL-like tokens,
    replace every character that is not an ASCII letter or whitespace with a
    space, then collapse whitespace runs and trim the ends.

    Args:
        text: Raw text. Missing values (None / NaN / pd.NA) yield ""; any
            other non-string scalar is converted with str() first.

    Returns:
        str: The cleaned text, or "" for missing input.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        # Numeric or other scalar cells would otherwise fail on .lower().
        text = str(text)

    text = text.lower()

    # Remove HTML tags. The separator keeps text from adjacent elements apart:
    # without it "<p>satu</p><p>dua</p>" collapses into the single word
    # "satudua". Extra spaces introduced here are normalized below.
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    # Remove URLs ("http\S+" already covers "https...", so no separate branch)
    text = re.sub(r"http\S+|www\S+", "", text)

    # Keep only letters (digits and punctuation become spaces)
    text = re.sub(r"[^a-zA-Z\s]", " ", text)

    # Normalize multiple spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Build a cleaned counterpart ("clean_<column>") for each raw text column.
for raw_column in ("title", "body"):
    df["clean_" + raw_column] = df[raw_column].map(clean_text)

print("\nCleaned text samples:")
cleaned_preview = df[["title", "clean_title", "clean_body"]].head()
display(cleaned_preview)


Cleaned text samples:


Unnamed: 0,title,clean_title,clean_body
0,TNI AD Siapkan 12.000 Prajurit untuk Pasukan P...,tni ad siapkan prajurit untuk pasukan perdamai...,jakarta kompas com tentara nasional indonesia ...
1,3 Kapal Rumah Sakit TNI AL Siap Bertolak ke Gaza,kapal rumah sakit tni al siap bertolak ke gaza,jakarta kompas com kepala dinas penerangan ang...
2,Apa Itu Batalyon Bantuan dalam Brigade Komposi...,apa itu batalyon bantuan dalam brigade komposi...,jakarta kompas com panglima tni jenderal agus ...
3,TNI Telah Siapkan Prajurit dan Alutsista untuk...,tni telah siapkan prajurit dan alutsista untuk...,jakarta kompas com personel gabungan dari tni ...
4,TNI Kantongi Sejumlah Kandidat Komandan Pasuka...,tni kantongi sejumlah kandidat komandan pasuka...,jakarta kompas com markas besar mabes tni meng...


In [6]:
# Split every cleaned article body into word tokens with NLTK's tokenizer.
tokenized_bodies = df["clean_body"].map(nltk.word_tokenize)
df["tokens"] = tokenized_bodies

print("Token samples:")
token_preview = df["tokens"].head()
display(token_preview)

Token samples:


0    [jakarta, kompas, com, tentara, nasional, indo...
1    [jakarta, kompas, com, kepala, dinas, penerang...
2    [jakarta, kompas, com, panglima, tni, jenderal...
3    [jakarta, kompas, com, personel, gabungan, dar...
4    [jakarta, kompas, com, markas, besar, mabes, t...
Name: tokens, dtype: object

In [7]:
# Persist the cleaned frame without the pandas index column.
# NOTE(review): "tokens" holds Python lists, which CSV presumably stores as
# their text representation -- confirm how downstream readers parse it.
df.to_csv(path_or_buf="complete_cleaned.csv", index=False)
print("\nSaved as complete_cleaned.csv")


Saved as complete_cleaned.csv
