In [19]:
import pandas as pd
import re
from datetime import datetime

In [4]:
# Load the raw (unprocessed) article data; the file is tab-separated despite its .csv extension
df = pd.read_csv(
    'detikcom_unprocessed_news_data.csv',
    delimiter='\t',
)

# Preview the first five rows
df.head(n=5)

Unnamed: 0,article_title,article_author,article_publication_date,article_content
0,"Diresmikan Jokowi, PTPP Rampungkan Proyek Jala...",Hana Nushratu - detikFinance,"Minggu, 20 Okt 2024 23:55 WIB",PT PP (Persero) Tbk sebagai salah satu BUMN Ko...
1,Agenda Jokowi Usai Pulang ke Solo: Tidur,Tara Wahyu NV - detikJateng,"Minggu, 20 Okt 2024 22:33 WIB","Presiden ke-7 RI, Joko Widodo (Jokowi) mengaku..."
2,Kepulangan Jokowi ke Solo Disambut Warga hingg...,Hana Nushratu - detikNews,"Minggu, 20 Okt 2024 23:12 WIB",Presiden Republik Indonesia periode 2014-2024 ...
3,Disambut Antusias Masyarakat Saat Pulang ke So...,Tara Wahyu NV - detikJateng,"Minggu, 20 Okt 2024 22:28 WIB",Presiden ke-7 Republik Indonesia (RI) Joko Wid...
4,Sejumlah Calon Kepala Daerah Bertemu Jokowi di...,Tara Wahyu NV - detikJateng,"Minggu, 20 Okt 2024 22:09 WIB",Sejumlah calon kepala daerah bertemu Presiden ...


In [5]:
# Display the dataset dimensions as a (rows, columns) tuple
df.shape

(706, 4)

In [6]:
# Summary statistics per column (count / unique / top / freq for the text columns)
df.describe()

Unnamed: 0,article_title,article_author,article_publication_date,article_content
count,667,667,667,654
unique,654,161,648,640
top,Video Said Didu soal Jokowi Makan Malam Bareng...,Eva Safitri - detikNews,"Rabu, 16 Okt 2024 21:00 WIB",(nkm/nkm)
freq,2,107,3,3


In [7]:
# Column names, non-null counts, dtypes and memory usage of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 706 entries, 0 to 705
Data columns (total 4 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   article_title             667 non-null    object
 1   article_author            667 non-null    object
 2   article_publication_date  667 non-null    object
 3   article_content           654 non-null    object
dtypes: object(4)
memory usage: 22.2+ KB


In [8]:
# Count the missing (NaN) entries in each column
df.isna().sum()

article_title               39
article_author              39
article_publication_date    39
article_content             52
dtype: int64

In [9]:
# Discard every row that has a missing value in any column
df.dropna(axis=0, how='any', inplace=True)

# Confirm that no missing values remain
df.isna().sum()

article_title               0
article_author              0
article_publication_date    0
article_content             0
dtype: int64

In [10]:
# Count fully duplicated rows (every repeat after the first occurrence)
df.duplicated(keep='first').sum()

np.int64(12)

In [11]:
# Drop repeated rows, keeping the first occurrence of each
df.drop_duplicates(keep='first', inplace=True)

# Confirm that no duplicated rows remain
df.duplicated(keep='first').sum()

np.int64(0)

In [12]:
# Count the rows whose title does not contain the keyword "Jokowi" (case-sensitive match)
df['article_title'].str.contains('Jokowi', na=False).eq(False).sum()

np.int64(3)

In [13]:
# Keep only the rows whose article_title contains the keyword "Jokowi" (case-sensitive match)
df = df.loc[df['article_title'].str.contains('Jokowi', na=False)]

# Confirm that every remaining title contains the keyword
df['article_title'].str.contains('Jokowi', na=False).eq(False).sum()

np.int64(0)

In [23]:
# Function to clean and format the column `article_publication_date`
def clean_and_format_article_publication_date(publication_date):
    """Reduce a timestamp such as 'Minggu, 20 Okt 2024 23:55 WIB' to 'DD-MM-YYYY'.

    The day, month and year are read straight from the regex match and the
    month abbreviation is translated through an explicit lookup table, so the
    result does not depend on the process locale (``strptime``'s ``%A`` and
    ``%b`` directives do). The weekday word is required by the pattern but is
    otherwise ignored, so unusual weekday spellings cannot break parsing.

    Args:
        publication_date: Raw cell value. Only ``str`` values are inspected.

    Returns:
        The date formatted as ``DD-MM-YYYY`` when a valid timestamp is found
        anywhere in the string. In every other case (non-string input, no
        timestamp pattern, unknown month abbreviation, impossible date or
        time) the original value is returned unchanged.
    """
    if not isinstance(publication_date, str):
        return publication_date

    # "<weekday>, <day> <3-letter month> <year> <HH>:<MM> WIB"
    match = re.search(
        r'[A-Za-z]+, (\d{1,2}) ([A-Za-z]{3}) (\d{4}) (\d{2}):(\d{2}) WIB',
        publication_date,
    )
    if match is None:
        return publication_date

    day, month_abbr, year, hour, minute = match.groups()

    # Indonesian abbreviations plus the English ones that differ, keyed in lower case
    month_numbers = {
        "jan": 1, "feb": 2, "mar": 3, "apr": 4,
        "mei": 5, "may": 5,
        "jun": 6, "jul": 7,
        "agu": 8, "aug": 8,
        "sep": 9,
        "okt": 10, "oct": 10,
        "nov": 11,
        "des": 12, "dec": 12,
    }
    month = month_numbers.get(month_abbr.lower())
    if month is None:
        return publication_date

    try:
        # Building the datetime validates the day-of-month and the clock time
        date_obj = datetime(int(year), month, int(day), int(hour), int(minute))
    except ValueError:
        return publication_date

    return date_obj.strftime("%d-%m-%Y")

# Normalize every value of the publication-date column, then preview the result
df['article_publication_date'] = df['article_publication_date'].map(clean_and_format_article_publication_date)
df.head(n=5)

Unnamed: 0,article_title,article_author,article_publication_date,article_content
0,"Diresmikan Jokowi, PTPP Rampungkan Proyek Jala...",Hana Nushratu - detikFinance,20-10-2024,PT PP (Persero) Tbk sebagai salah satu BUMN Ko...
1,Agenda Jokowi Usai Pulang ke Solo: Tidur,Tara Wahyu NV - detikJateng,20-10-2024,"Presiden ke-7 RI, Joko Widodo (Jokowi) mengaku..."
2,Kepulangan Jokowi ke Solo Disambut Warga hingg...,Hana Nushratu - detikNews,20-10-2024,Presiden Republik Indonesia periode 2014-2024 ...
3,Disambut Antusias Masyarakat Saat Pulang ke So...,Tara Wahyu NV - detikJateng,20-10-2024,Presiden ke-7 Republik Indonesia (RI) Joko Wid...
4,Sejumlah Calon Kepala Daerah Bertemu Jokowi di...,Tara Wahyu NV - detikJateng,20-10-2024,Sejumlah calon kepala daerah bertemu Presiden ...


In [24]:
# Function to clean the column `article_author`
def clean_article_author(author):
    """Strip the trailing channel label from a byline such as 'Name - detikNews'.

    The byline is cut at the first hyphen that is delimited by whitespace (or
    by the start/end of the string). A hyphen embedded in a word, as in a
    hyphenated name, is treated as part of the name and is kept.

    Args:
        author: Raw cell value. Only ``str`` values are modified.

    Returns:
        The text before the separator with surrounding whitespace removed,
        or the input unchanged when it is not a string.
    """
    if not isinstance(author, str):
        return author
    # Separator = hyphen with whitespace (or a string boundary) on both sides
    name_part = re.split(r'(?:^|\s)-(?:\s|$)', author, maxsplit=1)[0]
    return name_part.strip()

# Clean every value of the author column, then preview the result
df['article_author'] = df['article_author'].map(clean_article_author)
df.head(n=5)

Unnamed: 0,article_title,article_author,article_publication_date,article_content
0,"Diresmikan Jokowi, PTPP Rampungkan Proyek Jala...",Hana Nushratu,20-10-2024,PT PP (Persero) Tbk sebagai salah satu BUMN Ko...
1,Agenda Jokowi Usai Pulang ke Solo: Tidur,Tara Wahyu NV,20-10-2024,"Presiden ke-7 RI, Joko Widodo (Jokowi) mengaku..."
2,Kepulangan Jokowi ke Solo Disambut Warga hingg...,Hana Nushratu,20-10-2024,Presiden Republik Indonesia periode 2014-2024 ...
3,Disambut Antusias Masyarakat Saat Pulang ke So...,Tara Wahyu NV,20-10-2024,Presiden ke-7 Republik Indonesia (RI) Joko Wid...
4,Sejumlah Calon Kepala Daerah Bertemu Jokowi di...,Tara Wahyu NV,20-10-2024,Sejumlah calon kepala daerah bertemu Presiden ...


In [25]:
# Function to clean the `article_content` column from ads and unnecessary content
def clean_article_content(content):
    """Remove embedded media markers and video teasers from an article body.

    Three kinds of fragments are deleted, in this order:
    ``[Gambas:...]`` markers, ``Lihat Video '...':`` teasers and
    ``Simak juga Video '...':`` teasers. Non-string values are returned
    unchanged.
    """
    if not isinstance(content, str):
        return content

    noise_patterns = (
        r"\[Gambas:.*?\]",            # embedded image/video marker
        r"Lihat Video '.*?':",        # "Watch Video" teaser
        r"Simak juga Video '.*?':",   # "Also Watch Video" teaser
    )
    cleaned = content
    for pattern in noise_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned

# Clean every value of the content column, then preview the result
df['article_content'] = df['article_content'].map(clean_article_content)
df.head(n=5)

Unnamed: 0,article_title,article_author,article_publication_date,article_content
0,"Diresmikan Jokowi, PTPP Rampungkan Proyek Jala...",Hana Nushratu,20-10-2024,PT PP (Persero) Tbk sebagai salah satu BUMN Ko...
1,Agenda Jokowi Usai Pulang ke Solo: Tidur,Tara Wahyu NV,20-10-2024,"Presiden ke-7 RI, Joko Widodo (Jokowi) mengaku..."
2,Kepulangan Jokowi ke Solo Disambut Warga hingg...,Hana Nushratu,20-10-2024,Presiden Republik Indonesia periode 2014-2024 ...
3,Disambut Antusias Masyarakat Saat Pulang ke So...,Tara Wahyu NV,20-10-2024,Presiden ke-7 Republik Indonesia (RI) Joko Wid...
4,Sejumlah Calon Kepala Daerah Bertemu Jokowi di...,Tara Wahyu NV,20-10-2024,Sejumlah calon kepala daerah bertemu Presiden ...


In [26]:
# Display the dataset dimensions as a (rows, columns) tuple after preprocessing
df.shape

(639, 4)

In [27]:
# Column names, non-null counts, dtypes and memory usage after preprocessing
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 639 entries, 0 to 705
Data columns (total 4 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   article_title             639 non-null    object
 1   article_author            639 non-null    object
 2   article_publication_date  639 non-null    object
 3   article_content           639 non-null    object
dtypes: object(4)
memory usage: 25.0+ KB


In [28]:
# Display the first five rows after preprocessing
df.head()

Unnamed: 0,article_title,article_author,article_publication_date,article_content
0,"Diresmikan Jokowi, PTPP Rampungkan Proyek Jala...",Hana Nushratu,20-10-2024,PT PP (Persero) Tbk sebagai salah satu BUMN Ko...
1,Agenda Jokowi Usai Pulang ke Solo: Tidur,Tara Wahyu NV,20-10-2024,"Presiden ke-7 RI, Joko Widodo (Jokowi) mengaku..."
2,Kepulangan Jokowi ke Solo Disambut Warga hingg...,Hana Nushratu,20-10-2024,Presiden Republik Indonesia periode 2014-2024 ...
3,Disambut Antusias Masyarakat Saat Pulang ke So...,Tara Wahyu NV,20-10-2024,Presiden ke-7 Republik Indonesia (RI) Joko Wid...
4,Sejumlah Calon Kepala Daerah Bertemu Jokowi di...,Tara Wahyu NV,20-10-2024,Sejumlah calon kepala daerah bertemu Presiden ...


In [29]:
# Write the preprocessed data to a tab-separated, UTF-8 encoded file (header row kept, index omitted)
try:
    df.to_csv(
        'detikcom_preprocessed_news_data.csv',
        sep='\t',
        encoding='utf-8',
        index=False,
        header=True,
    )
except Exception as exc:
    # Report the failure instead of raising
    print(f'Error occured while saved article data to csv: {exc}')
else:
    print('Success saved article data to csv')

Success saved article data to csv
