# Self Practice 1 - Data Scraping

___

## Material

- Pandas
- Beautifulsoup
- kaggle

___

## Import Library

In [9]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pytz
from kaggle.api.kaggle_api_extended import KaggleApi
import os

## Pandas

In [10]:
# Read local CSV file
data = pd.read_csv('../data/Data_Nasabah.csv', delimiter=';')

# Display the first 5 rows of the dataset
print(data.head(5))

  nasabah_id  umur jenis_kelamin  pendapatan  saldo_rata_rata  \
0       N001    22     Perempuan     5800000          1508000   
1       N002    64     Perempuan     5700000          1254000   
2       N003    27     Perempuan     2950000           590000   
3       N004    34     Perempuan     3100000           186000   
4       N005    45     Laki-Laki     6700000          1474000   

   jumlah_transaksi  jenis_produk  frekuensi_kunjungi_cabang  \
0                19      tabungan                          1   
1                 9  kartu_kredit                          2   
2                12      tabungan                          1   
3                16      deposito                          5   
4                15  kartu_kredit                          3   

  pengguna_mobile_banking  skor_kredit  
0                   TIDAK          900  
1                   TIDAK          900  
2                      YA          500  
3                   TIDAK          700  
4                  

## BeautifulSoup

In [11]:
# Set custom User-Agent header to mimic a normal web browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/114.0.0.0 Safari/537.36'
}

In [12]:
# Part 1: Scrape Kompas.com main page
url_main = 'https://www.kompas.com/'

response = requests.get(url_main, headers=headers)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    berita_utama = soup.find_all('h3', class_='most__title')

    # Extract article titles
    judul_berita = [berita.text.strip() for berita in berita_utama]

    # Store into a DataFrame
    df_berita = pd.DataFrame({'Judul Berita': judul_berita})

    print("Article Titles from the Main Page:")
    print(df_berita)
else:
    print(f'Failed to access URL. Status code: {response.status_code}')

Article Titles from the Main Page:
Empty DataFrame
Columns: [Judul Berita]
Index: []


In [13]:
# Part 2: Scrape a specific Kompas.com article page
url_article = 'https://www.kompas.com/global/read/2025/04/09/123149070/china-akan-larang-semua-film-dari-as-balas-tarif-impor-104-persen-trump'

response = requests.get(url_article, headers=headers)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract article title
    content_title = soup.find('h1').text.strip()

    # Extract publication date
    content_published_date = soup.find('meta', property='article:published_time')
    if content_published_date:
        utc_time = datetime.fromisoformat(content_published_date['content'].replace('Z', '+00:00'))
        wib_time = utc_time.astimezone(pytz.timezone('Asia/Jakarta'))
        published_date = wib_time.strftime('%d/%m/%Y, %H:%M WIB')
    else:
        published_date = "Publication date not found"

    # Extract article tags
    content_tags = soup.find('meta', {'name': 'keywords'})
    tags = content_tags['content'] if content_tags else "Tags not found"

    print("\nArticle Details:")
    print("Title:", content_title)
    print("Published Date:", published_date)
    print("Tags:", tags)

else:
    print(f'Failed to access URL. Status code: {response.status_code}')


Article Details:
Title: China Akan Larang Semua Film dari AS, Balas Tarif Impor 104 Persen Trump
Published Date: 09/04/2025, 12:31 WIB
Tags: tarif impor Amerika, tarif impor Trump, tarif impor as, Kementerian Luar Negeri China, Beijing, Amerika Serikat, Tarif impor Trump


## Kaggle

In [14]:
# Kaggle: Download dataset using API
api = KaggleApi()
api.authenticate()

In [15]:
dataset = 'juhibhojani/house-price'
download_path = '../data'

# Download and unzip dataset
api.dataset_download_files(dataset, path=download_path, unzip=True)
print(f"Dataset successfully downloaded and extracted to folder: {download_path}")

Dataset URL: https://www.kaggle.com/datasets/juhibhojani/house-price
Dataset successfully downloaded and extracted to folder: ../data


In [16]:
# List files in the download directory
files = os.listdir(download_path)
print("Contents of the folder:")
for f in files:
    print(f)

Contents of the folder:
Data_Nasabah.csv
house_prices.csv
