In [61]:
import sys
import os
sys.path.append('/content')  # Pastikan Colab bisa mengimpor dari folder /content
!mkdir -p utils tests
!touch utils/__init__.py tests/__init__.py

In [62]:
%%writefile utils/extract.py
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime

def extract_data():
    product_data = []

    for page in range(1, 51):  # Di Colab, batasi halaman untuk testing
        if page == 1:
          url = "https://fashion-studio.dicoding.dev"
        else:
          url = f"https://fashion-studio.dicoding.dev/page{page}"
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"[ERROR] Gagal mengambil halaman {page}: {e}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        product_cards = soup.find_all('div', class_='collection-card')

        if not product_cards:
            print(f"[INFO] Tidak ada produk pada halaman {page}.")
            break

        timestamp_now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        for product in product_cards:
            try:
                title = product.find('h3', class_='product-title').text.strip() if product.find('h3', class_='product-title') else 'N/A'
                price = product.find('span', class_='price').text.strip() if product.find('span', class_='price') else 'N/A'
                rating = product.find('p', style="font-size: 14px; color: #777;").text.strip().replace('Rating: ', '') if product.find('p', style="font-size: 14px; color: #777;") else 'N/A'
                color = product.find('p', string=lambda text: 'Color' in text if text else False).text.strip().replace('Color: ', '') if product.find('p', string=lambda text: 'Color' in text if text else False) else 'N/A'
                size = product.find('p', string=lambda text: 'Size' in text if text else False).text.strip().replace('Size: ', '') if product.find('p', string=lambda text: 'Size' in text if text else False) else 'N/A'
                gender = product.find('p', string=lambda text: 'Gender' in text if text else False).text.strip().replace('Gender: ', '') if product.find('p', string=lambda text: 'Gender' in text if text else False) else 'N/A'

                product_data.append({
                    'Title': title,
                    'Price': price,
                    'Rating': rating,
                    'Color': color,
                    'Size': size,
                    'Gender': gender,
                    'Timestamp': timestamp_now
                })
            except Exception as e:
                print(f"[WARNING] Gagal parsing produk di halaman {page}: {e}")
                continue

        time.sleep(1)

    return pd.DataFrame(product_data)

if __name__ == "__main__":
    df = extract_data()
    df.to_csv("extracted_product_data.csv", index=False)
    print(f"Total produk yang diambil: {len(df)}")

Writing utils/extract.py


In [63]:
!python3 utils/extract.py

Total produk yang diambil: 1000


In [64]:
import pandas as pd

df = pd.read_csv("extracted_product_data.csv")
df.head()


Unnamed: 0,Title,Price,Rating,Color,Size,Gender,Timestamp
0,Unknown Product,$100.00,⭐ Invalid Rating / 5,5 Colors,M,Men,2025-05-04 14:24:47
1,T-shirt 2,$102.15,⭐ 3.9 / 5,3 Colors,M,Women,2025-05-04 14:24:47
2,Hoodie 3,$496.88,⭐ 4.8 / 5,3 Colors,L,Unisex,2025-05-04 14:24:47
3,Pants 4,$467.31,⭐ 3.3 / 5,3 Colors,XL,Men,2025-05-04 14:24:47
4,Outerwear 5,$321.59,⭐ 3.5 / 5,3 Colors,XXL,Women,2025-05-04 14:24:47


In [65]:
%%writefile utils/transform.py
import pandas as pd

def transform_data(df_raw):
    df_raw.columns = df_raw.columns.str.strip().str.lower()
    cleaned_data = []

    for idx, record in df_raw.iterrows():
        try:
            # Validasi dan transformasi data
            title = str(record['title']).strip()
            price = float(str(record['price']).replace('$', '').replace(',', '').strip()) * 16000
            rating = float(str(record['rating']).split('/')[0].replace('⭐', '').strip())

            # PERBAIKAN: Tanda kurung yang benar untuk ekstraksi color
            color_str = str(record['color'])
            color = int(''.join(filter(str.isdigit, color_str))) if any(c.isdigit() for c in color_str) else 0

            size = str(record['size']).strip().upper()
            gender = str(record['gender']).strip().title()

            cleaned_data.append({
                'title': title,
                'price': price,
                'rating': rating,
                'color': color,
                'size': size,
                'gender': gender,
                'timestamp': pd.to_datetime(record['timestamp'])
            })
        except Exception as e:
            print(f"[WARNING] Error pada record {idx}: {str(e)}")
            continue

    return pd.DataFrame(cleaned_data).drop_duplicates().dropna()

if __name__ == "__main__":
    df_raw = pd.read_csv("extracted_product_data.csv")
    df_clean = transform_data(df_raw)
    df_clean.to_csv("cleaned_product_data.csv", index=False)
    print(f"Data berhasil ditransformasi: {len(df_clean)} record")

Writing utils/transform.py


In [66]:
!python3 utils/transform.py

Data berhasil ditransformasi: 867 record


In [67]:
import pandas as pd

df = pd.read_csv("cleaned_product_data.csv")
df.head()


Unnamed: 0,title,price,rating,color,size,gender,timestamp
0,T-shirt 2,1634400.0,3.9,3,M,Women,2025-05-04 14:24:47
1,Hoodie 3,7950080.0,4.8,3,L,Unisex,2025-05-04 14:24:47
2,Pants 4,7476960.0,3.3,3,XL,Men,2025-05-04 14:24:47
3,Outerwear 5,5145440.0,3.5,3,XXL,Women,2025-05-04 14:24:47
4,Jacket 6,2453920.0,3.3,3,S,Unisex,2025-05-04 14:24:47


In [68]:
%%writefile utils/load.py
import pandas as pd

def load_to_csv(df, output_path="final_product_data.csv"):
    try:
        df.to_csv(output_path, index=False)
        print(f"Data berhasil disimpan di: {output_path}")
        return True
    except Exception as e:
        print(f"Gagal menyimpan data: {e}")
        return False

if __name__ == "__main__":
    df = pd.read_csv("cleaned_product_data.csv")
    load_to_csv(df)



Writing utils/load.py


In [69]:
!python3 utils/load.py

Data berhasil disimpan di: final_product_data.csv


In [70]:
%%writefile main.py
import pandas as pd
from utils.extract import extract_data
from utils.transform import transform_data
from utils.load import load_to_csv
import logging
from datetime import datetime

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('etl_pipeline.log'),
        logging.StreamHandler()
    ]
)

def run_etl_pipeline():
    """Main function to execute the ETL pipeline"""
    try:
        # Extraction phase
        logging.info("Starting data extraction...")
        start_time = datetime.now()

        raw_df = extract_data()
        if raw_df.empty:
            raise ValueError("No data was extracted")
        logging.info(f"Successfully extracted {len(raw_df)} records")

        # Transformation phase
        logging.info("Starting data transformation...")
        transformed_df = transform_data(raw_df)
        if transformed_df.empty:
            raise ValueError("No data after transformation")
        logging.info(f"Successfully transformed {len(transformed_df)} records")

        # Loading phase
        logging.info("Starting data loading...")
        if load_to_csv(transformed_df):
            logging.info("Data successfully loaded to CSV")
        else:
            raise RuntimeError("Failed to load data to CSV")

        end_time = datetime.now()
        duration = end_time - start_time
        logging.info(f"ETL pipeline completed successfully in {duration.total_seconds():.2f} seconds")

    except Exception as e:
        logging.error(f"ETL pipeline failed: {str(e)}")
        raise

if __name__ == "__main__":
    run_etl_pipeline()

Writing main.py


In [71]:
%%writefile tests/test_extract.py
import unittest
import pandas as pd
from utils.extract import extract_data

class TestExtract(unittest.TestCase):
    def test_extract_data_returns_dataframe(self):
        df = extract_data()
        self.assertIsInstance(df, pd.DataFrame)

    def test_extract_data_not_empty(self):
        df = extract_data()
        self.assertGreater(len(df), 0, "Data hasil ekstraksi tidak boleh kosong")


Writing tests/test_extract.py


In [72]:
%%writefile tests/test_transform.py
import unittest
import pandas as pd
from utils.extract import extract_data
from utils.transform import transform_data

class TestTransform(unittest.TestCase):
    def test_transform_data_returns_dataframe(self):
        raw_df = extract_data()
        cleaned_df = transform_data(raw_df)
        self.assertIsInstance(cleaned_df, pd.DataFrame)

    def test_transform_data_valid_structure(self):
        raw_df = extract_data()
        cleaned_df = transform_data(raw_df)
        self.assertGreater(len(cleaned_df), 0, "Data hasil transformasi tidak boleh kosong")
        self.assertIn('price', cleaned_df.columns)
        self.assertTrue(cleaned_df['price'].dtype in [float, int])


Writing tests/test_transform.py


In [73]:
%%writefile tests/test_load.py
import unittest
import pandas as pd
import os
from utils.load import load_to_csv

class TestLoad(unittest.TestCase):
    def setUp(self):
        self.dummy_df = pd.DataFrame({
            'title': ['Test Product'],
            'price': [100000.0],
            'rating': [4.5],
            'color': [1],
            'size': ['M'],
            'gender': ['Unisex'],
            'timestamp': [pd.Timestamp.now()]
        })
        self.output_path = "test_output.csv"

    def test_load_to_csv_success(self):
        result = load_to_csv(self.dummy_df, self.output_path)
        self.assertTrue(result)
        self.assertTrue(os.path.exists(self.output_path))

    def tearDown(self):
        if os.path.exists(self.output_path):
            os.remove(self.output_path)


Writing tests/test_load.py


In [74]:
!python3 -m unittest discover -s tests


..Data berhasil disimpan di: test_output.csv
.
----------------------------------------------------------------------
Ran 5 tests in 224.650s

OK


In [75]:
!pip install pytest coverage




In [76]:
!coverage run -m unittest discover -s tests


..Data berhasil disimpan di: test_output.csv
.
----------------------------------------------------------------------
Ran 5 tests in 231.456s

OK


In [77]:
!coverage report


Name                      Stmts   Miss  Cover
---------------------------------------------
tests/test_extract.py        10      0   100%
tests/test_load.py           15      0   100%
tests/test_transform.py      15      0   100%
utils/__init__.py             0      0   100%
utils/extract.py             41     11    73%
utils/load.py                12      5    58%
utils/transform.py           23      4    83%
---------------------------------------------
TOTAL                       116     20    83%


In [78]:
%%writefile submission.txt
1. Pastikan semua skrip pengujian sudah diupload ke Google Colab, termasuk:

> tests/test_extract.py

> tests/test_transform.py

> tests/test_load.py

2. Untuk menjalankan unit test menggunakan unittest, jalankan perintah berikut di Google Colab:

!python3 -m unittest discover -s tests

3. Perintah ini akan mencari semua file dengan awalan test_ di dalam folder tests/ dan menjalankan unit test yang ada.

4. Jika Anda ingin menjalankan file test secara spesifik, misalnya test_extract.py, jalankan:
> !python3 tests/test_extract.py
> !python3 tests/test_transform.py
> !python3 tests/test_load.py

5. Cara Menjalankan Coverage
!pip install pytest coverage
!coverage run -m unittest discover -s tests
!coverage report


Writing submission.txt


In [79]:
!pip freeze requirements.txt

absl-py==1.4.0
accelerate==1.6.0
aiohappyeyeballs==2.6.1
aiohttp==3.11.15
aiosignal==1.3.2
alabaster==1.0.0
albucore==0.0.24
albumentations==2.0.6
ale-py==0.11.0
altair==5.5.0
annotated-types==0.7.0
anyio==4.9.0
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
array_record==0.7.2
arviz==0.21.0
astropy==7.0.1
astropy-iers-data==0.2025.4.28.0.37.27
astunparse==1.6.3
atpublic==5.1
attrs==25.3.0
audioread==3.0.1
autograd==1.7.0
babel==2.17.0
backcall==0.2.0
backports.tarfile==1.2.0
beautifulsoup4==4.13.4
betterproto==2.0.0b6
bigframes==2.1.0
bigquery-magics==0.9.0
bleach==6.2.0
blinker==1.9.0
blis==1.3.0
blosc2==3.3.1
bokeh==3.7.2
Bottleneck==1.4.2
bqplot==0.12.44
branca==0.8.1
build==1.2.2.post1
CacheControl==0.14.2
cachetools==5.5.2
catalogue==2.0.10
certifi==2025.4.26
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.1
chex==0.1.89
clarabel==0.10.0
click==8.1.8
cloudpathlib==0.21.0
cloudpickle==3.1.1
cmake==3.31.6
cmdstanpy==1.2.5
colorcet==3.1.0
colorlover==0.3.0
colour==0.1.5
commun

In [80]:
!pip freeze > requirements.txt
