# Task 1: Data Preprocessing for Machine Learning

## Libraries

In [10]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Storage

In [11]:
# Storage directories (relative to the notebook's working directory):
# raw CSVs are read from RAW_DATA_DIR, and the train/test splits produced by
# the preprocess_* functions are written to PROCESSED_DATA_DIR.
RAW_DATA_DIR = "data/raw"
PROCESSED_DATA_DIR = "data/processed"
# Create the output directory up front; exist_ok=True makes re-running this cell a no-op.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

## Preprocessing Functions

In [12]:
def preprocess_iris():
    """Impute, standardize and split ``iris.csv`` into train/test CSV files.

    Reads ``iris.csv`` from ``RAW_DATA_DIR``, splits it 80/20 with a fixed
    seed, fills missing values in numeric columns with the *training* column
    means, standardizes the numeric columns with a ``StandardScaler`` fitted
    on the *training* rows only, and writes ``iris_train.csv`` and
    ``iris_test.csv`` to ``PROCESSED_DATA_DIR``.

    The split happens before any statistics are computed so that no
    information from the test rows leaks into the imputation values or the
    scaling parameters.

    Returns:
        None. The results are written to disk and a status line is printed.
    """
    df = pd.read_csv(os.path.join(RAW_DATA_DIR, "iris.csv"))

    # Split first: every statistic below must come from the training rows only.
    # .copy() gives independent frames so the column assignments below do not
    # operate on views of `df`.
    train, test = train_test_split(df, test_size=0.2, random_state=42)
    train, test = train.copy(), test.copy()

    # Handle missing values in numeric columns only.
    # NOTE(review): every numeric column is imputed and scaled; if the file
    # carries a numeric id or label column it is treated the same way.
    numeric_cols = df.select_dtypes(include=['number']).columns
    train_means = train[numeric_cols].mean()
    train[numeric_cols] = train[numeric_cols].fillna(train_means)
    test[numeric_cols] = test[numeric_cols].fillna(train_means)

    # Standardize numeric features: fit on train, reuse the same parameters on test.
    scaler = StandardScaler()
    train[numeric_cols] = scaler.fit_transform(train[numeric_cols])
    test[numeric_cols] = scaler.transform(test[numeric_cols])

    train.to_csv(os.path.join(PROCESSED_DATA_DIR, "iris_train.csv"), index=False)
    test.to_csv(os.path.join(PROCESSED_DATA_DIR, "iris_test.csv"), index=False)
    print(f"Preprocessing completed for `iris.csv`. Files saved in {PROCESSED_DATA_DIR}")

In [13]:
def preprocess_stock_prices():
    """Clean, standardize and split ``Stock Prices Data Set.csv``.

    Reads the file from ``RAW_DATA_DIR``, parses ``date`` as datetime, coerces
    the price/volume columns to numeric (unparseable values become NaN),
    splits the rows 80/20 with a fixed seed, fills missing numeric values with
    the *training* column means, standardizes the price/volume columns with a
    ``StandardScaler`` fitted on the *training* rows only, and writes
    ``stock_prices_train.csv`` / ``stock_prices_test.csv`` to
    ``PROCESSED_DATA_DIR``.

    Returns:
        None. The results are written to disk and a status line is printed.
    """
    df = pd.read_csv(os.path.join(RAW_DATA_DIR, "Stock Prices Data Set.csv"))

    df['date'] = pd.to_datetime(df['date'])
    numeric_cols = ['open', 'high', 'low', 'close', 'volume']
    # errors='coerce' turns non-numeric entries into NaN so they are imputed below.
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

    # Split before computing any statistics so the test rows do not influence
    # the imputation means or the scaler parameters.
    # NOTE(review): this is a random split of dated rows; if the data is used
    # for forecasting, a chronological split may be more appropriate - confirm.
    train, test = train_test_split(df, test_size=0.2, random_state=42)

    # Handle missing values: fill every numeric column with its training mean.
    # Non-numeric columns are left untouched.
    train_means = train.mean(numeric_only=True)
    train = train.fillna(train_means)
    test = test.fillna(train_means)

    # Standardize numeric features: fit on train, reuse the same parameters on test.
    scaler = StandardScaler()
    train[numeric_cols] = scaler.fit_transform(train[numeric_cols])
    test[numeric_cols] = scaler.transform(test[numeric_cols])

    train.to_csv(os.path.join(PROCESSED_DATA_DIR, "stock_prices_train.csv"), index=False)
    test.to_csv(os.path.join(PROCESSED_DATA_DIR, "stock_prices_test.csv"), index=False)
    print(f"Preprocessing completed for `Stock Prices Data Set.csv`. Files saved in {PROCESSED_DATA_DIR}")

In [14]:
def preprocess_sentiment():
    """Clean and split ``Sentiment dataset.csv`` into train/test CSV files.

    Reads the file from ``RAW_DATA_DIR``, drops the ``Unnamed: 0`` column when
    it is present, parses ``Timestamp`` as datetime, replaces missing values
    in the non-numeric, non-datetime columns with the string ``"Unknown"``,
    splits the rows 80/20 with a fixed seed, and writes ``sentiment_train.csv``
    / ``sentiment_test.csv`` to ``PROCESSED_DATA_DIR``.

    Missing values in numeric and datetime columns are left as NaN/NaT:
    writing the string ``"Unknown"`` into them would turn the whole column
    into mixed-type object data.

    Returns:
        None. The results are written to disk and a status line is printed.
    """
    df = pd.read_csv(os.path.join(RAW_DATA_DIR, "Sentiment dataset.csv"))
    # errors='ignore' keeps this working when the file has no such column.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    df['Timestamp'] = pd.to_datetime(df['Timestamp'])

    # Handle missing values: only columns that can hold a string placeholder.
    text_cols = df.select_dtypes(exclude=['number', 'datetime', 'datetimetz']).columns
    if len(text_cols) > 0:
        df[text_cols] = df[text_cols].fillna("Unknown")

    # Split data
    train, test = train_test_split(df, test_size=0.2, random_state=42)

    train.to_csv(os.path.join(PROCESSED_DATA_DIR, "sentiment_train.csv"), index=False)
    test.to_csv(os.path.join(PROCESSED_DATA_DIR, "sentiment_test.csv"), index=False)
    print(f"Preprocessing completed for `Sentiment dataset.csv`. Files saved in {PROCESSED_DATA_DIR}")

In [15]:
def preprocess_house():
    """Impute, standardize and split ``house Prediction Data Set.csv``.

    Reads the whitespace-delimited, header-less file from ``RAW_DATA_DIR``
    (columns are therefore labelled 0..n-1), splits the rows 80/20 with a
    fixed seed, fills missing values with the *training* column medians,
    standardizes every column with a ``StandardScaler`` fitted on the
    *training* rows only, and writes ``house_train.csv`` / ``house_test.csv``
    to ``PROCESSED_DATA_DIR``.

    Returns:
        None. The results are written to disk and a status line is printed.
    """
    # sep=r"\s+" is the supported spelling of the deprecated delim_whitespace=True.
    df = pd.read_csv(
        os.path.join(RAW_DATA_DIR, "house Prediction Data Set.csv"),
        sep=r"\s+",
        header=None,
    )

    # Split before computing any statistics so the test rows do not influence
    # the imputation medians or the scaler parameters.
    train, test = train_test_split(df, test_size=0.2, random_state=42)

    # Handle missing values with the training medians.
    train_medians = train.median()
    train = train.fillna(train_medians)
    test = test.fillna(train_medians)

    # Standardize numeric features: fit on train, reuse the same parameters on test.
    # NOTE(review): every column is scaled, which would include a target
    # column if the file contains one - confirm that is intended.
    scaler = StandardScaler()
    train = pd.DataFrame(scaler.fit_transform(train), columns=train.columns, index=train.index)
    test = pd.DataFrame(scaler.transform(test), columns=test.columns, index=test.index)

    train.to_csv(os.path.join(PROCESSED_DATA_DIR, "house_train.csv"), index=False)
    test.to_csv(os.path.join(PROCESSED_DATA_DIR, "house_test.csv"), index=False)
    print(f"Preprocessing completed for `house Prediction Data Set.csv`. Files saved in {PROCESSED_DATA_DIR}")

## Code Execution

In [16]:
# Run every dataset pipeline in order; each call writes its own train/test
# CSV pair and prints its own status line.
preprocess_iris()
preprocess_stock_prices()
preprocess_sentiment()
preprocess_house()

# Report the configured output directory rather than a hardcoded path so the
# message stays correct if PROCESSED_DATA_DIR is changed.
print(f"✅ Preprocessing is complete. Files saved in {PROCESSED_DATA_DIR}.")

Preprocessing completed for `iris.csv`. Files saved in data/processed
Preprocessing completed for `Stock Prices Data Set.csv`. Files saved in data/processed
Preprocessing completed for `Sentiment dataset.csv`. Files saved in data/processed
Preprocessing completed for `house Prediction Data Set.csv`. Files saved in data/processed
✅ Preprocessing is complete. Files saved in data/processed.


  df = pd.read_csv(os.path.join(RAW_DATA_DIR, "house Prediction Data Set.csv"), delim_whitespace=True, header=None)
