# Task 1: Data Preprocessing for Machine Learning

## Library

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

## Storage

In [2]:
# Storage directory
RAW_DATA_DIR = "data/raw"
PROCESSED_DATA_DIR = "data/processed"
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

## Function

In [3]:
def preprocess_iris():
    df = pd.read_csv(os.path.join(RAW_DATA_DIR, "iris.csv"))

    # Handle missing values hanya pada kolom numerik
    numeric_cols = df.select_dtypes(include=['number']).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

    # Standardize numeric features
    scaler = StandardScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    # Split data
    train, test = train_test_split(df, test_size=0.2, random_state=42)

    train.to_csv(os.path.join(PROCESSED_DATA_DIR, "iris_train.csv"), index=False)
    test.to_csv(os.path.join(PROCESSED_DATA_DIR, "iris_test.csv"), index=False)
    print(f"Preprocessing completed for `iris.csv`. Files saved in {PROCESSED_DATA_DIR}")

In [4]:
def preprocess_stock_prices():
    df = pd.read_csv(os.path.join(RAW_DATA_DIR, "Stock Prices Data Set.csv"))

    df['date'] = pd.to_datetime(df['date'])
    numeric_cols = ['open', 'high', 'low', 'close', 'volume']
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

    # Handle missing values
    df.fillna(df.mean(numeric_only=True), inplace=True)

    # Standardize numeric features
    scaler = StandardScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    # Split data
    train, test = train_test_split(df, test_size=0.2, random_state=42)

    train.to_csv(os.path.join(PROCESSED_DATA_DIR, "stock_prices_train.csv"), index=False)
    test.to_csv(os.path.join(PROCESSED_DATA_DIR, "stock_prices_test.csv"), index=False)
    print(f"Preprocessing completed for `Stock Prices Data Set.csv`. Files saved in {PROCESSED_DATA_DIR}")

In [5]:
def preprocess_sentiment():
    df = pd.read_csv(os.path.join(RAW_DATA_DIR, "Sentiment dataset.csv"))
    df.drop(columns=['Unnamed: 0'], inplace=True)

    df['Timestamp'] = pd.to_datetime(df['Timestamp'])

    # Handle missing values
    df.fillna("Unknown", inplace=True)

    # Split data
    train, test = train_test_split(df, test_size=0.2, random_state=42)

    train.to_csv(os.path.join(PROCESSED_DATA_DIR, "sentiment_train.csv"), index=False)
    test.to_csv(os.path.join(PROCESSED_DATA_DIR, "sentiment_test.csv"), index=False)
    print(f"Preprocessing completed for `Sentiment dataset.csv`. Files saved in {PROCESSED_DATA_DIR}")

In [6]:
def preprocess_house():
    df = pd.read_csv(os.path.join(RAW_DATA_DIR, "house Prediction Data Set.csv"), delim_whitespace=True, header=None)

    # Handle missing values
    df.fillna(df.median(), inplace=True)

    # Standardize numeric features
    scaler = StandardScaler()
    df[df.columns] = scaler.fit_transform(df[df.columns])

    # Split data
    train, test = train_test_split(df, test_size=0.2, random_state=42)

    train.to_csv(os.path.join(PROCESSED_DATA_DIR, "house_train.csv"), index=False)
    test.to_csv(os.path.join(PROCESSED_DATA_DIR, "house_test.csv"), index=False)
    print(f"Preprocessing completed for `house Prediction Data Set.csv`. Files saved in {PROCESSED_DATA_DIR}")

In [7]:
def preprocess_churn(filename, output_filename):
    df = pd.read_csv(os.path.join(RAW_DATA_DIR, filename))

    # Debugging: Ensure columns are as expected
    print("Columns in the dataset:", df.columns.tolist())

    # Handle missing values only in numerical columns
    df.fillna(df.select_dtypes(include=['int64', 'float64']).median(), inplace=True)

    # Clear column names from hidden spaces
    df.columns = df.columns.str.strip()

    categorical_features = ['State', 'International plan', 'Voice mail plan']
    numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

    # Ensure 'Churn' exists before dropping
    if 'Churn' in numerical_features:
        numerical_features.remove('Churn')

    if 'Churn' not in df.columns:
        raise KeyError("The column 'Churn' is not found in the dataset!")

    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

    X = df.drop(columns=['Churn'])
    y = df['Churn']

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Apply transformations
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Save processed data
    pd.DataFrame(X_train_processed).to_csv(os.path.join(PROCESSED_DATA_DIR, f"{output_filename}_X_train.csv"), index=False)
    pd.DataFrame(X_test_processed).to_csv(os.path.join(PROCESSED_DATA_DIR, f"{output_filename}_X_test.csv"), index=False)
    pd.DataFrame(y_train).to_csv(os.path.join(PROCESSED_DATA_DIR, f"{output_filename}_y_train.csv"), index=False)
    pd.DataFrame(y_test).to_csv(os.path.join(PROCESSED_DATA_DIR, f"{output_filename}_y_test.csv"), index=False)

    print(f"Preprocessing completed for {filename}. Files saved in {PROCESSED_DATA_DIR}")

# Code Execution

In [8]:
# Code Execution
preprocess_iris()
preprocess_stock_prices()
preprocess_sentiment()
preprocess_house()
preprocess_churn("churn-bigml-20.csv", "churn_20_processed")
preprocess_churn("churn-bigml-80.csv", "churn_80_processed")

print("✅ Preprocessing is complete. Files saved in data/processed.")

Preprocessing completed for `iris.csv`. Files saved in data/processed
Preprocessing completed for `Stock Prices Data Set.csv`. Files saved in data/processed
Preprocessing completed for `Sentiment dataset.csv`. Files saved in data/processed


  df = pd.read_csv(os.path.join(RAW_DATA_DIR, "house Prediction Data Set.csv"), delim_whitespace=True, header=None)


Preprocessing completed for `house Prediction Data Set.csv`. Files saved in data/processed
Columns in the dataset: ['State', 'Account length', 'Area code', 'International plan', 'Voice mail plan', 'Number vmail messages', 'Total day minutes', 'Total day calls', 'Total day charge', 'Total eve minutes', 'Total eve calls', 'Total eve charge', 'Total night minutes', 'Total night calls', 'Total night charge', 'Total intl minutes', 'Total intl calls', 'Total intl charge', 'Customer service calls', 'Churn']
Preprocessing completed for churn-bigml-20.csv. Files saved in data/processed
Columns in the dataset: ['State', 'Account length', 'Area code', 'International plan', 'Voice mail plan', 'Number vmail messages', 'Total day minutes', 'Total day calls', 'Total day charge', 'Total eve minutes', 'Total eve calls', 'Total eve charge', 'Total night minutes', 'Total night calls', 'Total night charge', 'Total intl minutes', 'Total intl calls', 'Total intl charge', 'Customer service calls', 'Churn']
P