# Task 1: Data Preprocessing for Machine Learning

## Library

In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [3]:
RAW_DATA_DIR = "data/raw"
PROCESSED_DATA_DIR = "data/processed"
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

In [4]:
def preprocess_iris():
    df = pd.read_csv(os.path.join(RAW_DATA_DIR, "iris.csv"))
    df.to_csv(os.path.join(PROCESSED_DATA_DIR, "iris_processed.csv"), index=False)

In [5]:
def preprocess_stock_prices():
    df = pd.read_csv(os.path.join(RAW_DATA_DIR, "Stock Prices Data Set.csv"))
    df['date'] = pd.to_datetime(df['date'])
    df[['open', 'high', 'low', 'close', 'volume']] = df[['open', 'high', 'low', 'close', 'volume']].astype(float)
    df.to_csv(os.path.join(PROCESSED_DATA_DIR, "stock_prices_processed.csv"), index=False)

In [6]:
def preprocess_sentiment():
    df = pd.read_csv(os.path.join(RAW_DATA_DIR, "Sentiment dataset.csv"))
    df = df.drop(columns=['Unnamed: 0'])
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    df.to_csv(os.path.join(PROCESSED_DATA_DIR, "sentiment_processed.csv"), index=False)

In [7]:
def preprocess_house():
    df = pd.read_csv(os.path.join(RAW_DATA_DIR, "house Prediction Data Set.csv"), delim_whitespace=True, header=None)
    df.to_csv(os.path.join(PROCESSED_DATA_DIR, "house_processed.csv"), index=False)

In [10]:
def preprocess_churn(filename, output_filename):
    df = pd.read_csv(os.path.join(RAW_DATA_DIR, filename))

    # Debugging: Pastikan kolom sesuai ekspektasi
    print("Kolom dalam dataset:", df.columns.tolist())

    # Bersihkan nama kolom dari spasi tersembunyi
    df.columns = df.columns.str.strip()

    categorical_features = ['State', 'International plan', 'Voice mail plan']
    numerical_features = df.select_dtypes(include=['int64', 'float64']).columns

    # Pastikan 'Churn' ada sebelum drop
    if 'Churn' in numerical_features:
        numerical_features = numerical_features.drop('Churn')

    if 'Churn' not in df.columns:
        raise KeyError("Kolom 'Churn' tidak ditemukan dalam dataset!")

    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

    X = df.drop(columns=['Churn'])
    y = df['Churn']

    # Langsung transformasi tanpa split ulang
    X_processed = preprocessor.fit_transform(X)

    pd.DataFrame(X_processed).to_csv(os.path.join(PROCESSED_DATA_DIR, f"{output_filename}_X.csv"), index=False)
    pd.DataFrame(y).to_csv(os.path.join(PROCESSED_DATA_DIR, f"{output_filename}_y.csv"), index=False)

    print(f"Preprocessing selesai untuk {filename}. File disimpan di {PROCESSED_DATA_DIR}")

In [None]:
preprocess_iris()
preprocess_stock_prices()
preprocess_sentiment()
preprocess_house()
preprocess_churn("churn-bigml-20.csv", "churn_20_processed")
preprocess_churn("churn-bigml-80.csv", "churn_80_processed")

print("Preprocessing is complete. File saved in data/processed")

  df = pd.read_csv(os.path.join(RAW_DATA_DIR, "house Prediction Data Set.csv"), delim_whitespace=True, header=None)


Kolom dalam dataset: ['State', 'Account length', 'Area code', 'International plan', 'Voice mail plan', 'Number vmail messages', 'Total day minutes', 'Total day calls', 'Total day charge', 'Total eve minutes', 'Total eve calls', 'Total eve charge', 'Total night minutes', 'Total night calls', 'Total night charge', 'Total intl minutes', 'Total intl calls', 'Total intl charge', 'Customer service calls', 'Churn']
Preprocessing selesai untuk churn-bigml-20.csv. File disimpan di data/processed
Kolom dalam dataset: ['State', 'Account length', 'Area code', 'International plan', 'Voice mail plan', 'Number vmail messages', 'Total day minutes', 'Total day calls', 'Total day charge', 'Total eve minutes', 'Total eve calls', 'Total eve charge', 'Total night minutes', 'Total night calls', 'Total night charge', 'Total intl minutes', 'Total intl calls', 'Total intl charge', 'Customer service calls', 'Churn']
Preprocessing selesai untuk churn-bigml-80.csv. File disimpan di data/processed
Preprocessing is