In [4]:
# 1. Data Collection and Import
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the iris dataset as an example
data = datasets.load_iris()
X = data.data
y = data.target

# 3. Data Splitting
# Hold out 20% of the samples for evaluation. The fixed random_state keeps the
# train/test partition identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create pipeline
# Steps run in order: impute -> scale -> classify. Wrapping them in a Pipeline
# means every step is fitted by the single fit() call below.
pipeline = Pipeline(
    [
        # 2. Data Preprocessing
        # 2.1 Handling missing values
        ('simple_imputer', SimpleImputer(strategy='mean')),

        # 2.2 Feature Engineering
        # No feature engineering

        # 2.3 Scaling/Normalization
        ('standard_scaler', StandardScaler()),

        # 4. Model Selection
        # Seed the forest: it is a randomized estimator, so without a fixed
        # random_state the fitted model (and the accuracy printed below) can
        # change from one run to the next.
        ('ourestimator', RandomForestClassifier(random_state=42))
    ]
)

# 5. Training
# Fit on the training split only, so the imputer and scaler statistics are
# learned without seeing the held-out samples.
pipeline.fit(X_train, y_train)

# 6. Evaluation
# Score the fitted pipeline on the held-out test split.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")


Accuracy: 100.00%


In [6]:
from sklearn.decomposition import PCA

# Create pipeline with PCA
# Same preprocessing and estimator as the baseline pipeline, with a PCA step
# inserted after scaling. Relies on X_train / X_test / y_train / y_test and the
# sklearn imports from the earlier cell.
pipeline_pca = Pipeline(
    [
        # 2. Data Preprocessing
        # 2.1 Handling missing values
        ('simple_imputer', SimpleImputer(strategy='mean')),

        # 2.3 Scaling/Normalization
        ('standard_scaler', StandardScaler()),

        # 2.4 Dimensionality Reduction
        # Project the scaled features onto a single principal component.
        ('pca', PCA(n_components=1)),

        # 4. Model Selection
        # Seed the forest so the comparison against the non-PCA pipeline is
        # reproducible and not blurred by run-to-run randomness in the model.
        ('ourestimator', RandomForestClassifier(random_state=42))
    ]
)

# 5. Training
# Fit on the training split only; PCA components are learned from it as well.
pipeline_pca.fit(X_train, y_train)

# 6. Evaluation
# NOTE: this rebinds y_pred, replacing any predictions stored under that name
# by an earlier cell.
y_pred = pipeline_pca.predict(X_test)
accuracy_pca = accuracy_score(y_test, y_pred)
print(f"Accuracy with PCA: {accuracy_pca*100:.2f}%")

Accuracy with PCA: 86.67%
