<a href="https://colab.research.google.com/github/Prdazk/collabku/blob/main/Uts_DataScience.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Upload file
from google.colab import files
uploaded = files.upload()

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectFromModel, RFE
import joblib
import warnings
warnings.filterwarnings('ignore')

# Load data
df = pd.read_csv('train_public.csv')
print(f"Dataset shape: {df.shape}")

Saving train_public.csv to train_public.csv
Dataset shape: (313, 28)


In [2]:
# Pisahkan features dan target
X = df.drop('label', axis=1)
y = df['label']

# Feature Engineering - tambahkan features baru berdasarkan domain knowledge
def create_features(df):
    df_eng = df.copy()

    # Interaction features
    if 'pulse' in df.columns and 'respiratory_rate' in df.columns:
        df_eng['pulse_resp_ratio'] = df['pulse'] / (df['respiratory_rate'] + 1)

    if 'packed_cell_volume' in df.columns and 'total_protein' in df.columns:
        df_eng['pcv_tp_ratio'] = df['packed_cell_volume'] / (df['total_protein'] + 1)

    # Age grouping
    if 'age' in df.columns:
        df_eng['age_group'] = pd.cut(df['age'], bins=[0, 5, 10, 15, 20], labels=[0, 1, 2, 3])

    # Temperature status
    if 'rectal_temperature' in df.columns:
        df_eng['temp_status'] = np.where(df['rectal_temperature'] > 38.5, 1,
                                       np.where(df['rectal_temperature'] < 37.5, -1, 0))

    # Pulse status
    if 'pulse' in df.columns:
        df_eng['pulse_status'] = np.where(df['pulse'] > 80, 1,
                                        np.where(df['pulse'] < 40, -1, 0))

    return df_eng

X_eng = create_features(X)
print(f"Shape setelah feature engineering: {X_eng.shape}")

Shape setelah feature engineering: (313, 32)


In [3]:
# Identifikasi kolom
numerical_features = X_eng.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_eng.select_dtypes(include=['object']).columns.tolist()

print(f"Numerical features: {len(numerical_features)}")
print(f"Categorical features: {len(categorical_features)}")

# Hapus kolom dengan terlalu banyak missing values
missing_percent = X_eng.isnull().sum() / len(X_eng) * 100
cols_to_drop = missing_percent[missing_percent > 60].index.tolist()
print(f"Kolom yang dihapus: {cols_to_drop}")

X_final = X_eng.drop(columns=cols_to_drop)
numerical_features = [col for col in numerical_features if col not in cols_to_drop]

# Advanced preprocessing pipeline
numerical_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=10, weights='distance')),
    ('scaler', RobustScaler()),  # Lebih robust terhadap outliers
    ('transformer', PowerTransformer(method='yeo-johnson'))  # Handle skewness
])

if categorical_features:
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
    ])
else:
    categorical_transformer = 'drop'

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

Numerical features: 31
Categorical features: 0
Kolom yang dihapus: ['nasogastric_reflux_ph', 'abdominocentesis_total_protein']


In [4]:
# Split data dengan stratifikasi
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.15, random_state=42, stratify=y  # Kurangi test size
)

print(f"Training: {X_train.shape}, Test: {X_test.shape}")

# Define advanced models
models = {
    'random_forest': RandomForestClassifier(
        n_estimators=300,
        max_depth=20,
        min_samples_split=3,
        min_samples_leaf=1,
        max_features='sqrt',
        bootstrap=True,
        random_state=42,
        n_jobs=-1
    ),
    'gradient_boosting': GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=8,
        min_samples_split=5,
        min_samples_leaf=2,
        subsample=0.8,
        random_state=42
    ),
    'extra_trees': ExtraTreesClassifier(
        n_estimators=300,
        max_depth=20,
        min_samples_split=3,
        min_samples_leaf=1,
        bootstrap=True,
        random_state=42,
        n_jobs=-1
    )
}

# Training dengan cross-validation
print("=== ADVANCED MODEL TRAINING ===")

best_model = None
best_accuracy = 0
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    print(f"\n--- Training {name} ---")

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    # Cross-validation
    cv_scores = cross_val_score(pipeline, X_final, y, cv=cv, scoring='accuracy', n_jobs=-1)

    # Train on full training set
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    print(f"CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print(f"Test Accuracy: {test_accuracy:.4f}")

    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        best_model = pipeline
        best_model_name = name

print(f"\nBest single model: {best_model_name} - {best_accuracy:.4f}")

Training: (266, 30), Test: (47, 30)
=== ADVANCED MODEL TRAINING ===

--- Training random_forest ---
CV Accuracy: 0.8720 (+/- 0.1066)
Test Accuracy: 0.8298

--- Training gradient_boosting ---
CV Accuracy: 0.8561 (+/- 0.1027)
Test Accuracy: 0.8085

--- Training extra_trees ---
CV Accuracy: 0.8656 (+/- 0.0814)
Test Accuracy: 0.8511

Best single model: extra_trees - 0.8511


In [5]:
# Buat Voting Classifier dengan model terbaik
print("\n=== BUILDING VOTING ENSEMBLE ===")

# Definisikan base models untuk voting
rf = RandomForestClassifier(
    n_estimators=300, max_depth=20, min_samples_split=3,
    min_samples_leaf=1, random_state=42, n_jobs=-1
)

gb = GradientBoostingClassifier(
    n_estimators=200, learning_rate=0.05, max_depth=8,
    min_samples_split=5, random_state=42
)

et = ExtraTreesClassifier(
    n_estimators=300, max_depth=20, min_samples_split=3,
    random_state=42, n_jobs=-1
)

# Voting Classifier
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf),
        ('gb', gb),
        ('et', et)
    ],
    voting='soft',  # Soft voting biasanya lebih baik
    n_jobs=-1
)

# Pipeline dengan voting classifier
voting_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', voting_clf)
])

# Train voting classifier
print("Training Voting Ensemble...")
voting_pipeline.fit(X_train, y_train)

# Evaluate
y_pred_voting = voting_pipeline.predict(X_test)
voting_accuracy = accuracy_score(y_test, y_pred_voting)

print(f"Voting Ensemble Test Accuracy: {voting_accuracy:.4f}")

# Pilih model terbaik antara single dan ensemble
if voting_accuracy > best_accuracy:
    best_model = voting_pipeline
    best_accuracy = voting_accuracy
    print("✅ Voting Ensemble dipilih sebagai model terbaik")
else:
    print("✅ Single model dipilih sebagai model terbaik")


=== BUILDING VOTING ENSEMBLE ===
Training Voting Ensemble...
Voting Ensemble Test Accuracy: 0.7872
✅ Single model dipilih sebagai model terbaik


In [6]:
# Feature selection untuk meningkatkan performance
print("\n=== FEATURE SELECTION ===")

# Gunakan Random Forest untuk feature selection
feature_selector = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=42),
    threshold='median'
)

# Pipeline dengan feature selection
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selector', feature_selector),
    ('classifier', best_model.named_steps['classifier'])
])

# Train final model
final_pipeline.fit(X_train, y_train)
y_pred_final = final_pipeline.predict(X_test)
final_accuracy = accuracy_score(y_test, y_pred_final)

print(f"Final Model Accuracy: {final_accuracy:.4f}")

# Pilih pipeline terbaik
if final_accuracy > best_accuracy:
    best_model = final_pipeline
    best_accuracy = final_accuracy
    print("✅ Model dengan feature selection lebih baik")


=== FEATURE SELECTION ===
Final Model Accuracy: 0.8936
✅ Model dengan feature selection lebih baik


In [7]:
# Evaluasi final
print("\n=== FINAL EVALUATION ===")

# Cross-validation pada full dataset
final_cv_scores = cross_val_score(best_model, X_final, y, cv=5, scoring='accuracy')
print(f"Final CV Accuracy: {final_cv_scores.mean():.4f} (+/- {final_cv_scores.std() * 2:.4f})")

# Detailed evaluation
y_pred_proba = best_model.predict_proba(X_test)

print(f"\nFinal Test Accuracy: {best_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_final))

# Confidence scores
confidence_scores = np.max(y_pred_proba, axis=1)
print(f"Average confidence: {np.mean(confidence_scores):.4f}")
print(f"Min confidence: {np.min(confidence_scores):.4f}")


=== FINAL EVALUATION ===
Final CV Accuracy: 0.8784 (+/- 0.0604)

Final Test Accuracy: 0.8936

Classification Report:
              precision    recall  f1-score   support

           1       0.87      0.96      0.92        28
           2       0.94      0.79      0.86        19

    accuracy                           0.89        47
   macro avg       0.90      0.88      0.89        47
weighted avg       0.90      0.89      0.89        47

Average confidence: 0.8002
Min confidence: 0.5858


In [8]:
# Simpan model terbaik
joblib.dump(best_model, 'model.pkl')

# Verifikasi
loaded_model = joblib.load('model.pkl')
verify_pred = loaded_model.predict(X_test[:3])

print(f"\n✅ Model berhasil disimpan!")
print(f"✅ Final Accuracy: {best_accuracy:.4f}")
print(f"✅ Sample predictions: {verify_pred}")
print(f"✅ Actual values: {y_test[:3].values}")

# Download model
files.download('model.pkl')
print("📥 Model downloaded sebagai 'model.pkl'")


✅ Model berhasil disimpan!
✅ Final Accuracy: 0.8936
✅ Sample predictions: [1 2 1]
✅ Actual values: [1 2 1]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

📥 Model downloaded sebagai 'model.pkl'


In [9]:
# Strategy 1: Stacking Classifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Stacking classifier
estimators = [
    ('rf', RandomForestClassifier(n_estimators=300, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=200, random_state=42)),
    ('et', ExtraTreesClassifier(n_estimators=300, random_state=42))
]

stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(C=0.1, random_state=42),
    cv=5
)

stacking_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', stacking_clf)
])

stacking_pipeline.fit(X_train, y_train)
stacking_accuracy = accuracy_score(y_test, stacking_pipeline.predict(X_test))
print(f"Stacking Classifier Accuracy: {stacking_accuracy:.4f}")

if stacking_accuracy > best_accuracy:
    best_model = stacking_pipeline
    best_accuracy = stacking_accuracy
    joblib.dump(best_model, 'model.pkl')
    files.download('model.pkl')

Stacking Classifier Accuracy: 0.8298
