In [1]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

In [14]:
# Read the preprocessed graduation dataset from disk, then keep its first
# 800 rows as the working subset used by the cells below.
df_my_data = pd.read_csv(
    filepath_or_buffer="./student-graduation/graduation_dataset_processed.csv",
)
df_head = df_my_data.head(n=800)

In [15]:
# 📦 Model selection and hyperparameter grid
def get_model_and_params(model_type):
    """Return an unfitted classifier together with its hyperparameter grid.

    Supported values of ``model_type`` are ``"random_forest"``, ``"svm"`` and
    ``"xgboost"``. Every grid key carries the ``model__`` prefix, so the grid
    addresses a pipeline step named ``model`` when handed to a grid search.

    Returns
    -------
    tuple
        ``(model, param_grid)``: a freshly constructed estimator and a dict
        mapping parameter names to the list of candidate values.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported values.
    """
    if model_type == "random_forest":
        return RandomForestClassifier(), {
            'model__n_estimators': [50, 100],
            'model__max_depth': [None, 10, 20],
            'model__min_samples_split': [2, 5],
        }

    if model_type == "svm":
        return SVC(), {
            'model__C': [0.1, 1, 10],
            'model__kernel': ['linear', 'rbf'],
            'model__gamma': ['scale', 'auto'],
        }

    if model_type == "xgboost":
        # 'mlogloss' selects multiclass log loss as XGBoost's evaluation metric.
        return XGBClassifier(eval_metric='mlogloss'), {
            'model__n_estimators': [50, 100],
            'model__max_depth': [3, 6],
            'model__learning_rate': [0.01, 0.1],
        }

    # Nothing matched: reject the request instead of guessing a default.
    raise ValueError("Ukjent modelltype")

# 🧪 Pipeline function
def run_pipeline(
    data: pd.DataFrame,
    target_column: str,
    model_type: str,
    test_size: float = 0.2,
    random_state: int = 42,
    cv: int = 5,
    scoring: str = 'accuracy',
):
    """Tune a scaled classifier with grid search and score it on a hold-out set.

    The frame is split into features and target, then into train and test
    parts. A ``StandardScaler`` + model pipeline is tuned with
    ``GridSearchCV`` on the training part; the best pipeline is used to
    predict the test part, and the best parameters and test accuracy are
    printed.

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding the feature columns and the target column.
    target_column : str
        Name of the column to predict; every other column is a feature.
    model_type : str
        Key passed to ``get_model_and_params`` to pick estimator and grid.
    test_size : float, default 0.2
        Share of the rows held out for the final accuracy check.
    random_state : int, default 42
        Seed for the train/test split and for any estimator that exposes a
        ``random_state`` parameter, so repeated runs give the same scores.
    cv : int, default 5
        Cross-validation setting forwarded to ``GridSearchCV``.
    scoring : str, default 'accuracy'
        Scoring setting forwarded to ``GridSearchCV``.

    Returns
    -------
    The best estimator found by the grid search (the fitted pipeline).

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of ``data``.
    ValueError
        Propagated from ``get_model_and_params`` for an unknown ``model_type``.
    """
    # Fail early with a readable message. DataFrame.drop would raise the same
    # exception type for a missing column, so callers catching KeyError still work.
    if target_column not in data.columns:
        raise KeyError(f"Ukjent kolonne: {target_column!r}")

    X = data.drop(columns=[target_column])
    y = data[target_column]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model, param_grid = get_model_and_params(model_type)
    # The split is seeded, but an unseeded estimator would still make the
    # reported scores change from run to run. Seed it when it supports that.
    if "random_state" in model.get_params():
        model.set_params(random_state=random_state)

    # The step name 'model' must match the 'model__' prefix of the grid keys.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])

    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    print("âœ… Beste parametere:", grid_search.best_params_)
    print("ðŸ“Š Test accuracy:", acc)
    return best_model


In [16]:
# Example data: the iris measurements as a DataFrame with a 'target' column.
# NOTE(review): `df` is built here but is not what the pipeline call below uses.
from sklearn.datasets import load_iris
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

# Run the pipeline with a random forest on the graduation subset (`df_head`).
model = run_pipeline(df_head, 'Target', 'random_forest')


âœ… Beste parametere: {'model__max_depth': 20, 'model__min_samples_split': 2, 'model__n_estimators': 100}
ðŸ“Š Test accuracy: 0.73125
