Mina imports

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.datasets import fetch_openml
import joblib

MNIST-datasetet från OpenML

In [None]:
# Fetch version 1 of the 'mnist_784' dataset from OpenML.
mnist = fetch_openml(name='mnist_784', version=1)

# Features as a NumPy array; labels cast to int (presumably OpenML delivers
# them as strings/categories — TODO confirm).
X, y = mnist.data.to_numpy(), mnist.target.astype(int)

Normaliserar datan och delar upp den i 70 % träning, 15 % validering och 15 % test.

In [None]:
# Divide by 255 (presumably the maximum pixel value) so features land in [0, 1].
# The scaled array is rebuilt from `mnist.data` rather than from `X` itself so
# that this cell is idempotent: running it twice no longer scales the data twice.
X = mnist.data.to_numpy() / 255.0

# First split: 70 % training, 30 % held out.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Second split: the held-out 30 % is halved into validation and test (15 % each).
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

Skapar och tränar en Random Forest-modell

In [None]:
# Random Forest with 100 trees; `fit` returns the estimator, so construction
# and training are chained.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on both held-out splits up front, then score and report each.
rf_val_predictions, rf_test_predictions = (
    rf_model.predict(split) for split in (X_val, X_test)
)

rf_val_accuracy = accuracy_score(y_val, rf_val_predictions)
rf_test_accuracy = accuracy_score(y_test, rf_test_predictions)

print(f"Validation accuracy for Random Forest model: {rf_val_accuracy}")
print(f"Test accuracy for Random Forest model: {rf_test_accuracy}")

Validation accuracy for Random Forest model: 0.9668571428571429
Test accuracy for Random Forest model: 0.9667619047619047


Skapar och tränar en Logistic Regression-modell

In [None]:
# Multinomial logistic regression trained with lbfgs.
# - `max_iter` is raised from 100: the previous run stopped at the iteration
#   limit with a convergence warning, i.e. the reported model was not converged.
# - `multi_class` is left at its default: with lbfgs and a multiclass target the
#   default already fits a multinomial model, and the explicit argument is
#   deprecated in recent scikit-learn releases.
lr_model = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)
lr_model.fit(X_train, y_train)

# Accuracy on the validation split.
lr_val_predictions = lr_model.predict(X_val)
lr_val_accuracy = accuracy_score(y_val, lr_val_predictions)
print(f"Validation accuracy for Logistic Regression model: {lr_val_accuracy}")

# Accuracy on the test split.
lr_test_predictions = lr_model.predict(X_test)
lr_test_accuracy = accuracy_score(y_test, lr_test_predictions)
print(f"Test accuracy for Logistic Regression model: {lr_test_accuracy}")



Validation accuracy for Logistic Regression model: 0.9245714285714286
Test accuracy for Logistic Regression model: 0.9194285714285715


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Skapar och tränar en MLP-modell

In [None]:
# Multi-layer perceptron: three hidden layers (256 -> 128 -> 64 units), ReLU
# activations, Adam optimiser, capped at 300 training iterations.
mlp_model = MLPClassifier(
    hidden_layer_sizes=(256, 128, 64),
    activation='relu',
    solver='adam',
    max_iter=300,
    random_state=42,
)
mlp_model.fit(X_train, y_train)

# Predictions for both held-out splits.
mlp_val_predictions = mlp_model.predict(X_val)
mlp_test_predictions = mlp_model.predict(X_test)

# Score and report validation first, then test.
mlp_val_accuracy = accuracy_score(y_val, mlp_val_predictions)
print(f"Validation accuracy for MLP model: {mlp_val_accuracy}")

mlp_test_accuracy = accuracy_score(y_test, mlp_test_predictions)
print(f"Test accuracy for MLP model: {mlp_test_accuracy}")

Validation accuracy for MLP model: 0.9795238095238096
Test accuracy for MLP model: 0.9798095238095238


Skapar och tränar en XGBoost-modell. För detta behövde jag omvandla datan till DMatrix-formatet som XGBoost förväntar sig samt ange standardparametrar för modellen.

In [None]:
# Wrap each split in the DMatrix container that the native xgboost API consumes.
dtrain, dval, dtest = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Booster settings: 10-class softmax objective, so `predict` yields class ids
# directly rather than probabilities.
# NOTE(review): no `evals` list is passed to `xgb.train` below, so
# 'eval_metric' presumably has no visible effect here — confirm.
params = {
    'objective': 'multi:softmax',
    'num_class': 10,
    'max_depth': 6,
    'eta': 0.3,
    'eval_metric': 'merror'
}

# Train for 10 boosting rounds.
bst_model = xgb.train(params, dtrain, num_boost_round=10)

# Predict on both held-out splits, then score and report each.
xgb_val_predictions = bst_model.predict(dval)
xgb_test_predictions = bst_model.predict(dtest)

xgb_val_accuracy = accuracy_score(y_val, xgb_val_predictions)
print(f"Validation accuracy for XGBoost model: {xgb_val_accuracy}")

xgb_test_accuracy = accuracy_score(y_test, xgb_test_predictions)
print(f"Test accuracy for XGBoost model: {xgb_test_accuracy}")

Validation accuracy for XGBoost model: 0.9414285714285714
Test accuracy for XGBoost model: 0.9431428571428572


Sammanfattning av resultaten

In [None]:
# Accuracy of every model, keyed by "<model> (<split>)"; validation entries
# come first, then test entries, and that insertion order drives the printout.
results = {
    # validation split
    'Random Forest (Validation)': rf_val_accuracy,
    'Logistic Regression (Validation)': lr_val_accuracy,
    'MLP (Validation)': mlp_val_accuracy,
    'XGBoost (Validation)': xgb_val_accuracy,
    # test split
    'Random Forest (Test)': rf_test_accuracy,
    'Logistic Regression (Test)': lr_test_accuracy,
    'MLP (Test)': mlp_test_accuracy,
    'XGBoost (Test)': xgb_test_accuracy
}

print("\nModel Comparison (Validation and Test Accuracy):")
# One line per entry, rounded to four decimals.
for label, score in results.items():
    print(f"{label}: {score:.4f}")



Model Comparison (Validation and Test Accuracy):
Random Forest (Validation): 0.9669
Logistic Regression (Validation): 0.9246
MLP (Validation): 0.9795
XGBoost (Validation): 0.9414
Random Forest (Test): 0.9668
Logistic Regression (Test): 0.9194
MLP (Test): 0.9798
XGBoost (Test): 0.9431


Här bestämmer koden vilken modell som presterat bäst baserat på accuracy (testnoggrannhet)

In [None]:
# Candidates as (display name, fitted model, test accuracy). They are compared
# in the listed order; on a tie the earlier entry is kept because the
# comparison below is strict.
# NOTE(review): the winner is chosen on *test* accuracy, so the test score that
# gets reported is also the selection criterion and is likely optimistic. The
# validation accuracies computed earlier are the conventional basis for
# model selection — consider switching.
candidates = (
    ('Random Forest', rf_model, rf_test_accuracy),
    ('Logistic Regression', lr_model, lr_test_accuracy),
    ('MLP', mlp_model, mlp_test_accuracy),
    ('XGBoost', bst_model, xgb_test_accuracy),
)

# Start with "no model"; a candidate must score strictly above 0 to be chosen.
best_model_name, best_model, best_accuracy = '', None, 0
for candidate_name, candidate_model, candidate_accuracy in candidates:
    if candidate_accuracy > best_accuracy:
        best_model_name = candidate_name
        best_model = candidate_model
        best_accuracy = candidate_accuracy

print(f"The best model is: {best_model_name} with a test accuracy of {best_accuracy:.4f}")

The best model is: MLP with a test accuracy of 0.9798


Sparar modellen, laddar sedan in den igen och testar med hjälp av en dummy-input att den fungerar

In [None]:
# Persist the selected model and read it back to verify the round trip.
MODEL_PATH = 'best_model1.pkl'
joblib.dump(best_model, MODEL_PATH)

# NOTE: joblib.load unpickles the file, which can execute arbitrary code; that
# is fine for the file written just above, but never load untrusted pickles.
model = joblib.load(MODEL_PATH)

# Smoke test: a single all-zero sample with 784 features.
dummy_input = np.zeros((1, 784), dtype=np.float64)

# The scikit-learn estimators accept a NumPy array directly, but a native
# xgboost Booster only predicts on a DMatrix. Without this wrapping the smoke
# test would report an error whenever XGBoost is the selected model.
model_input = xgb.DMatrix(dummy_input) if isinstance(model, xgb.Booster) else dummy_input

try:
    print("Test prediction:", model.predict(model_input))
except Exception as e:
    # Deliberately best-effort: report the failure rather than abort the notebook.
    print("Error:", e)

Test prediction: [5]
