# ML Exercise

### 1. Load the dataset and split into training and test set.

In [1]:
import mnist_reader

X_train, y_train = mnist_reader.load_mnist('data/fashion', kind='train')
X_test, y_test = mnist_reader.load_mnist('data/fashion', kind='t10k')

### 2. Train a Random Forest classifier on the dataset and time how long it takes, then evaluate the resulting model on the test set

In [2]:
from sklearn.ensemble import RandomForestClassifier
import time
from sklearn.metrics import accuracy_score

model = RandomForestClassifier(n_estimators= 100)
model.fit(X_train, y_train)


# train the model on the reduced dataset and time the process
start_time = time.time()
model.fit(X_train, y_train)
end_time = time.time()

# measure the accuracy of the model on the test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# print the time taken to train the model and its accuracy
print("Time taken to train model: {:.2f} seconds".format(end_time - start_time))
print("Accuracy of the model on the test set: {:.2f}".format(accuracy))

Time taken to train model: 58.29 seconds
Accuracy of the model on the test set: 0.88


It takes 59 seconds to train this model.

In [3]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.8769

### 3. Next, use PCA to reduce the dataset's dimensionality

In [4]:
from sklearn.decomposition import PCA

# create a PCA object with an explained variance ratio of 0.95
pca = PCA(n_components=0.95)

# fit the PCA object to the training set
pca.fit(X_train)

# transform the training and test sets
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)


### 4. Train a new Random Forest classifier on the reduced dataset and measure how long it takes.

In [5]:
import time 

# create a new Random Forest classifier with 100 trees
model = RandomForestClassifier(n_estimators=100)

# train the model on the reduced dataset and time the process
start_time = time.time()
model.fit(X_train_pca, y_train)
end_time = time.time()

# measure the accuracy of the model on the test set
y_pred = model.predict(X_test_pca)
accuracy = accuracy_score(y_test, y_pred)

# print the time taken to train the model and its accuracy
print("Time taken to train model: {:.2f} seconds".format(end_time - start_time))
print("Accuracy of the model on the test set: {:.2f}".format(accuracy))


Time taken to train model: 69.93 seconds
Accuracy of the model on the test set: 0.85


### 5. Was training much faster?

### 6. Evaluate the classifier on the test set: how does it compare to the previous classifier?

### 7. Apply softmax regression (using the original dataset) and time how long it takes, then evaluate the resulting model on the test set.

In [8]:
from sklearn.linear_model import LogisticRegression
import time

# apply softmax regression and time how long it takes
start_time = time.time()
softmax = LogisticRegression(multi_class="multinomial", solver="lbfgs", max_iter=100)
softmax.fit(X_train, y_train)
end_time = time.time()
print("Time taken to train softmax regression model: {:.2f} seconds".format(end_time - start_time))

# evaluate the resulting model on the test set
y_pred_softmax = softmax.predict(X_test)
accuracy_softmax = accuracy_score(y_test, y_pred_softmax)
print("Softmax Regression: Test Set accuracy = {:.2f}".format(accuracy_softmax))


Time taken to train softmax regression model: 6.73 seconds
Softmax Regression: Test Set accuracy = 0.84


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### 8. Apply softmax regression (using the reduced dataset) and time how long it takes, then evaluate the resulting model on the test set.

In [9]:
from sklearn.linear_model import LogisticRegression
import time

# original code to load data and perform PCA
X_train, y_train = mnist_reader.load_mnist('data/fashion', kind='train')
X_test, y_test = mnist_reader.load_mnist('data/fashion', kind='t10k')

pca = PCA(n_components=95)
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)

# apply softmax regression and time how long it takes
start_time = time.time()
softmax_reduced = LogisticRegression(multi_class="multinomial", solver="lbfgs", max_iter=100)
softmax_reduced.fit(X_train_reduced, y_train)
end_time = time.time()
print("Time taken to train reduced softmax regression model: {:.2f} seconds".format(end_time - start_time))

# evaluate the resulting model on the test set
y_pred_softmax_reduced = softmax_reduced.predict(X_test_reduced)
accuracy_softmax_reduced = accuracy_score(y_test, y_pred_softmax_reduced)
print("Reduced Softmax Regression: Test Set accuracy = {:.2f}".format(accuracy_softmax_reduced))


Time taken to train reduced softmax regression model: 3.06 seconds
Reduced Softmax Regression: Test Set accuracy = 0.84


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
