In [7]:
import numpy as np
from ucimlrepo import fetch_ucirepo
from logistic_regression import LogisticRegression

In [18]:
rice = fetch_ucirepo(id=545)
dataset = rice["data"]["original"].to_numpy()

num_samples = dataset.shape[0]
num_features = dataset.shape[1] - 1
num_classes = len(np.unique(dataset[:,-1]))

print(f"Number of samples: {num_samples}") # 3810
print(f"Number of features: {num_features}") # 7
print(f"Number of classes: {num_classes}") # 2

mins = [None] * num_features
maxes = [None] * num_features

Number of samples: 3810
Number of features: 7
Number of classes: 2


In [19]:
def clip_feature(data, feature_index, min_val, max_val):
    for i in range(len(data)):
        if data[i][feature_index] < min_val:
            data[i][feature_index] = min_val
        elif data[i][feature_index] > max_val:
            data[i][feature_index] = max_val

In [20]:
def scale_features(data):
    for i in range(num_features):
        min_val = data.T[i].min()
        max_val = data.T[i].max()
        for d in range(N):
            data[d][i] = (data[d][i] - min_val) / (max_val - min_val)

        mins[i] = min_val
        maxes[i] = max_val


In [21]:
np.random.shuffle(dataset)

In [30]:
def split_data(data, ratio):
    split = int(ratio * len(data))
    return data[:split], data[split:]

(3048, 8)
(762, 8)


In [26]:
def k_fold_cross_validation(X, y, C_values, k=5):
    fold_size = num_samples // k

    # Shuffle indices
    indices = np.random.permutation(num_samples)

    # Initialize arrays to store scores for each value of C
    mean_scores = np.zeros(len(C_values))

    for i, C in enumerate(C_values):
        scores = []

        for j in range(k):
            # Split data into training and validation folds
            val_indices = indices[j * fold_size: (j + 1) * fold_size]
            train_indices = np.concatenate([indices[:j * fold_size], indices[(j + 1) * fold_size:]])

            X_train, X_val = X[train_indices], X[val_indices]
            y_train, y_val = y[train_indices], y[val_indices]

            # Train logistic regression model
            theta = np.zeros(X_train.shape[1])
            for _ in range(100):  # 100 iterations for simplicity
                predictions = LogisticRegression(X_train, y_train, theta)
                gradient = np.dot(X_train.T, (predictions - y_train)) / len(y_train)
                theta -= C * gradient

            # Predict on validation fold
            val_predictions = (LogisticRegression(X_val, y_val, theta) >= 0.5).astype(int)

            # Calculate accuracy
            score = (np.mean(y_val == val_predictions)) 
            scores.append(score)

        # Calculate mean score for current value of C
        mean_scores[i] = np.mean(scores)

    # Select optimal value of C
    optimal_C_index = np.argmax(mean_scores)
    optimal_C = C_values[optimal_C_index]

    return optimal_C

# Define candidate values for regularization parameter C
c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

y = dataset[:,-1]
optimal_C = k_fold_cross_validation(dataset, y, c_values, 5)

TypeError: LogisticRegression.__init__() takes 2 positional arguments but 4 were given