In [1]:
def read_csv(filename):
    #reads a CSV file and returns a list of lists (rows of data)
    data = []

    with open(filename, "r") as file:
        lines = file.readlines()

    #process each row manually
    for line in lines:
        row = line.strip().split(",")  
        data.append(row) 

    return data

In [3]:
filename = "/Users/revan/Downloads/new_nfldata_2.csv"

In [5]:
#load data
data = read_csv(filename)

In [7]:
#seperate column names and data
headers = data[0]
data = data[1:]

In [9]:
useful_features = ['targets', 'receptions', 'receiving_yards', 'receiving_air_yards',
'yards_after_catch', 'reception_td', 'reception_fumble_lost', 'PositionEncoded']

In [11]:
feature_indices = [headers.index(col) for col in useful_features]

In [13]:
filtered_data = [[row[i] for i in feature_indices] for row in data]

In [15]:
for i in range(len(filtered_data)):
    for j in range(len(filtered_data[i])):
        val = filtered_data[i][j]
        if val.lstrip('-').replace('.', '', 1).isdigit():  #handles negatives
            filtered_data[i][j] = float(val)

In [17]:
X = [row[:-1] for row in filtered_data]
y = [row[-1] for row in filtered_data]

In [19]:
for i in range(len(X)):
    X[i].append(X[i][2] / (X[i][0] + 1))  #yards per target
    X[i].append(X[i][2] / (X[i][1] + 1))  #yards per reception
    X[i].append(X[i][5] / (X[i][1] + 1))  #TDs per reception
    X[i].append(X[i][0] * X[i][1])        #targets * receptions
    X[i].append(X[i][2] * X[i][5])        #yards * TDs

In [21]:
def robust_scaling(matrix): #Scales data using median and interquartile range (IQR)
    for col in range(len(matrix[0])):
        col_values = [matrix[row][col] for row in range(len(matrix))]
        col_values.sort()
        
        median = col_values[len(col_values) // 2]
        q1 = col_values[len(col_values) // 4]
        q3 = col_values[3 * len(col_values) // 4]
        iqr = q3 - q1

        #apply robust scaling
        for row in range(len(matrix)):
            matrix[row][col] = (matrix[row][col] - median) / iqr if iqr != 0 else 0

robust_scaling(X) 

In [23]:
def z_score_scaling(matrix): #standardizes data using Z-score normalization
    for col in range(len(matrix[0])):
        col_values = [matrix[row][col] for row in range(len(matrix))]
        mean = sum(col_values) / len(col_values)
        std_dev = (sum((x - mean) ** 2 for x in col_values) / len(col_values)) ** 0.5

        # Apply scaling
        for row in range(len(matrix)):
            matrix[row][col] = (matrix[row][col] - mean) / std_dev if std_dev != 0 else 0

z_score_scaling(X)  #apply Standardization

In [25]:
#adds a leading 1 to each row for bias term
for row in X:    
    row.insert(0, 1)

In [27]:
def exp_approx(x, terms=20): #use taylor series to approx e^x
    result, factorial, power = 1, 1, 1
    for i in range(1, terms):
        factorial *= i
        power *= x
        result += power / factorial
    return result

#sigmoid function
def sigmoid(z):
    if z >= 0:
        return 1 / (1 + exp_approx(-z))
    else:
        exp_z = exp_approx(z)
        return exp_z / (1 + exp_z)

In [29]:
#fisher-yates shuffle
def manual_shuffle(X, y):
    for i in range(len(X) - 1, 0, -1):
        j = (i * 13) % (i + 1)  # Change multiplication factor to improve randomness
        X[i], X[j] = X[j], X[i]
        y[i], y[j] = y[j], y[i]

In [31]:
#logistic regression
def logistic_regression(X, y, lr=0.0001, epochs=15000, lambda_=0.01):
    beta = [0.0] * len(X[0])
    prev_accuracy = 0 
    patience = 5 
    no_improve_count = 0
    lambda_l1 = 0.001  # L1 strength (Lasso)
    lambda_l2 = 0.001  # L2 strength (Ridge)

    for epoch in range(epochs):
        manual_shuffle(X, y)  

        for i in range(len(X)):  
            z = sum(X[i][j] * beta[j] for j in range(len(X[i])))
            pred = sigmoid(z)
            error = pred - y[i]

            for j in range(len(X[i])):
                beta[j] -= lr * (error * X[i][j] + lambda_l2 * beta[j] + lambda_l1 * (1 if beta[j] > 0 else -1))

        if epoch % 500 == 0:
            y_pred = predict(X, beta)
            current_accuracy = sum(1 for actual, predicted in zip(y, y_pred) if actual == predicted) / len(y)
            print(f"Epoch {epoch}: Accuracy = {current_accuracy:.4f}")

            # Check for improvement
            if abs(prev_accuracy - current_accuracy) < 0.0001:
                no_improve_count += 1
            else:
                no_improve_count = 0  #reset if improvement occurs
            
            if no_improve_count >= patience:
                print("Early stopping triggered.")
                break  #stop training
            
            prev_accuracy = current_accuracy  #update previous accuracy

    return beta

In [33]:
def predict(X, beta):
    return [1 if sigmoid(sum(X[i][j] * beta[j] for j in range(len(X[i])))) >= 0.5 else 0 for i in range(len(X))]

In [35]:
coefficients = logistic_regression(X, y)

Epoch 0: Accuracy = 0.7428
Epoch 500: Accuracy = 0.8979
Epoch 1000: Accuracy = 0.8990
Epoch 1500: Accuracy = 0.8990
Epoch 2000: Accuracy = 0.8987
Epoch 2500: Accuracy = 0.8982
Epoch 3000: Accuracy = 0.8979
Epoch 3500: Accuracy = 0.8982
Epoch 4000: Accuracy = 0.8982
Epoch 4500: Accuracy = 0.8982
Epoch 5000: Accuracy = 0.8982
Epoch 5500: Accuracy = 0.8982
Epoch 6000: Accuracy = 0.8982
Early stopping triggered.


In [37]:
y_pred = predict(X, coefficients)
accuracy = sum(1 for actual, predicted in zip(y, y_pred) if actual == predicted) / len(y)

In [39]:
print("\nFinal Accuracy:", accuracy)
#print("Final Coefficients:", coefficients)
#print("Predictions:", y_pred[:10])
#print("Actual Labels:", y[:10])  


Final Accuracy: 0.8981708981708981
