In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Step 1: Initialize parameters
def initialize_parameters(num_features):
    """Return the starting parameters for logistic regression.

    Parameters:
    - num_features: Number of input features (rows of the weight vector)

    Returns:
    - A ``(weights, bias)`` tuple: an all-zero float array of shape
      ``(num_features, 1)`` and the integer bias ``0``
    """
    column_shape = (num_features, 1)
    return np.zeros(column_shape), 0

# Step 2: Sigmoid function
def sigmoid(z):
    """Compute the logistic sigmoid 1 / (1 + exp(-z)) element-wise.

    The naive formula overflows in ``exp(-z)`` for large negative ``z``.
    This version only ever exponentiates non-positive numbers, so it is
    safe for inputs of any magnitude.

    Parameters:
    - z: Scalar or array-like of real numbers

    Returns:
    - Sigmoid of ``z`` with the same shape as the input (a NumPy scalar
      for scalar input)
    """
    z = np.asarray(z)
    # exp(-|z|) lies in (0, 1], so it can underflow to 0 but never overflow.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)).
    numerator = np.where(z >= 0, 1.0, decay)
    return numerator / (1.0 + decay)

# Step 3: Compute predictions
def compute_predictions(X, weights, bias):
    """Return predicted probabilities for every row of ``X``.

    Parameters:
    - X: Feature matrix, one sample per row
    - weights: Weight vector multiplied with ``X`` via ``np.dot``
    - bias: Intercept added to every linear score

    Returns:
    - The sigmoid of the linear scores ``X . weights + bias``
    """
    return sigmoid(np.dot(X, weights) + bias)

# Step 4: Compute cost
def compute_cost(y_true, y_pred, epsilon=1e-15):
    """
    Compute the log loss (cost) for logistic regression.

    Labels and predictions are flattened before being combined, so a 1-D
    label vector and a column vector of predictions (or the reverse) are
    compared element by element instead of being broadcast into an
    (m, m) matrix.

    Parameters:
    - y_true: True labels (0 or 1)
    - y_pred: Predicted probabilities (between 0 and 1)
    - epsilon: Small value to avoid log(0) or log(1)

    Returns:
    - cost: Computed log loss

    Raises:
    - ValueError: If y_true and y_pred contain a different number of elements
    """
    labels = np.asarray(y_true, dtype=float)
    m = labels.shape[0]  # Number of samples (leading dimension of y_true)

    labels = labels.reshape(-1)
    probs = np.asarray(y_pred, dtype=float).reshape(-1)
    if labels.size != probs.size:
        raise ValueError(
            f"y_true has {labels.size} elements but y_pred has {probs.size}"
        )

    # Clip predictions to avoid log(0) or log(1)
    probs = np.clip(probs, epsilon, 1 - epsilon)

    per_element = labels * np.log(probs) + (1 - labels) * np.log(1 - probs)
    cost = -(1 / m) * np.sum(per_element)

    return cost

# Step 5: Gradient descent
def gradient_descent(X, y_true, y_pred, weights, bias, learning_rate):
    """Perform one gradient-descent update of the logistic-regression parameters.

    The updated parameters are returned as new objects; the ``weights``
    array passed in is left unmodified so callers can safely keep a
    reference to the previous values.

    Parameters:
    - X: Feature matrix, one sample per row (must expose ``.T``)
    - y_true: True labels, same shape as y_pred
    - y_pred: Predicted probabilities for the rows of X
    - weights: Current weight vector
    - bias: Current bias term
    - learning_rate: Step size applied to both gradients

    Returns:
    - A ``(weights, bias)`` tuple holding the updated parameters
    """
    m = y_true.shape[0]
    residual = y_pred - y_true
    dw = (1 / m) * np.dot(X.T, residual)
    db = (1 / m) * np.sum(residual)
    # Out-of-place update: `weights -= ...` would mutate the caller's array.
    new_weights = weights - learning_rate * dw
    new_bias = bias - learning_rate * db
    return new_weights, new_bias

# Step 6: Train the model
def train_logistic_regression(X_train, y_train, learning_rate=0.01, num_iterations=1000):
    """Fit logistic-regression parameters with batch gradient descent.

    Inputs are converted to float NumPy arrays once, before the loop, so the
    function accepts pandas objects, NumPy arrays or nested lists alike and
    does not repeat the conversion on every iteration.

    Parameters:
    - X_train: 2-D feature matrix, one sample per row
    - y_train: Labels (0 or 1), one per row of X_train
    - learning_rate: Step size for each gradient-descent update
    - num_iterations: Number of update steps to perform

    Returns:
    - A ``(weights, bias)`` tuple with the learned parameters

    Raises:
    - ValueError: If X_train is not 2-D or its row count differs from the
      number of labels
    """
    features = np.asarray(X_train, dtype=float)
    targets = np.asarray(y_train, dtype=float).reshape(-1, 1)
    if features.ndim != 2:
        raise ValueError(f"X_train must be 2-D, got {features.ndim} dimension(s)")
    if features.shape[0] != targets.shape[0]:
        raise ValueError(
            f"X_train has {features.shape[0]} rows but y_train has "
            f"{targets.shape[0]} labels"
        )

    weights, bias = initialize_parameters(features.shape[1])

    for i in range(num_iterations):
        y_pred = compute_predictions(features, weights, bias)
        # The cost is evaluated before the update, so iteration 0 reports
        # the loss of the initial parameters.
        cost = compute_cost(targets, y_pred)
        weights, bias = gradient_descent(features, targets, y_pred, weights, bias, learning_rate)
        if i % 100 == 0:
            print(f"Iteration {i}, Cost: {cost}")

    return weights, bias

# Step 7: Make predictions
def predict(X, weights, bias, threshold=0.45):
    """Predict hard class labels (0 or 1) for every row of ``X``.

    Parameters:
    - X: Feature matrix, one sample per row
    - weights: Learned weight vector
    - bias: Learned bias term
    - threshold: Probability cut-off; a sample is labelled 1 when its
      predicted probability is strictly greater than this value. Defaults
      to 0.45, the value previously hard-coded here.

    Returns:
    - Integer array of 0/1 labels with the same shape as the predicted
      probabilities
    """
    probabilities = compute_predictions(X, weights, bias)
    return (probabilities > threshold).astype(int)

# Read the dataset from disk
df = pd.read_csv("/content/drive/MyDrive/Dataset/filtered_climate_data_final.csv")

# Columns removed from the feature matrix: the target itself plus the
# features considered unnecessary
excluded_columns = [
    "isRainy",
    "MinTemp_2m",
    "TempRange_2m",
    "MaxWindSpeed_10m",
    "MinWindSpeed_10m",
    "WindSpeedRange_10m",
    "WindSpeed_50m",
    "MaxWindSpeed_50m",
    "MinWindSpeed_50m",
    "Precip",
    "WindSpeedRange_50m",
]
X = df.drop(columns=excluded_columns)
y = df["isRainy"]

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the model on the training split
weights, bias = train_logistic_regression(
    X_train, y_train, learning_rate=0.001, num_iterations=2000
)

# Class predictions on the training and test splits
y_pred_train = predict(X_train, weights, bias)
y_pred_test = predict(X_test, weights, bias)

# Accuracy on each split, expressed as a percentage
train_accuracy = accuracy_score(y_train, y_pred_train) * 100
test_accuracy = accuracy_score(y_test, y_pred_test) * 100

# Report the results
print(f"Train Accuracy: {train_accuracy:.2f}%")
print(f"Test Accuracy: {test_accuracy:.2f}%")

# Per-class metrics and confusion matrix for the held-out test split
print("\nClassification Report (Test Data):")
print(classification_report(y_test, y_pred_test))

print("\nConfusion Matrix (Test Data):")
print(confusion_matrix(y_test, y_pred_test))


Iteration 0, Cost: 0.6931471805599453
Iteration 100, Cost: 0.3091972820659265
Iteration 200, Cost: 0.30440004733873793
Iteration 300, Cost: 0.30317359038006847
Iteration 400, Cost: 0.30233536541770467
Iteration 500, Cost: 0.30160267998547524
Iteration 600, Cost: 0.30093642688194117
Iteration 700, Cost: 0.30032573130154755
Iteration 800, Cost: 0.29976386512216635
Iteration 900, Cost: 0.29924522692574224
Iteration 1000, Cost: 0.2987649023645948
Iteration 1100, Cost: 0.298318552592667
Iteration 1200, Cost: 0.2979023476860833
Iteration 1300, Cost: 0.29751290990436313
Iteration 1400, Cost: 0.2971472625975246
Iteration 1500, Cost: 0.29680278415677785
Iteration 1600, Cost: 0.2964771667048008
Iteration 1700, Cost: 0.2961683791939589
Iteration 1800, Cost: 0.29587463455454494
Iteration 1900, Cost: 0.29559436053004173
Train Accuracy: 87.58%
Test Accuracy: 88.13%

Classification Report (Test Data):
              precision    recall  f1-score   support

           0       0.92      0.92      0.92  