In [None]:
import numpy as np
import pandas as pd

# ==========================================
# 1. Helper Functions (Splitting & Scaling)
# ==========================================

def train_test_split_manual(X, y, test_size=0.2, random_state=42):
    """Split features and labels into random train and test subsets.

    Args:
        X: Array-like of samples, indexed along its first axis.
        y: Array-like of labels with the same length as ``X``.
        test_size: Fraction of the samples (between 0 and 1) placed in the
            test set. The resulting count is truncated with ``int``.
        random_state: Seed for the shuffle. ``None`` draws a fresh seed.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length, or ``test_size``
            lies outside ``[0, 1]``.
    """
    # Coerce so that lists (and other array-likes) support fancy indexing.
    X = np.asarray(X)
    y = np.asarray(y)

    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}."
        )
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be within [0, 1], got {test_size}.")

    # A private generator produces the same permutation as seeding the global
    # one, but leaves NumPy's global random state untouched for the caller.
    rng = np.random.RandomState(random_state)
    indices = rng.permutation(len(X))
    test_samples = int(len(X) * test_size)

    test_idx = indices[:test_samples]
    train_idx = indices[test_samples:]

    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

def standardize_manual(X_train, X_test):
    """Z-score both splits using statistics taken from the training split only.

    The per-column mean and standard deviation are computed on ``X_train``
    and then applied to ``X_train`` and ``X_test`` alike. A column whose
    training standard deviation is zero is divided by 1 instead, which keeps
    the division well defined.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled)``.
    """
    center = np.mean(X_train, axis=0)
    spread = np.std(X_train, axis=0)

    # Constant columns have zero spread; substitute 1 so we never divide by 0.
    spread = np.where(spread == 0, 1, spread)

    def _rescale(block):
        return (block - center) / spread

    return _rescale(X_train), _rescale(X_test)

def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: Array-like of ground-truth labels.
        y_pred: Array-like of predicted labels with the same shape.

    Returns:
        The mean of the element-wise equality, a value in ``[0, 1]``.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Coerce first: comparing two plain lists with ``==`` yields a single
    # bool rather than an element-wise result.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}."
        )

    return np.mean(y_true == y_pred)

# ==========================================
# 2. PCA Implementation from Scratch
# ==========================================

class PCA_Scratch:
    """Principal component analysis driven by an explained-variance target.

    ``fit`` eigendecomposes the covariance matrix of the centred data and
    keeps the smallest number of leading components whose cumulative
    explained-variance ratio reaches ``variance_threshold``. If the threshold
    is never reached (for example because of rounding when it is 1.0, or
    because the data has no variance), every component is kept.

    Attributes:
        variance_threshold: Target cumulative explained-variance ratio.
        components: Array of shape ``(n_components, n_features)`` holding the
            selected eigenvectors as rows; ``None`` before ``fit``.
        mean: Per-feature mean of the data passed to ``fit``; ``None`` before
            ``fit``.
        n_components: Number of components kept; ``None`` before ``fit``.
    """

    def __init__(self, variance_threshold=0.95):
        self.variance_threshold = variance_threshold
        self.components = None
        self.mean = None
        self.n_components = None

    def fit(self, X):
        """Learn the projection from ``X`` and print a one-line summary.

        Args:
            X: 2-D array-like of shape ``(n_samples, n_features)``.

        Raises:
            ValueError: If ``X`` is not 2-D or has fewer than two samples.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                "PCA_Scratch.fit expects a 2-D array of shape "
                "(n_samples, n_features)."
            )
        if X.shape[0] < 2:
            raise ValueError(
                "PCA_Scratch.fit needs at least two samples to estimate "
                "the covariance."
            )

        # 1. Mean centering
        self.mean = np.mean(X, axis=0)
        X_centered = X - self.mean

        # 2. Covariance matrix (np.cov collapses a single feature to a 0-d
        #    array, so force it back to a 1x1 matrix).
        cov = np.atleast_2d(np.cov(X_centered.T))

        # 3. The covariance matrix is symmetric, so use eigh: it guarantees
        #    real eigenvalues and orthonormal eigenvectors, whereas the
        #    general solver can return complex values from rounding noise.
        eigenvalues, eigenvectors = np.linalg.eigh(cov)

        # 4. Sort by eigenvalue, descending; rows of `eigenvectors` are the
        #    components. Tiny negative eigenvalues are rounding artefacts.
        order = np.argsort(eigenvalues)[::-1]
        eigenvalues = np.clip(eigenvalues[order], 0.0, None)
        eigenvectors = eigenvectors.T[order]

        # Eigenvector signs are arbitrary; make the largest-magnitude entry
        # of each component positive so results are reproducible.
        pivot = np.argmax(np.abs(eigenvectors), axis=1)
        signs = np.sign(eigenvectors[np.arange(len(eigenvectors)), pivot])
        signs[signs == 0] = 1.0
        eigenvectors = eigenvectors * signs[:, np.newaxis]

        # 5. Determine number of components to keep
        total_variance = np.sum(eigenvalues)
        if total_variance > 0:
            explained_variance_ratio = eigenvalues / total_variance
        else:
            explained_variance_ratio = np.zeros_like(eigenvalues)
        cumulative_variance = np.cumsum(explained_variance_ratio)

        # First position whose cumulative ratio is >= threshold; when no
        # position qualifies searchsorted returns len(...), so clamp to the
        # number of available components instead of silently keeping one.
        reached = np.searchsorted(
            cumulative_variance, self.variance_threshold, side="left"
        ) + 1
        self.n_components = int(min(reached, len(eigenvalues)))

        # 6. Store first n_components
        self.components = eigenvectors[:self.n_components]

        print(f"PCA: Selected {self.n_components} components explaining {cumulative_variance[self.n_components-1]:.2%} variance.")

    def transform(self, X):
        """Project ``X`` onto the components learned by ``fit``.

        Args:
            X: Array-like with ``n_features`` columns.

        Returns:
            Array of shape ``(n_samples, n_components)``.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.mean is None or self.components is None:
            raise RuntimeError("PCA_Scratch.transform called before fit.")
        X_centered = np.asarray(X, dtype=float) - self.mean
        return np.dot(X_centered, self.components.T)

# ==========================================
# 3. SVM Implementation from Scratch
# ==========================================

class SVM_Scratch:
    """Linear SVM trained with per-sample sub-gradient steps on the hinge loss.

    The decision function is ``w . x - b``. Labels are mapped to ``{-1, 1}``
    for training (anything ``<= 0`` becomes ``-1``) and predictions are
    reported as ``{0, 1}``.
    """

    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iters=1000):
        # Hyper-parameters: step size, L2 penalty strength, number of epochs.
        self.lr = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        # Learned parameters, populated by fit().
        self.w = None
        self.b = None

    def fit(self, X, y):
        """Learn ``w`` and ``b`` from ``X`` (n_samples, n_features) and ``y``."""
        _, feature_count = X.shape

        # Training works on signed labels: non-positive -> -1, otherwise +1.
        signed_labels = np.where(y <= 0, -1, 1)

        self.w = np.zeros(feature_count)
        self.b = 0

        for _epoch in range(self.n_iters):
            for sample, label in zip(X, signed_labels):
                margin = label * (np.dot(sample, self.w) - self.b)

                # The regularisation term contributes to every step.
                grad_w = 2 * self.lambda_param * self.w

                if not margin >= 1:
                    # Inside the margin or misclassified: the hinge term is
                    # active, so it adds to the gradient and the bias moves.
                    grad_w = grad_w - np.dot(sample, label)
                    self.b -= self.lr * label

                self.w -= self.lr * grad_w

    def predict(self, X):
        """Return 0 where the decision value is negative and 1 elsewhere."""
        scores = np.dot(X, self.w) - self.b
        return np.where(scores < 0, 0, 1)

# ==========================================
# 4. Logistic Regression from Scratch
# ==========================================

class LogisticRegression_Scratch:
    """Binary logistic regression trained with full-batch gradient descent.

    Attributes:
        lr: Gradient-descent step size.
        n_iters: Number of full-batch updates performed by ``fit``.
        weights: Learned coefficient vector; ``None`` before ``fit``.
        bias: Learned intercept; ``None`` before ``fit``.
    """

    def __init__(self, learning_rate=0.01, n_iters=1000):
        self.lr = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        """Return the logistic function of ``z`` without overflowing.

        ``exp`` is only ever evaluated on non-positive arguments, so large
        negative inputs no longer trigger an overflow ``RuntimeWarning``:
        for ``z >= 0`` this is ``1 / (1 + exp(-z))`` and for ``z < 0`` the
        algebraically equal ``exp(z) / (1 + exp(z))``.
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def fit(self, X, y):
        """Fit weights and bias to ``X`` (n_samples, n_features) and ``y``.

        Args:
            X: 2-D array-like of features.
            y: Array-like of 0/1 targets, one per row of ``X``.

        Raises:
            ValueError: If ``X`` contains no samples.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("Cannot fit logistic regression on zero samples.")

        self.weights = np.zeros(n_features)
        self.bias = 0.0

        for _ in range(self.n_iters):
            linear_model = np.dot(X, self.weights) + self.bias
            y_predicted = self.sigmoid(linear_model)

            # Gradient of the mean log-loss with respect to weights and bias.
            error = y_predicted - y
            dw = (1 / n_samples) * np.dot(X.T, error)
            db = (1 / n_samples) * np.sum(error)

            # Update parameters
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    def predict(self, X):
        """Return a list of 0/1 labels, 1 where the probability exceeds 0.5."""
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self.sigmoid(linear_model)
        return (y_predicted > 0.5).astype(int).tolist()

# ==========================================
# 5. Main Execution
# ==========================================

if __name__ == "__main__":
    # End-to-end experiment: load the Titanic CSV, preprocess it, then train
    # the from-scratch SVM and logistic regression on the standardized
    # features, first as-is and then after PCA, and print the accuracies.

    # --- Data Loading & Preprocessing ---
    try:
        df = pd.read_csv(r'D:\OneDrive\Desktop\First Semester\MrM Research\Coding\Data\titanic.csv')
    except FileNotFoundError:
        print("Error: 'titanic.csv' not found. Please upload the file.")
        # Raise SystemExit directly: the bare exit() helper is installed by
        # the `site` module for interactive use and is not guaranteed to
        # exist; a non-zero status also signals the failure to the caller.
        raise SystemExit(1)

    # Handle missing values
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    
    # Drop irrelevant columns
    df.drop(['Cabin', 'PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
    
    # Encode categorical variables
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)
    
    # Convert all columns to numeric (handling booleans from get_dummies)
    for col in df.columns:
        if df[col].dtype == 'bool':
            df[col] = df[col].astype(int)

    # Prepare features and target
    X = df.drop('Survived', axis=1).values
    y = df['Survived'].values

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split_manual(X, y, test_size=0.2, random_state=42)

    # Scale features (statistics come from the training split only)
    X_train_scaled, X_test_scaled = standardize_manual(X_train, X_test)

    print(f"Data Loaded: {X.shape[0]} samples, {X.shape[1]} features")
    print("-" * 50)

    # --- Scenario A: Without PCA ---
    print("Running Models WITHOUT PCA...")
    
    # SVM
    svm = SVM_Scratch(learning_rate=0.001, lambda_param=0.01, n_iters=1000)
    svm.fit(X_train_scaled, y_train)
    acc_svm = accuracy(y_test, svm.predict(X_test_scaled))
    
    # Logistic Regression
    lr = LogisticRegression_Scratch(learning_rate=0.01, n_iters=1000)
    lr.fit(X_train_scaled, y_train)
    acc_lr = accuracy(y_test, lr.predict(X_test_scaled))
    
    print(f"SVM Accuracy: {acc_svm:.4f}")
    print(f"Logistic Regression Accuracy: {acc_lr:.4f}")
    print("-" * 50)

    # --- Scenario B: With PCA ---
    print("Running Models WITH PCA...")
    
    # Apply PCA
    pca = PCA_Scratch(variance_threshold=0.95)
    pca.fit(X_train_scaled) # Fit only on train data
    X_train_pca = pca.transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)
    
    # SVM with PCA (same hyper-parameters as Scenario A for a fair comparison)
    svm_pca = SVM_Scratch(learning_rate=0.001, lambda_param=0.01, n_iters=1000)
    svm_pca.fit(X_train_pca, y_train)
    acc_svm_pca = accuracy(y_test, svm_pca.predict(X_test_pca))
    
    # Logistic Regression with PCA
    lr_pca = LogisticRegression_Scratch(learning_rate=0.01, n_iters=1000)
    lr_pca.fit(X_train_pca, y_train)
    acc_lr_pca = accuracy(y_test, lr_pca.predict(X_test_pca))
    
    print(f"SVM (PCA) Accuracy: {acc_svm_pca:.4f}")
    print(f"Logistic Regression (PCA) Accuracy: {acc_lr_pca:.4f}")
    print("-" * 50)

    # --- Final Comparison Table ---
    print("\nFinal Results Summary:")
    print(f"{'Model':<25} {'No PCA':<15} {'With PCA':<15}")
    print(f"{'Custom SVM':<25} {acc_svm:<15.4f} {acc_svm_pca:<15.4f}")
    print(f"{'Logistic Regression':<25} {acc_lr:<15.4f} {acc_lr_pca:<15.4f}")

  df = pd.read_csv('D:\OneDrive\Desktop\First Semester\MrM Research\Coding\Data\titanic.csv')
  df = pd.read_csv('D:\OneDrive\Desktop\First Semester\MrM Research\Coding\Data\titanic.csv')


OSError: [Errno 22] Invalid argument: 'D:\\OneDrive\\Desktop\\First Semester\\MrM Research\\Coding\\Data\titanic.csv'