# Naive Bayes implementation

## Water potability Dataset
We will use the Water Potability dataset.
It has the following columns:
- ph
- Hardness
- Solids
- Chloramines
- Sulfate
- Conductivity
- Organic_carbon 
- Trihalomethanes 
- Turbidity
- Potability (target)

## Coding Naive Bayes
We will start by writing the prompt for ChatGPT:  
`write naive bayes algorithm without using scikit learning`

In [None]:
import numpy as np

# Step 1: Load and preprocess your dataset
# Ensure you have X (features) and y (labels) prepared.

# Step 2: Split your dataset into training and testing sets
def train_test_split(X, y, test_size=0.3, random_seed=None):
    """Randomly split samples and labels into training and testing subsets.

    Args:
        X: Array-like of samples, indexed along its first axis.
        y: Array-like of labels aligned with ``X`` along the first axis.
        test_size: Fraction of the samples assigned to the test subset. The
            resulting count is truncated with ``int()``.
        random_seed: Seed passed to ``np.random.seed`` before shuffling. Any
            value other than ``None`` is used, including ``0``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays.
    """
    # Compare against None: a plain truthiness test would ignore a seed of 0
    # and make the split non-reproducible.
    if random_seed is not None:
        np.random.seed(random_seed)

    # Index-array selection below requires NumPy arrays (lists would fail).
    X = np.asarray(X)
    y = np.asarray(y)

    num_samples = len(X)
    test_samples = int(test_size * num_samples)

    indices = np.arange(num_samples)
    np.random.shuffle(indices)

    # The first `test_samples` shuffled indices form the test subset.
    test_indices = indices[:test_samples]
    train_indices = indices[test_samples:]

    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    return X_train, X_test, y_train, y_test

# Step 3: Implement Gaussian Naive Bayes

def calculate_mean_std(X):
    """Return the per-feature mean and standard deviation of ``X``.

    ``X`` is a 2-D array whose columns are features. Two 1-D float arrays
    are returned, ``(means, stds)``, each with one entry per column. The
    standard deviation is the ``np.std`` default (population) form.
    """
    column_count = X.shape[1]
    # Work on one column slice at a time.
    columns = [X[:, k] for k in range(column_count)]

    means = np.array([np.mean(col) for col in columns], dtype=float)
    stds = np.array([np.std(col) for col in columns], dtype=float)
    return means, stds

def gaussian_probability(x, mean, std, min_std=1e-9):
    """Evaluate the Gaussian probability density function.

    Args:
        x: Value(s) at which to evaluate the density (scalar or array).
        mean: Mean of the distribution; broadcast against ``x``.
        std: Standard deviation of the distribution; broadcast against ``x``.
        min_std: Lower bound applied to ``std`` before use. A feature that is
            constant within a class has ``std == 0``, which would otherwise
            cause a division by zero and yield NaN/inf.

    Returns:
        The density value(s), with the broadcast shape of the inputs.
    """
    # Floor the standard deviation so the divisions below are always defined.
    std = np.maximum(std, min_std)
    exponent = np.exp(-((x - mean) ** 2) / (2 * std ** 2))
    return (1 / (std * np.sqrt(2 * np.pi))) * exponent

def train_naive_bayes(X_train, y_train):
    """Estimate the Gaussian Naive Bayes parameters from training data.

    Returns ``(priors, means, stds)``. ``priors`` holds the relative
    frequency of each class; ``means`` and ``stds`` have one row per class
    and one column per feature. Rows follow the sorted order of the distinct
    labels produced by ``np.unique``.
    """
    # Class priors are the relative label frequencies.
    labels, counts = np.unique(y_train, return_counts=True)
    priors = counts / len(y_train)

    # One row of statistics per class, one column per feature.
    table_shape = (len(labels), X_train.shape[1])
    means = np.zeros(table_shape)
    stds = np.zeros(table_shape)

    for row, label in enumerate(labels):
        members = X_train[y_train == label]
        class_means, class_stds = calculate_mean_std(members)
        means[row, :] = class_means
        stds[row, :] = class_stds

    return priors, means, stds

def predict_naive_bayes(X_test, priors, means, stds, classes=None, min_std=1e-9):
    """Predict the most probable class for every row of ``X_test``.

    The score of a class is its log prior plus the sum of the per-feature
    Gaussian log-densities. The log-density is computed directly rather than
    as the log of the density: for samples far from every class mean the
    density underflows to 0, its log becomes ``-inf`` for all classes, and
    the argmax would then always fall back to the first class.

    Args:
        X_test: 2-D array-like of samples (rows) by features (columns).
        priors: 1-D array-like of class prior probabilities.
        means: 2-D array-like of per-class, per-feature means.
        stds: 2-D array-like of per-class, per-feature standard deviations.
        classes: Optional 1-D array-like of class labels in the same order as
            the rows of ``means``. When given, labels are returned instead of
            row indices.
        min_std: Lower bound applied to ``stds`` so that a feature with zero
            spread within a class does not cause a division by zero.

    Returns:
        1-D array with one entry per sample: the index of the best-scoring
        class, or the corresponding label when ``classes`` is provided.
    """
    X_test = np.asarray(X_test, dtype=float)
    means = np.asarray(means, dtype=float)
    stds = np.maximum(np.asarray(stds, dtype=float), min_std)
    log_priors = np.log(np.asarray(priors, dtype=float))

    num_classes = len(log_priors)
    num_samples = X_test.shape[0]
    scores = np.zeros((num_samples, num_classes))

    variances = stds ** 2
    # Sample-independent part of the log-density: -0.5 * log(2 * pi * var).
    log_norms = -0.5 * np.log(2 * np.pi * variances)

    for j, (mu, var, log_norm) in enumerate(zip(means, variances, log_norms)):
        squared_dev = (X_test - mu) ** 2
        log_likelihood = np.sum(log_norm - squared_dev / (2 * var), axis=1)
        scores[:, j] = log_priors[j] + log_likelihood

    best = np.argmax(scores, axis=1)
    if classes is None:
        return best
    return np.asarray(classes)[best]

# Step 4: Train and test the model
# NOTE(review): X and y are not defined in the visible part of this file; they
# are presumably produced by the dataset-loading step (Step 1) — confirm
# before running this cell.
# Hold out 30% of the samples for testing, seeding the shuffle with 42.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_seed=42)
# Estimate the class priors and the per-class feature means / standard deviations.
priors, means, stds = train_naive_bayes(X_train, y_train)
# y_pred holds, for each test row, the index of the best-scoring class.
y_pred = predict_naive_bayes(X_test, priors, means, stds)

# Step 5: Evaluate the model (calculate accuracy, confusion matrix, etc.)
# Implement evaluation metrics based on your specific problem and requirements.