In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [None]:
def preprocess_data(df):
    """Return a cleaned, numeric copy of a passenger DataFrame.

    Steps, in order:
      1. Drop the ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin`` columns.
      2. Encode ``Sex`` (female=0, male=1) and ``Embarked`` (S=0, C=1, Q=2).
      3. Fill missing ``Age`` and ``Fare`` with the column median and missing
         ``Embarked`` with the most frequent encoded value.
      4. Standardize ``Age`` and ``Fare`` to zero mean and unit sample std.

    Every statistic (median, mode, mean, std) is computed from ``df`` itself,
    so two frames passed separately are scaled independently of each other.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named above. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the four dropped columns. ``Age``, ``Fare`` and
        ``Embarked`` contain no missing values as long as each of them has at
        least one observed value in ``df``.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    ValueError
        If ``Sex`` holds a value other than ``'female'``/``'male'`` (the
        unmapped entry becomes NaN and cannot be cast to int).
    """
    # ``drop`` returns a new frame, so the assignments below never touch the
    # caller's data.
    df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

    # Convert categorical variables to numerical codes.
    df['Sex'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    # Impute by plain assignment: ``df[col].fillna(..., inplace=True)`` acts on
    # a temporary object and is a no-op under pandas Copy-on-Write.
    # ``Fare`` is imputed too, so that no NaN reaches the models.
    for column in ('Age', 'Fare'):
        df[column] = df[column].fillna(df[column].median())

    embarked_mode = df['Embarked'].mode()
    if not embarked_mode.empty:  # empty when every Embarked value is missing
        df['Embarked'] = df['Embarked'].fillna(embarked_mode.iloc[0])

    # Standardize the continuous features.
    for column in ('Age', 'Fare'):
        spread = df[column].std()
        # A constant column (std == 0) or a single row (std is NaN) would turn
        # the whole column into NaN/inf; fall back to centering only.
        if pd.isna(spread) or spread == 0:
            spread = 1.0
        df[column] = (df[column] - df[column].mean()) / spread
    return df


In [None]:
def calculate_class_probabilities(y_train):
    """Estimate the prior probability of every class label.

    Parameters
    ----------
    y_train : array-like
        Training labels, one per sample.

    Returns
    -------
    dict
        Maps each distinct label in ``y_train`` to the fraction of samples
        carrying that label. Empty when ``y_train`` is empty.
    """
    sample_count = len(y_train)
    return {
        label: np.sum(y_train == label) / sample_count
        for label in np.unique(y_train)
    }

In [None]:
def calculate_feature_probabilities(x_train, y_train):
    """Compute per-class mean and standard deviation of every feature.

    Parameters
    ----------
    x_train : numpy.ndarray
        Feature matrix indexed by sample along the first axis.
    y_train : array-like
        Label of each row of ``x_train``.

    Returns
    -------
    dict
        ``{label: {'mean': ..., 'std': ...}}`` where both entries are taken
        along axis 0 over the rows belonging to ``label``. ``std`` is the
        population standard deviation (numpy's default ``ddof=0``).
    """
    def summarize(rows):
        # Column-wise statistics for the rows of a single class.
        return {
            'mean': np.mean(rows, axis=0),
            'std': np.std(rows, axis=0)
        }

    return {
        label: summarize(x_train[np.nonzero(y_train == label)[0]])
        for label in np.unique(y_train)
    }

In [None]:
def calculate_likelihood(x, mean, std, min_std=1e-9):
    """Evaluate the Gaussian probability density of ``x``.

    Parameters
    ----------
    x : float or numpy.ndarray
        Point(s) at which the density is evaluated.
    mean : float or numpy.ndarray
        Mean of the distribution.
    std : float or numpy.ndarray
        Standard deviation of the distribution.
    min_std : float, optional
        Lower bound applied to ``std``. A feature that is constant within a
        class has ``std == 0``, which would otherwise cause a division by
        zero and a NaN (or an exception for plain Python floats).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Density value(s); inputs are combined with numpy broadcasting.
    """
    # Floor the spread so the two divisions below are always defined.
    std = np.maximum(std, min_std)
    exponent = np.exp(-((x - mean) ** 2) / (2 * std ** 2))
    likelihood = (1 / (np.sqrt(2 * np.pi) * std)) * exponent
    return likelihood

In [None]:
def predict_naive_bayes(x_test, class_probabilities, feature_probabilities):
    """Classify each sample with Gaussian Naive Bayes.

    For every sample the log-posterior of each class is the log-prior plus
    the sum of per-feature Gaussian log-densities; the class with the largest
    log-posterior is returned.

    The log-density is evaluated directly in log-space instead of taking the
    log of a density value. A density that underflows to 0 would give
    ``log(0) = -inf`` for every class, leaving no class selected (``None``).

    Parameters
    ----------
    x_test : iterable of 1-D array-like
        Samples to classify; each one holds one value per feature.
    class_probabilities : dict
        ``{label: prior probability}``.
    feature_probabilities : dict
        ``{label: {'mean': array, 'std': array}}`` with one entry per feature.

    Returns
    -------
    list
        One label (a key of ``class_probabilities``) per sample. A label is
        always returned when ``class_probabilities`` is non-empty.

    Notes
    -----
    * Missing (NaN) feature values are skipped, i.e. they do not contribute
      to any class score, rather than turning the whole posterior into NaN.
    * Standard deviations are floored at a tiny positive value so a feature
      that is constant within a class cannot cause a division by zero.
    """
    min_std = 1e-9
    log_norm = 0.5 * np.log(2 * np.pi)

    # Hoist everything that depends only on the class out of the sample loop.
    class_params = []
    for class_, class_probability in class_probabilities.items():
        stats = feature_probabilities[class_]
        mean = np.asarray(stats['mean'], dtype=float)
        std = np.maximum(np.asarray(stats['std'], dtype=float), min_std)
        class_params.append((class_, np.log(class_probability), mean, std))

    predictions = []
    for x in x_test:
        x = np.asarray(x, dtype=float)
        observed = ~np.isnan(x)  # features actually present in this sample
        predicted_class = None
        max_posterior = float('-inf')
        for class_, log_prior, mean, std in class_params:
            log_likelihood = (
                -log_norm - np.log(std) - ((x - mean) ** 2) / (2 * std ** 2)
            )
            posterior = log_prior + np.sum(log_likelihood[observed])
            if np.isnan(posterior):
                # NaN never compares greater; treat it as the worst score.
                posterior = float('-inf')
            # ``predicted_class is None`` guarantees a label even when every
            # class scores -inf.
            if predicted_class is None or posterior > max_posterior:
                max_posterior = posterior
                predicted_class = class_
        predictions.append(predicted_class)
    return predictions


In [None]:
def calculate_distance(point1, point2):
    """Return the Euclidean distance between ``point1`` and ``point2``.

    Both arguments must support element-wise subtraction (e.g. numpy arrays
    of equal shape, or plain numbers).
    """
    delta = point1 - point2
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)

In [None]:
def get_k_nearest_neighbors(x_train, y_train, x_test, k):
    """Return the labels of the ``k`` training samples closest to ``x_test``.

    Parameters
    ----------
    x_train : indexable
        Training samples; ``x_train[i]`` is the i-th sample.
    y_train : indexable
        Label of each training sample.
    x_test : array-like
        The single query point.
    k : int
        Number of neighbors to keep.

    Returns
    -------
    list
        Labels ordered from nearest to farthest, as measured by
        ``calculate_distance``. Samples at equal distance keep their training
        order because the sort is stable. Fewer than ``k`` labels are returned
        when the training set is smaller than ``k``.
    """
    scored = [
        (calculate_distance(x_train[idx], x_test), y_train[idx])
        for idx in range(len(x_train))
    ]
    ranked = sorted(scored, key=lambda pair: pair[0])
    return [label for _, label in ranked[:k]]


In [None]:
def predict_knn(x_train, y_train, x_test, k):
    """Classify every sample in ``x_test`` by majority vote of its neighbors.

    Parameters
    ----------
    x_train, y_train :
        Training samples and their labels, forwarded unchanged to
        ``get_k_nearest_neighbors``.
    x_test : iterable
        Query samples; one prediction is produced per element.
    k : int
        Number of neighbors consulted for each query.

    Returns
    -------
    list
        Predicted label per query. When several labels share the highest
        vote count, the smallest label wins (``np.unique`` returns sorted
        labels and ``np.argmax`` picks the first maximum).
    """
    def majority_label(labels):
        # Most frequent label among the neighbors.
        values, votes = np.unique(labels, return_counts=True)
        return values[np.argmax(votes)]

    return [
        majority_label(get_k_nearest_neighbors(x_train, y_train, query, k))
        for query in x_test
    ]

In [None]:
# Load the three CSV files: training data, test features, and the table
# whose 'Survived' column is later used as the test labels.
train_df, test_df, gender_submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/train.csv',
        '/content/test.csv',
        '/content/gender_submission.csv',
    )
)

In [None]:
# Run the same cleaning/encoding pipeline on the training and test features.
# NOTE: this rebinds train_df/test_df, so the cell cannot be re-run without
# re-loading the CSV files first (the dropped columns would be missing).
# gender_submission_df is intentionally left as loaded; only its 'Survived'
# column is read later.
train_df, test_df = (preprocess_data(frame) for frame in (train_df, test_df))

In [None]:
# Separate the predictors (x) from the target (y) and convert to numpy arrays.
x_train = train_df.drop(columns=['Survived']).to_numpy()
y_train = train_df['Survived'].to_numpy()
x_test = test_df.to_numpy()
# NOTE(review): the test labels are read from gender_submission.csv and are
# matched to x_test by row position only — presumably both files list the
# same passengers in the same order, and the file may be a sample submission
# rather than ground truth; confirm before trusting the accuracies below.
y_test = gender_submission_df['Survived'].to_numpy()
print(len(y_test))

418


In [None]:
# Naive Bayes: estimate the class priors and per-class feature statistics
# from the training set, then classify every test sample.
class_probabilities = calculate_class_probabilities(y_train=y_train)
feature_probabilities = calculate_feature_probabilities(
    x_train=x_train,
    y_train=y_train,
)
nb_predictions = predict_naive_bayes(
    x_test=x_test,
    class_probabilities=class_probabilities,
    feature_probabilities=feature_probabilities,
)
print(nb_predictions)

[0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, None, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 

In [None]:
# k-NN: classify every test sample by majority vote among its k nearest
# training samples.
k = 37  # example neighborhood size
knn_predictions = predict_knn(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    k=k,
)

In [None]:
# Show the k-NN predictions: one predicted label per test sample, in the
# order of the rows of x_test.
print('k-NN Predictions:', knn_predictions)

k-NN Predictions: [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 

In [None]:
# Accuracy for k-NN: fraction of test samples whose predicted label equals
# the corresponding entry of y_test.
correct = sum(
    1 for idx in range(len(y_test)) if y_test[idx] == knn_predictions[idx]
)
accuracy = correct / len(y_test)
print(accuracy)

0.84688995215311


In [None]:
# Accuracy for Naive Bayes: fraction of test samples whose predicted label
# equals the corresponding entry of y_test.
correct = sum(
    1 for idx in range(len(y_test)) if y_test[idx] == nb_predictions[idx]
)
accuracy = correct / len(y_test)
print(accuracy)

0.930622009569378
