In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/test-file/tested.csv


In [2]:
titanic_data = pd.read_csv('/kaggle/input/test-file/tested.csv')  # Update the filename with the actual file path

print(titanic_data.head())

print(titanic_data.describe())

print(titanic_data.info())

   PassengerId  Survived  Pclass  \
0          892         0       3   
1          893         1       3   
2          894         0       2   
3          895         0       3   
4          896         1       3   

                                           Name     Sex   Age  SibSp  Parch  \
0                              Kelly, Mr. James    male  34.5      0      0   
1              Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   
2                     Myles, Mr. Thomas Francis    male  62.0      0      0   
3                              Wirz, Mr. Albert    male  27.0      0      0   
4  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1   

    Ticket     Fare Cabin Embarked  
0   330911   7.8292   NaN        Q  
1   363272   7.0000   NaN        S  
2   240276   9.6875   NaN        Q  
3   315154   8.6625   NaN        S  
4  3101298  12.2875   NaN        S  
       PassengerId    Survived      Pclass         Age       SibSp  \
count   418.0000

# **Clean the Data:**

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Impute missing values in 'Age' and 'Fare' with the mean
imputer = SimpleImputer(strategy='mean')
titanic_data['Age'] = imputer.fit_transform(titanic_data[['Age']])
titanic_data['Fare'] = imputer.fit_transform(titanic_data[['Fare']])

# Drop rows with missing values in other columns
titanic_data = titanic_data.dropna()

# Drop unnecessary features (e.g., 'PassengerId', 'Name', 'Ticket')
titanic_data = titanic_data.drop(['PassengerId', 'Name', 'Ticket'], axis=1)

# Standardize numerical features (e.g., 'Age' and 'Fare')
scaler = StandardScaler()
titanic_data[['Age', 'Fare']] = scaler.fit_transform(titanic_data[['Age', 'Fare']])

# Convert categorical features to numerical using one-hot encoding
titanic_data = pd.get_dummies(titanic_data, columns=['Sex', 'Embarked'], drop_first=True)

# Display cleaned dataset
print(titanic_data.head())

    Survived  Pclass       Age  SibSp  Parch      Fare            Cabin  \
12         1       1 -1.063317      1      0 -0.145102              B45   
14         1       1  0.546484      1      0 -0.387294              E31   
24         1       1  0.613559      1      3  1.923046  B57 B59 B63 B66   
26         1       1 -1.130392      0      1 -0.378059              B36   
28         0       1  0.144034      0      0 -0.739529              A21   

    Sex_male  Embarked_Q  Embarked_S  
12     False       False        True  
14     False       False        True  
24     False       False       False  
26     False       False       False  
28      True       False        True  


# **Measure Information Gain:**

In [4]:
from sklearn.feature_selection import mutual_info_classif

# Drop 'Survived' column to get the feature matrix
X = titanic_data.drop('Survived', axis=1)

# Get the target variable
y = titanic_data['Survived']

# Calculate mutual information scores for each feature
mutual_info_scores = mutual_info_classif(X, y, random_state=42)

# Create a DataFrame to display the scores
feature_scores = pd.DataFrame({'Feature': X.columns, 'Mutual_Info_Score': mutual_info_scores})
feature_scores = feature_scores.sort_values(by='Mutual_Info_Score', ascending=False)

# Display feature scores
print(feature_scores)

ValueError: could not convert string to float: 'B45'

# **Test Different Configurations with K-NN Algorithm:**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
# Get the number of features in your dataset
num_features = X.shape[1]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Function to evaluate k-NN model with different configurations
def evaluate_knn(n_features, n_neighbors, metric):
    # Ensure n_features is less than or equal to the number of features
    if n_features > num_features:
        raise ValueError(f"n_features should be <= {num_features}; got {n_features}.")

    # Select the top 'n_features' using ANOVA F-statistic
    selector = SelectKBest(score_func=f_classif, k=n_features)
    X_train_selected = selector.fit_transform(X_train_scaled, y_train)
    X_test_selected = selector.transform(X_test_scaled)

    # Build k-NN classifier
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric=metric)
    knn.fit(X_train_selected, y_train)

    # Make predictions
    y_pred = knn.predict(X_test_selected)

    # Evaluate precision
    precision = precision_score(y_test, y_pred)

    return precision

# Test different configurations
feature_counts = [5, 7, 8]  # Adjusted to the actual number of features
k_values = [3, 5, 7]  # Different values of k
metrics = ['euclidean', 'manhattan']  # Different similarity measures

# Iterate over configurations
for n_features in feature_counts:
    for k in k_values:
        for metric in metrics:
            precision = evaluate_knn(n_features, k, metric)
            print(f"Features: {n_features}, k: {k}, Metric: {metric}, Precision: {precision:.4f}")

# **Visualize and Interpret Results:**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Test different configurations
feature_counts = [5, 7, 8]  # Adjusted to the actual number of features
k_values = [3, 5, 7]  # Different values of k
metrics = ['euclidean', 'manhattan']  # Different similarity measures

# Store results
results = []

# Iterate over configurations
for n_features in feature_counts:
    for k in k_values:
        for metric in metrics:
            precision = evaluate_knn(n_features, k, metric)
            results.append({'Features': n_features, 'k': k, 'Metric': metric, 'Precision': precision})

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Visualize results using a heatmap
plt.figure(figsize=(12, 8))
pivot_table = results_df.pivot_table(index='Features', columns=['k', 'Metric'], values='Precision')
sns.heatmap(pivot_table, annot=True, cmap='viridis', fmt=".3f", linewidths=.5)
plt.title('Precision of k-NN Model for Different Configurations')
plt.show()
