In [28]:
import pandas as pd
import numpy as np
import math

In [29]:
# Read the Titanic passenger workbook into a DataFrame and preview it
df = pd.read_excel('Lab2_titanic.xlsx')
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:", df.head(), sep="\n")

Dataset shape: (891, 12)

First 5 rows:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0       

In [30]:
# Report how many null entries each column contains
print("Missing values in each column:", df.isnull().sum(), sep="\n")

Missing values in each column:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [41]:
# Work on a copy so the raw frame `df` stays available for re-runs
df_clean = df.copy()

# Remove Cabin column
df_clean = df_clean.drop('Cabin', axis=1)
print("Removed Cabin column from the dataset")

# For the numeric Age column: fill gaps with the MEDIAN age of passengers
# that share the same Sex and Pclass
overall_age_median = df_clean['Age'].median()
group_age_median = df_clean.groupby(['Sex', 'Pclass'])['Age'].transform('median')
df_clean['Age'] = df_clean['Age'].fillna(group_age_median)
# A group with no recorded ages has no median of its own; fall back to the
# overall median so no Age value is left missing
df_clean['Age'] = df_clean['Age'].fillna(overall_age_median)

# For categorical columns (using mode)
categorical_columns = df_clean.select_dtypes(include=['object']).columns
for col in categorical_columns:
    if df_clean[col].isnull().any():
        # mode() is empty only when the column holds no non-null value at all
        col_modes = df_clean[col].mode()
        mode_val = col_modes.iloc[0] if not col_modes.empty else 'Unknown'
        df_clean[col] = df_clean[col].fillna(mode_val)
        print(f"Filled {col} missing values with mode: {mode_val}")

print("\nMissing values after handling:")
print(df_clean.isnull().sum())

Removed Cabin column from the dataset
Filled Embarked missing values with mode: S

Missing values after handling:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


## 2. Feature Scaling

In [43]:
# Numeric columns chosen for the scaling demonstration
selected_features = ['Age', 'Fare']
print("Selected features for normalization:", selected_features)

# Keep only the requested features that are actually present in the cleaned frame
available_features = [name for name in selected_features if name in df_clean.columns]
print("Available features in dataset:", available_features)

# Restrict to the first 10 rows so the manual calculations stay easy to follow
sample_data = df_clean[available_features].iloc[:10]
print("\nSample data for scaling:", sample_data, sep="\n")

Selected features for normalization: ['Age', 'Fare']
Available features in dataset: ['Age', 'Fare']

Sample data for scaling:
    Age     Fare
0  22.0   7.2500
1  38.0  71.2833
2  26.0   7.9250
3  35.0  53.1000
4  35.0   8.0500
5  25.0   8.4583
6  54.0  51.8625
7   2.0  21.0750
8  27.0  11.1333
9  14.0  30.0708


### Min-Max Normalization (Manual Implementation)
Formula: X_normalized = (X - X_min) / (X_max - X_min)

In [44]:
def min_max_normalize(data):
    """
    Manual implementation of Min-Max normalization
    Formula: (x - min) / (max - min)

    Args:
        data: pandas DataFrame whose columns can all be cast to float.
            The input frame is not modified.

    Returns:
        A float DataFrame with the same index and columns as ``data`` where
        every column is rescaled to the range [0, 1]. A column whose values
        are all equal (max == min) is mapped to 0.0 instead of producing
        NaN from a 0/0 division. Missing values stay missing.

    Side effects:
        Prints the minimum and maximum of every column.
    """
    normalized_data = data.copy().astype(float)

    for column in data.columns:
        col_min = data[column].min()
        col_max = data[column].max()
        col_range = col_max - col_min

        print(f"\n{column}: min={col_min}, max={col_max}")

        values = normalized_data[column]
        if col_range == 0:
            # Constant column: the formula would divide by zero, so place
            # every present value at the bottom of the range
            normalized_data[column] = values.where(values.isna(), 0.0)
        else:
            # Apply min-max formula to the whole column at once
            normalized_data[column] = (values - col_min) / col_range

    return normalized_data

# Rescale the sample rows to [0, 1] and show the result
minmax_normalized = min_max_normalize(sample_data)
print("\nMin-Max Normalized Data:", minmax_normalized, sep="\n")


Age: min=2.0, max=54.0

Fare: min=7.25, max=71.2833

Min-Max Normalized Data:
        Age      Fare
0  0.384615  0.000000
1  0.692308  1.000000
2  0.461538  0.010541
3  0.634615  0.716034
4  0.634615  0.012493
5  0.442308  0.018870
6  1.000000  0.696708
7  0.000000  0.215903
8  0.480769  0.060645
9  0.230769  0.356390


### Z-Score Standardization (Manual Implementation)
Formula: X_standardized = (X - μ) / σ

In [46]:
def z_score_standardize(data):
    """
    Manual implementation of Z-score standardization
    Formula: (x - mean) / std, using the sample standard deviation (n - 1)

    Args:
        data: pandas DataFrame whose columns can all be cast to float.
            The input frame is not modified.

    Returns:
        A float DataFrame with the same index and columns as ``data`` where
        every column has been centred on its mean and divided by its sample
        standard deviation. A column with zero spread (all values equal, or
        fewer than two rows) is mapped to 0.0 instead of dividing by zero.

    Side effects:
        Prints the mean and standard deviation of every column.
    """
    standardized_data = data.copy().astype(float)

    for column in data.columns:
        values = standardized_data[column]
        n = len(values)

        # Calculate mean manually
        col_mean = sum(values) / n if n else float('nan')

        # Calculate standard deviation manually; the sample formula divides
        # by (n - 1), so it needs at least two observations
        if n > 1:
            squared_diffs = [(v - col_mean) ** 2 for v in values]
            variance = sum(squared_diffs) / (n - 1)
            col_std = math.sqrt(variance)
        else:
            col_std = 0.0

        print(f"\n{column}: mean={col_mean:.3f}, std={col_std:.3f}")

        if col_std == 0:
            # No spread: every present value sits exactly on the mean
            standardized_data[column] = values.where(values.isna(), 0.0)
        else:
            # Apply z-score formula to the whole column at once
            standardized_data[column] = (values - col_mean) / col_std

    return standardized_data

# Standardize the sample rows and show the result
zscore_standardized = z_score_standardize(sample_data)
print("\nZ-Score Standardized Data:", zscore_standardized, sep="\n")


Age: mean=27.800, std=14.125

Fare: mean=27.021, std=23.602

Z-Score Standardized Data:
        Age      Fare
0 -0.410624 -0.837678
1  0.722132  1.875375
2 -0.127435 -0.809078
3  0.509740  1.104959
4  0.509740 -0.803782
5 -0.198232 -0.786483
6  1.854888  1.052527
7 -1.826569 -0.251921
8 -0.056638 -0.673145
9 -0.977002  0.129226


## 3. Similarity and Dissimilarity Measures

In [35]:
# Prepare sample vectors for similarity calculations
# Take first two rows of normalized data as sample vectors
vector1 = minmax_normalized.iloc[0].values
vector2 = minmax_normalized.iloc[1].values

print("Vector 1:", vector1)
print("Vector 2:", vector2)

Vector 1: [0. 0. 1.]
Vector 2: [0.11111111 1.         0.        ]


### 1. Pearson's Correlation (Manual Implementation)
Formula: r = Σ[(xi - x̄)(yi - ȳ)] / √[Σ(xi - x̄)² × Σ(yi - ȳ)²]

In [36]:
def pearson_correlation(x, y):
    """
    Manual implementation of Pearson's correlation coefficient

    Args:
        x: sequence of numbers (list, tuple or 1-D array).
        y: sequence of numbers with the same length as ``x``.

    Returns:
        The correlation coefficient as a float. Returns 0.0 when the
        coefficient is undefined: empty input, or at least one vector with
        no variation (zero denominator).

    Raises:
        ValueError: if ``x`` and ``y`` differ in length.

    Side effects:
        Prints the two means and, when the coefficient is defined, the
        numerator and denominator of the formula.
    """
    n = len(x)
    # Both means are divided by the same n, so unequal lengths would give a
    # silently wrong result rather than an error
    if len(y) != n:
        raise ValueError(
            f"x and y must have the same length (got {n} and {len(y)})"
        )
    if n == 0:
        return 0.0

    # Calculate means
    x_mean = sum(x) / n
    y_mean = sum(y) / n

    print(f"Mean of x: {x_mean:.4f}")
    print(f"Mean of y: {y_mean:.4f}")

    # Accumulate the covariance term and the two sums of squared deviations
    numerator = 0
    sum_x_squared = 0
    sum_y_squared = 0

    for x_val, y_val in zip(x, y):
        x_diff = x_val - x_mean
        y_diff = y_val - y_mean

        numerator += x_diff * y_diff
        sum_x_squared += x_diff ** 2
        sum_y_squared += y_diff ** 2

    # Calculate correlation coefficient
    denominator = math.sqrt(sum_x_squared * sum_y_squared)

    # A constant vector has no variance, so the coefficient is undefined
    if denominator == 0:
        return 0.0

    correlation = numerator / denominator

    print(f"Numerator: {numerator:.4f}")
    print(f"Denominator: {denominator:.4f}")

    return correlation

# Calculate Pearson's correlation between the two sample vectors and
# report it rounded to four decimals
pearson_corr = pearson_correlation(vector1, vector2)
print(f"\nPearson's Correlation: {pearson_corr:.4f}")

Mean of x: 0.3333
Mean of y: 0.3704
Numerator: -0.3704
Denominator: 0.6329

Pearson's Correlation: -0.5852


### 2. Cosine Similarity (Manual Implementation)
Formula: cosine_similarity = (A · B) / (||A|| × ||B||)

In [37]:
def cosine_similarity(x, y):
    """
    Manual implementation of cosine similarity

    Args:
        x: sequence of numbers (list, tuple or 1-D array).
        y: sequence of numbers with the same length as ``x``.

    Returns:
        The cosine of the angle between the two vectors as a float.
        Returns 0.0 when either vector has zero magnitude, because the
        angle is undefined in that case.

    Raises:
        ValueError: if ``x`` and ``y`` differ in length.

    Side effects:
        Prints the dot product and both magnitudes.
    """
    if len(x) != len(y):
        raise ValueError(
            f"x and y must have the same length (got {len(x)} and {len(y)})"
        )

    # Accumulate the dot product and both squared magnitudes in one pass
    dot_product = 0
    magnitude_x = 0
    magnitude_y = 0

    for x_val, y_val in zip(x, y):
        dot_product += x_val * y_val
        magnitude_x += x_val ** 2
        magnitude_y += y_val ** 2

    magnitude_x = math.sqrt(magnitude_x)
    magnitude_y = math.sqrt(magnitude_y)

    print(f"Dot product: {dot_product:.4f}")
    print(f"Magnitude of x: {magnitude_x:.4f}")
    print(f"Magnitude of y: {magnitude_y:.4f}")

    # Calculate cosine similarity
    if magnitude_x == 0 or magnitude_y == 0:
        return 0.0

    return dot_product / (magnitude_x * magnitude_y)

# Calculate Cosine similarity between the two sample vectors and report it
# rounded to four decimals
cosine_sim = cosine_similarity(vector1, vector2)
print(f"\nCosine Similarity: {cosine_sim:.4f}")

Dot product: 0.0000
Magnitude of x: 1.0000
Magnitude of y: 1.0062

Cosine Similarity: 0.0000


### 3. Jaccard Similarity (Manual Implementation)
Formula: J(A,B) = |A ∩ B| / |A ∪ B|
Note: For continuous data, we'll convert to binary using a threshold

In [38]:
def jaccard_similarity(x, y, threshold=0.5):
    """
    Manual implementation of Jaccard similarity
    Convert continuous values to binary using threshold

    Each vector is turned into the set of positions whose value is strictly
    greater than ``threshold``; the similarity is |A ∩ B| / |A ∪ B| of
    those two index sets.

    Args:
        x: sequence of numbers (list, tuple or 1-D array).
        y: sequence of numbers with the same length as ``x``.
        threshold: cut-off above which a position counts as "present".

    Returns:
        The Jaccard similarity as a float in [0, 1]. Returns 0.0 when no
        position exceeds the threshold in either vector (empty union).

    Raises:
        ValueError: if ``x`` and ``y`` differ in length.

    Side effects:
        Prints both index sets, their intersection and their union.
    """
    if len(x) != len(y):
        raise ValueError(
            f"x and y must have the same length (got {len(x)} and {len(y)})"
        )

    # Convert to binary sets: keep the positions whose value exceeds the threshold
    set_x = {i for i, value in enumerate(x) if value > threshold}
    set_y = {i for i, value in enumerate(y) if value > threshold}

    print(f"Binary set x (indices > {threshold}): {set_x}")
    print(f"Binary set y (indices > {threshold}): {set_y}")

    # Positions present in both vectors / in at least one vector
    intersection = set_x & set_y
    union = set_x | set_y

    print(f"Intersection: {intersection}")
    print(f"Union: {union}")

    # Calculate Jaccard similarity; an empty union would divide by zero
    if not union:
        return 0.0

    return len(intersection) / len(union)

# Calculate Jaccard similarity between the two sample vectors (default
# threshold of 0.5) and report it rounded to four decimals
jaccard_sim = jaccard_similarity(vector1, vector2)
print(f"\nJaccard Similarity: {jaccard_sim:.4f}")

Binary set x (indices > 0.5): {2}
Binary set y (indices > 0.5): {1}
Intersection: set()
Union: {1, 2}

Jaccard Similarity: 0.0000


### 4. Euclidean Distance (Manual Implementation)
Formula: d(p,q) = √[Σ(pi - qi)²]

In [39]:
def euclidean_distance(x, y):
    """
    Manual implementation of Euclidean distance

    Args:
        x: sequence of numbers (list, tuple or 1-D array).
        y: sequence of numbers with the same length as ``x``.

    Returns:
        The straight-line distance between the two points as a float
        (0.0 for identical or empty vectors).

    Raises:
        ValueError: if ``x`` and ``y`` differ in length.

    Side effects:
        Prints every coordinate-wise squared difference and their sum.
    """
    if len(x) != len(y):
        raise ValueError(
            f"x and y must have the same length (got {len(x)} and {len(y)})"
        )

    sum_squared_diff = 0

    print("Point-wise differences:")
    for i, (x_val, y_val) in enumerate(zip(x, y)):
        squared_diff = (x_val - y_val) ** 2
        sum_squared_diff += squared_diff
        print(f"  Point {i}: ({x_val:.3f} - {y_val:.3f})² = {squared_diff:.6f}")

    print(f"Sum of squared differences: {sum_squared_diff:.6f}")

    # Calculate Euclidean distance
    return math.sqrt(sum_squared_diff)

# Calculate Euclidean distance between the two sample vectors and report it
# rounded to four decimals
euclidean_dist = euclidean_distance(vector1, vector2)
print(f"\nEuclidean Distance: {euclidean_dist:.4f}")

Point-wise differences:
  Point 0: (0.000 - 0.111)² = 0.012346
  Point 1: (0.000 - 1.000)² = 1.000000
  Point 2: (1.000 - 0.000)² = 1.000000
Sum of squared differences: 2.012346

Euclidean Distance: 1.4186


## Summary of Results

In [40]:
# Headline table: the four measures computed for vector1 vs vector2
print(
    "=" * 50,
    "SUMMARY OF SIMILARITY AND DISTANCE MEASURES",
    "=" * 50,
    f"Pearson's Correlation:  {pearson_corr:.4f}",
    f"Cosine Similarity:      {cosine_sim:.4f}",
    f"Jaccard Similarity:     {jaccard_sim:.4f}",
    f"Euclidean Distance:     {euclidean_dist:.4f}",
    "=" * 50,
    sep="\n",
)

# Reading guide: direction and value range of each measure
print(
    "\nInterpretation:",
    "- Higher correlation/similarity values indicate more similar vectors",
    "- Lower distance values indicate more similar vectors",
    "- Pearson correlation ranges from -1 to 1",
    "- Cosine similarity ranges from -1 to 1",
    "- Jaccard similarity ranges from 0 to 1",
    "- Euclidean distance ranges from 0 to infinity",
    sep="\n",
)

SUMMARY OF SIMILARITY AND DISTANCE MEASURES
Pearson's Correlation:  -0.5852
Cosine Similarity:      0.0000
Jaccard Similarity:     0.0000
Euclidean Distance:     1.4186

Interpretation:
- Higher correlation/similarity values indicate more similar vectors
- Lower distance values indicate more similar vectors
- Pearson correlation ranges from -1 to 1
- Cosine similarity ranges from -1 to 1
- Jaccard similarity ranges from 0 to 1
- Euclidean distance ranges from 0 to infinity
