# DATA MINING - WEEK 2
## NGUYEN XUAN VIET DUC - 22280012
### Lesson 2: Similarity and Distances

### I. Objectives:
After completing the practice, students will understand:
- The distance between points in numerical datasets using the `Lp` norm with `p = 1, 2, infinity`.
- The similarity of points in categorical datasets using the Overlap measure and the Inverse Document Frequency (IDF) measure.

### II. Practical Content

### 1. The distance between numerical data points

In [3]:
import pandas as pd
import numpy as np

In [3]:
from ucimlrepo import fetch_ucirepo 
  
# Download the dataset registered under id 52 in the UCI repository
ionosphere = fetch_ucirepo(id=52)

# Features and targets (as pandas dataframes)
X, y = ionosphere.data.features, ionosphere.data.targets

# Dataset-level metadata
print(ionosphere.metadata)

# Per-variable information
print(ionosphere.variables)

{'uci_id': 52, 'name': 'Ionosphere', 'repository_url': 'https://archive.ics.uci.edu/dataset/52/ionosphere', 'data_url': 'https://archive.ics.uci.edu/static/public/52/data.csv', 'abstract': 'Classification of radar returns from the ionosphere', 'area': 'Physics and Chemistry', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 351, 'num_features': 34, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['Class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1989, 'last_updated': 'Sun Jan 01 1989', 'dataset_doi': '10.24432/C5W01B', 'creators': ['V. Sigillito', 'S. Wing', 'L. Hutton', 'K. Baker'], 'intro_paper': None, 'additional_info': {'summary': 'This radar data was collected by a system in Goose Bay, Labrador.  This system consists of a phased array of 16 high-frequency antennas with a total transmitted power on the order of 6.4 kilowatts.  See the paper for more details.  

In [6]:
# Prints True if at least one feature cell is missing, False otherwise
print(X.isnull().values.any())

False


In [15]:
# Check for duplicates: count feature rows flagged by DataFrame.duplicated()
duplicate_count = X.duplicated().sum()
print(duplicate_count)

# Remove duplicates (drop_duplicates returns a new frame; X itself is left unchanged)
# NOTE(review): df_no_duplicates is not referenced by the cells visible below,
# which keep working on X — confirm whether the de-duplicated frame was meant
# to feed the distance computations.
df_no_duplicates = X.drop_duplicates()

1


In [17]:
# NumPy array of the feature table, used below for positional row indexing
array = X.values
array

array([[ 1.     ,  0.     ,  0.99539, ..., -0.54487,  0.18641, -0.453  ],
       [ 1.     ,  0.     ,  1.     , ..., -0.06288, -0.13738, -0.02447],
       [ 1.     ,  0.     ,  1.     , ..., -0.2418 ,  0.56045, -0.38238],
       ...,
       [ 1.     ,  0.     ,  0.94701, ...,  0.00442,  0.92697, -0.00577],
       [ 1.     ,  0.     ,  0.90608, ..., -0.03757,  0.87403, -0.16243],
       [ 1.     ,  0.     ,  0.8471 , ..., -0.06678,  0.85764, -0.06151]])

In [19]:
# Use the first three rows of the feature array as example points
point1, point2, point3 = array[0, :], array[1, :], array[2, :]

# Difference vectors shared by all three norms
diff_1_2 = point1 - point2
diff_1_3 = point1 - point3

# p = 1
dist01_2 = np.linalg.norm(diff_1_2, 1)
dist01_3 = np.linalg.norm(diff_1_3, 1)

# p = 2 (the default norm for a 1-D vector)
dist1_2 = np.linalg.norm(diff_1_2)
dist1_3 = np.linalg.norm(diff_1_3)

# p = inf
dist11_2 = np.linalg.norm(diff_1_2, np.inf)
dist11_3 = np.linalg.norm(diff_1_3, np.inf)

# Print in the order p=1, p=2, p=inf; within each, pair (1, 2) then pair (1, 3)
for value in (dist01_2, dist01_3, dist1_2, dist1_3, dist11_2, dist11_3):
    print(value)

13.080950000000001
5.359709999999999
2.7763589251571923
1.1697276018372824
1.12221
0.45772


In [25]:
def calculate_norms(X, n_rows=50):
    """
    Tabulate the L1, L2 and L-infinity distances between every pair of rows.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array supporting ``X[i, :]`` indexing
        Numerical data; a DataFrame is converted through ``.values``.
    n_rows : int, default 50
        Only the first ``n_rows`` rows are compared.

    Returns
    -------
    pandas.DataFrame
        One row per unordered pair ``(i, j)`` with ``i < j``, with columns
        'Point Pair', 'p=1 (Manhattan)', 'p=2 (Euclidean)', 'p=∞ (Chebyshev)'.
    """
    points = (X.values if isinstance(X, pd.DataFrame) else X)[:n_rows]
    count = len(points)

    # Enumerate pairs in the same order as a nested i < j loop
    index_pairs = [(a, b) for a in range(count) for b in range(a + 1, count)]
    deltas = [points[a, :] - points[b, :] for a, b in index_pairs]

    return pd.DataFrame({
        'Point Pair': [f"({a}, {b})" for a, b in index_pairs],
        'p=1 (Manhattan)': [np.linalg.norm(delta, 1) for delta in deltas],
        'p=2 (Euclidean)': [np.linalg.norm(delta) for delta in deltas],
        'p=∞ (Chebyshev)': [np.linalg.norm(delta, np.inf) for delta in deltas]
    })

# Pairwise distance table for the first 50 rows of X (one table row per pair)
result = calculate_norms(X, 50)
print(result)

     Point Pair  p=1 (Manhattan)  p=2 (Euclidean)  p=∞ (Chebyshev)
0        (0, 1)         13.08095         2.776359          1.12221
1        (0, 2)          5.35971         1.169728          0.45772
2        (0, 3)         21.05729         4.772563          1.60536
3        (0, 4)          6.21387         1.377347          0.53525
4        (0, 5)         15.16631         3.049072          0.97202
...         ...              ...              ...              ...
1220   (46, 48)          2.74938         0.680880          0.26250
1221   (46, 49)         23.13494         5.038499          1.95288
1222   (47, 48)         16.76954         4.153002          1.98103
1223   (47, 49)         19.93674         4.810905          2.00000
1224   (48, 49)         21.72226         4.891912          1.93369

[1225 rows x 4 columns]


In [33]:
def calculate_distance_matrices(X, n_rows=50):
    """
    Build symmetric pairwise distance matrices under the L1, L2 and L-infinity norms.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array supporting ``X[i, :]`` indexing
        Numerical data; a DataFrame is converted through ``.values``.
    n_rows : int, default 50
        Only the first ``n_rows`` rows are used.

    Returns
    -------
    tuple of numpy.ndarray
        ``(manhattan, euclidean, chebyshev)``, each of shape ``(n, n)`` where
        ``n`` is the number of rows used. Diagonals are zero.
    """
    points = X.values if isinstance(X, pd.DataFrame) else X
    points = points[:n_rows]
    size = len(points)

    # Zero-initialised, so the diagonal (distance to self) never needs writing
    manhattan = np.zeros((size, size))
    euclidean = np.zeros((size, size))
    chebyshev = np.zeros((size, size))

    # Visit only the strict upper triangle and mirror each value
    for a in range(size):
        for b in range(a + 1, size):
            delta = points[a, :] - points[b, :]
            manhattan[a, b] = manhattan[b, a] = np.linalg.norm(delta, 1)
            euclidean[a, b] = euclidean[b, a] = np.linalg.norm(delta)
            chebyshev[a, b] = chebyshev[b, a] = np.linalg.norm(delta, np.inf)

    return manhattan, euclidean, chebyshev

# Build the three pairwise distance matrices (function default: first 50 rows)
dist_p1, dist_p2, dist_pinf = calculate_distance_matrices(X)

# Two-decimal, non-scientific formatting for the array printouts below
np.set_printoptions(precision=2, suppress=True)

# Report the dimensions of each matrix
print(f"Manhattan distance matrix shape: {dist_p1.shape}")
print(f"Euclidean distance matrix shape: {dist_p2.shape}")
print(f"Chebyshev distance matrix shape: {dist_pinf.shape}")

# Print every matrix under its own heading
for heading, matrix in (
    ("\nManhattan distance matrix (p=1):", dist_p1),
    ("\nEuclidean distance matrix (p=2):", dist_p2),
    ("\nChebyshev distance matrix (p=∞):", dist_pinf),
):
    print(heading)
    print(matrix)

Manhattan distance matrix shape: (50, 50)
Euclidean distance matrix shape: (50, 50)
Chebyshev distance matrix shape: (50, 50)

Manhattan distance matrix (p=1):
[[ 0.   12.65  5.29 ... 15.11  7.53 20.61]
 [12.65  0.   15.44 ... 17.09 17.68 18.94]
 [ 5.29 15.44  0.   ... 15.08  4.03 19.61]
 ...
 [15.11 17.09 15.08 ...  0.   16.11 19.52]
 [ 7.53 17.68  4.03 ... 16.11  0.   21.48]
 [20.61 18.94 19.61 ... 19.52 21.48  0.  ]]

Euclidean distance matrix (p=2):
[[0.   2.74 1.17 ... 3.64 1.58 4.38]
 [2.74 0.   3.36 ... 3.88 3.91 3.97]
 [1.17 3.36 0.   ... 3.74 0.96 4.34]
 ...
 [3.64 3.88 3.74 ... 0.   4.1  4.79]
 [1.58 3.91 0.96 ... 4.1  0.   4.89]
 [4.38 3.97 4.34 ... 4.79 4.89 0.  ]]

Chebyshev distance matrix (p=∞):
[[0.   1.12 0.46 ... 1.85 0.57 1.61]
 [1.12 0.   1.11 ... 1.93 1.45 1.63]
 [0.46 1.11 0.   ... 2.   0.43 1.55]
 ...
 [1.85 1.93 2.   ... 0.   1.98 2.  ]
 [0.57 1.45 0.43 ... 1.98 0.   1.93]
 [1.61 1.63 1.55 ... 2.   1.93 0.  ]]


### 2. The measurement of similarity between categorical data points

In [35]:
import requests
import gzip
import shutil

url = "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz"
file_name = "kddcup.data.gz"

# Download the file from the URL, streaming it to disk chunk by chunk.
# The context manager releases the connection when the download finishes or
# fails, and the timeout keeps the cell from hanging on an unresponsive server.
with requests.get(url, stream=True, timeout=60) as response:
    # Stop on HTTP errors (404, 500, ...) instead of saving an error page as
    # the archive and reporting success.
    response.raise_for_status()
    with open(file_name, "wb") as file:
        for chunk in response.iter_content(chunk_size=1024):
            file.write(chunk)

print("File downloaded successfully!")

# Decompress the .gz archive
with gzip.open(file_name, "rb") as f_in:
    with open("kddcup.data", "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

print("File extracted successfully!")

File downloaded successfully!
File extracted successfully!


In [5]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, KBinsDiscretizer
import warnings

# header=None: every line is read as data and columns get default integer labels
df = pd.read_csv('kddcup.data', header=None)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,0,tcp,http,SF,215,45076,0,0,0,0,...,0,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1.0,0.0,1.00,0.00,0.0,0.00,0.0,0.0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,0,...,2,1.0,0.0,0.50,0.00,0.0,0.00,0.0,0.0,normal.
3,0,tcp,http,SF,233,2032,0,0,0,0,...,3,1.0,0.0,0.33,0.00,0.0,0.00,0.0,0.0,normal.
4,0,tcp,http,SF,239,486,0,0,0,0,...,4,1.0,0.0,0.25,0.00,0.0,0.00,0.0,0.0,normal.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4898426,0,tcp,http,SF,212,2288,0,0,0,0,...,255,1.0,0.0,0.33,0.05,0.0,0.01,0.0,0.0,normal.
4898427,0,tcp,http,SF,219,236,0,0,0,0,...,255,1.0,0.0,0.25,0.05,0.0,0.01,0.0,0.0,normal.
4898428,0,tcp,http,SF,218,3610,0,0,0,0,...,255,1.0,0.0,0.20,0.05,0.0,0.01,0.0,0.0,normal.
4898429,0,tcp,http,SF,219,1234,0,0,0,0,...,255,1.0,0.0,0.17,0.05,0.0,0.01,0.0,0.0,normal.


In [7]:
# Labels of the columns stored with object dtype; these are treated as the categorical attributes
categorical_columns = df.select_dtypes(include=['object']).columns
categorical_columns

Index([1, 2, 3, 41], dtype='int64')

In [9]:
# Restrict the frame to the categorical columns selected above
categorical_data = df[categorical_columns]
categorical_data

Unnamed: 0,1,2,3,41
0,tcp,http,SF,normal.
1,tcp,http,SF,normal.
2,tcp,http,SF,normal.
3,tcp,http,SF,normal.
4,tcp,http,SF,normal.
...,...,...,...,...
4898426,tcp,http,SF,normal.
4898427,tcp,http,SF,normal.
4898428,tcp,http,SF,normal.
4898429,tcp,http,SF,normal.


In [11]:
# Number of rows flagged by DataFrame.duplicated() on the categorical columns
print("Duplicated data: ", categorical_data.duplicated().sum())

Duplicated data:  4897822


In [12]:
# Keep one row per distinct combination of categorical values
df1 = categorical_data.drop_duplicates()
df1

Unnamed: 0,1,2,3,41
0,tcp,http,SF,normal.
1288,tcp,http,S2,normal.
1484,tcp,http,S1,normal.
2062,tcp,smtp,SF,normal.
2067,udp,domain_u,SF,normal.
...,...,...,...,...
4574130,tcp,bgp,REJ,neptune.
4575586,tcp,Z39_50,REJ,neptune.
4578870,tcp,smtp,SF,neptune.
4579297,tcp,other,RSTO,neptune.


In [31]:
from collections import Counter

def overlap_distance(x, y):
    """
    Count the positions at which two records hold different values.

    The records are compared position by position with ``zip``, so the
    comparison stops at the end of the shorter one.
    """
    return sum(left != right for left, right in zip(x, y))

In [33]:
def inverse_frequency_distance(x, y, value_frequencies):
    """
    Frequency-weighted mismatch score between two categorical records.

    Positions where the two records agree contribute nothing. Each position
    where they differ contributes ``1 / (1 + log(f_x) * log(f_y))``, where
    ``f_x`` and ``f_y`` are looked up in ``value_frequencies`` for that position.

    Parameters
    ----------
    x, y : sequences of categorical values, compared position by position.
    value_frequencies : sequence of dicts
        ``value_frequencies[i]`` maps a value of attribute ``i`` to its
        frequency; a value missing from the dict falls back to ``1e-10``.

    Returns
    -------
    The accumulated score (``0`` when no position differs).
    """
    total = 0
    for position, pair in enumerate(zip(x, y)):
        left, right = pair
        if left != right:
            # Look the column up only on a mismatch
            column_freqs = value_frequencies[position]
            log_left = np.log(column_freqs.get(left, 1e-10))
            log_right = np.log(column_freqs.get(right, 1e-10))
            total += 1 / (1 + log_left * log_right)

    return total

In [35]:
def calculate_value_frequencies(data):
    """
    Compute the relative frequency of every value in each column.

    Parameters
    ----------
    data : 2-D NumPy array (indexed as ``data[:, col]``)

    Returns
    -------
    list of dict
        One dict per column, mapping each value seen in that column to its
        number of occurrences divided by the number of rows.
    """
    row_total = len(data)
    return [
        {value: occurrences / row_total
         for value, occurrences in Counter(data[:, column]).items()}
        for column in range(data.shape[1])
    ]

In [45]:
sample_size = 100
df_sample = df1.head(sample_size)

data_array = df_sample.values

value_frequencies = calculate_value_frequencies(data_array)

# head() returns fewer than sample_size rows when df1 is shorter, so size the
# matrices from the rows actually present to avoid indexing past the sample.
n_points = len(data_array)

overlap_dist_matrix = np.zeros((n_points, n_points))
if_dist_matrix = np.zeros((n_points, n_points))

# Both measures are symmetric: compute the upper triangle (diagonal included)
# once and mirror each value into the lower triangle.
for i in range(n_points):
    for j in range(i, n_points):
        overlap_dist_matrix[i, j] = overlap_dist_matrix[j, i] = overlap_distance(
            data_array[i], data_array[j]
        )
        if_dist_matrix[i, j] = if_dist_matrix[j, i] = inverse_frequency_distance(
            data_array[i], data_array[j], value_frequencies
        )

# Print distance matrices
print("\nOverlap Distance Matrix (first 10x10):")
print(overlap_dist_matrix[:10, :10])

print("\nInverse Frequency Distance Matrix (first 10x10):")
print(if_dist_matrix[:10, :10])


Overlap Distance Matrix (first 10x10):
[[0. 1. 1. 1. 2. 1. 1. 1. 2. 2.]
 [1. 0. 1. 2. 3. 2. 2. 2. 3. 3.]
 [1. 1. 0. 2. 3. 2. 2. 2. 3. 3.]
 [1. 2. 2. 0. 2. 1. 1. 1. 2. 2.]
 [2. 3. 3. 2. 0. 2. 2. 2. 2. 3.]
 [1. 2. 2. 1. 2. 0. 1. 1. 2. 2.]
 [1. 2. 2. 1. 2. 1. 0. 1. 2. 2.]
 [1. 2. 2. 1. 2. 1. 1. 0. 2. 1.]
 [2. 3. 3. 2. 2. 2. 2. 2. 0. 3.]
 [2. 3. 3. 2. 3. 2. 2. 1. 3. 0.]]

Inverse Frequency Distance Matrix (first 10x10):
[[0.         0.14572536 0.19617389 0.12862497 0.8564846  0.08271983
  0.10589181 0.1691263  0.88285491 0.31137983]
 [0.14572536 0.         0.06319726 0.27435033 1.00220996 0.22844519
  0.25161717 0.31485166 1.02858027 0.45710519]
 [0.19617389 0.06319726 0.         0.32479886 1.05265849 0.27889372
  0.3020657  0.36530019 1.0790288  0.50755372]
 [0.12862497 0.27435033 0.32479886 0.         0.84541733 0.07165256
  0.09203537 0.14836841 0.87178764 0.29062194]
 [0.8564846  1.00220996 1.05265849 0.84541733 0.         0.81879441
  0.8320796  0.86995965 0.13195058 1.01221317]
 [0.