In [20]:
# Binary to Integer and Integer to Binary
from bitstring import BitArray
# NOTE(review): nothing visible in this notebook uses `dataset_length`; this
# looks like an accidental IDE auto-import of a pandas-internal constant.
# Kept so that cells outside this view are not broken -- confirm and remove.
from pandas.io.sas.sas_constants import dataset_length

# Binary to Integer
# `.int` interprets the bits as a signed two's-complement number, so a bit
# string starting with "1" yields a negative value; use `.uint` when an
# unsigned interpretation is wanted.
binary_string = "00001101"
b = BitArray(bin=binary_string)
print(b.int)  # Output: 13

# Integer to Binary
# `int=` together with `length=8` builds a signed 8-bit pattern.
integer_value = 44
b = BitArray(int=integer_value, length=8)
print(b.bin)  # Output: 00101100

13
00101100


In [40]:
# ASCII to Binary and Binary to ASCII
from bitstring import BitArray

# ASCII to Binary
# Encode the whole string once; every encoded byte contributes 8 bits to the
# resulting bit string.
ascii_text = "nice"
binary_bits = BitArray(bytes=ascii_text.encode()).bin
print(binary_bits)

# Binary to ASCII
# Decode the complete byte sequence in a single call instead of one byte at a
# time, so characters that occupy several bytes in UTF-8 decode correctly too.
# The bit string must be a whole number of bytes long (a multiple of 8 bits).
binary_string = "0110010001110101001000000110101101101100011001010110100101101110011001010111001000100000011100000110100101101101011011010110010101101100"
ascii_text = BitArray(bin=binary_string).bytes.decode()
print(ascii_text)

01101110011010010110001101100101
du kleiner pimmel


In [None]:
# Levenshtein Distance
import Levenshtein

# The two strings to compare.
# NOTE(review): consider swapping in neutral sample strings before this
# notebook is shared.
word_1, word_2 = "mr. geil", "sieg heil"

# Edit distance between the two strings, as computed by the library.
distance = Levenshtein.distance(word_1, word_2)
print(distance)

In [8]:
# Ordinal / One-Hot Encoding
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder


# Small sample table: two categorical columns and one numeric column.
data = {"Marke": ["saab", "bmw", "mitsubishi", "mitsubishi", "bmw"],
        "Form": ["limousine", "cabrio", "kombi", "kombi", "limousine"],
        "Preis": [20000, 30000, 25000, 25000, 30000]}
# Two independent frames so each encoding starts from the untouched data.
df_ordinal = pd.DataFrame(data)
df_onehot = pd.DataFrame(data)


# OrdinalEncoder for categorical columns
# Read the source columns from this cell's own frame. Reading from a global
# `df` only works if some other cell happened to define one, and fails on a
# fresh kernel.
encoder = OrdinalEncoder()
df_ordinal[["Marke_ordinal", "Form_ordinal"]] = encoder.fit_transform(df_ordinal[["Marke", "Form"]])

# One-Hot Encoding for categorical columns
# Each category becomes its own indicator column; "Preis" is left as is.
df_onehot = pd.get_dummies(df_onehot, columns=["Marke", "Form"])

# Show the one-hot encoded frame as the cell result.
df_onehot

Unnamed: 0,Preis,Marke_bmw,Marke_mitsubishi,Marke_saab,Form_cabrio,Form_kombi,Form_limousine
0,20000,False,False,True,False,False,True
1,30000,True,False,False,True,False,False
2,25000,False,True,False,False,True,False
3,25000,False,True,False,False,True,False
4,30000,True,False,False,False,False,True


In [None]:
# Similarity
# All imports for this cell are grouped at the top.
import numpy as np
from scipy.spatial.distance import cityblock
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity

# Two binary vectors of equal length that are compared below.
set_1 = (1, 0, 1, 0)
set_2 = (0, 0, 1, 1)

# Jaccard Similarity
jaccard_similarity = jaccard_score(set_1, set_2)
print(f"Jaccard Similarity: {jaccard_similarity}")

# Cosine Similarity
# The result gets its own name so the imported `cosine_similarity` function is
# not overwritten. The function expects 2-D inputs and returns a 1x1 matrix
# here, so the single entry is extracted and printed as a plain number.
cosine_sim = cosine_similarity(np.array(set_1).reshape(1, -1), np.array(set_2).reshape(1, -1))[0, 0]
print(f"Cosine Similarity: {cosine_sim}")

# Manhattan Distance
manhattan_distance = cityblock(set_1, set_2)
print(f"Manhattan Distance: {manhattan_distance}")

In [22]:
# Cosine Similarity Matrix

from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Read the ratings table; the first CSV column supplies the row labels.
ratings = pd.read_csv("similarity.csv", index_col=0, delimiter=",")

# Keep only the float/int columns for the similarity computation.
numeric_data = ratings.select_dtypes(include=[float, int])

# Pairwise cosine similarity between the rows of the numeric data.
cosine_sim_matrix = cosine_similarity(numeric_data)

# Wrap the raw matrix in a frame labelled by the row labels on both axes.
cosine_sim_df = pd.DataFrame(
    data=cosine_sim_matrix,
    index=ratings.index,
    columns=ratings.index,
)

# Show the labelled similarity matrix.
print(cosine_sim_df)

          A         B         C         D         E         F         X
A  1.000000  0.650791  0.976187  0.997054  0.998868  0.795432  0.998868
B  0.650791  1.000000  0.800000  0.707107  0.613941  0.977802  0.613941
C  0.976187  0.800000  1.000000  0.989949  0.964764  0.907959  0.964764
D  0.997054  0.707107  0.989949  1.000000  0.992278  0.839570  0.992278
E  0.998868  0.613941  0.964764  0.992278  1.000000  0.765705  1.000000
F  0.795432  0.977802  0.907959  0.839570  0.765705  1.000000  0.765705
X  0.998868  0.613941  0.964764  0.992278  1.000000  0.765705  1.000000


In [20]:
# Manhattan Distance Matrix

from scipy.spatial.distance import cityblock
import pandas as pd

# Read the ratings table; the first CSV column supplies the row labels.
ratings = pd.read_csv("similarity.csv", delimiter=",", index_col=0)
# Keep only the float/int columns for the distance computation.
numeric_data = ratings.select_dtypes(include=[float, int])

# Work on the raw row vectors by position rather than by label, so duplicate
# index labels cannot turn a row lookup into a multi-row frame.
row_vectors = numeric_data.to_numpy()

# Build the full matrix first and create the frame once. Filling an empty
# frame cell-by-cell with `.loc` leaves it with object dtype; constructing it
# from the nested list gives a numeric dtype instead.
pairwise_distances = [
    [cityblock(row_a, row_b) for row_b in row_vectors]
    for row_a in row_vectors
]
manhattan_distance_matrix = pd.DataFrame(
    pairwise_distances,
    index=ratings.index,
    columns=ratings.index,
)

print(manhattan_distance_matrix)

    A   B   C   D   E   F
A   0  11   4   3   5  10
B  11   0   7   8  10   9
C   4   7   0   1   7  12
D   3   8   1   0   6  11
E   5  10   7   6   0  11
F  10   9  12  11  11   0


In [12]:
# Setting up control group
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Iris dataset
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# Split the data into training and control sets (20% control, fixed seed so
# the split is reproducible).
train_df, control_df = train_test_split(df, test_size=0.2, random_state=42)

# Add a "training" column to indicate the split.
# `assign` returns new frames instead of writing into the objects returned by
# the split, which avoids pandas' SettingWithCopyWarning.
train_df = train_df.assign(training=1)  # 1 for training
control_df = control_df.assign(training=0)  # 0 for control

# Combine the two DataFrames back together, restoring the original row order.
df_split = pd.concat([train_df, control_df]).sort_index()
df_split

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),training
0,5.1,3.5,1.4,0.2,1
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1
3,4.6,3.1,1.5,0.2,1
4,5.0,3.6,1.4,0.2,1
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,0
146,6.3,2.5,5.0,1.9,1
147,6.5,3.0,5.2,2.0,1
148,6.2,3.4,5.4,2.3,1
