# Install Annoy

In [1]:
# Step 1: Install Annoy
!pip install annoy

[0m

# Import necessary libraries

In [2]:
# Step 2: Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from annoy import AnnoyIndex
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the dataset

In [3]:
# Step 3: Load the dataset
# Location of the customer CSV inside the Kaggle input directory.
customers_csv_path = "/kaggle/input/fake-turkish-names-csv-format/Customers_100K_Row.csv"

# Read the whole file into a DataFrame; every later cell works from this frame.
customers_df = pd.read_csv(customers_csv_path)

# Preprocess the data

In [4]:
# Step 4: Preprocess the data
# Put a single space between every character of each full name
# (e.g. "ab c" -> "a b   c") and store the result in a new column, so that a
# whitespace-oriented tokenizer can later see the individual characters.
customers_df["CHAR_NAMESURNAME"] = customers_df["NAMESURNAME"].map(" ".join)

# Vectorize the data using TF-IDF

In [5]:
# Step 5: Vectorize the data using TF-IDF
def preprocess(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize``.

    Args:
        text: The string to split into tokens.

    Returns:
        Whatever ``nltk.word_tokenize`` returns for ``text`` (the token
        sequence), unchanged.
    """
    tokens = nltk.word_tokenize(text)
    return tokens

# TF-IDF over 1- to 3-token n-grams, using `preprocess` as the tokenizer.
# NOTE(review): since CHAR_NAMESURNAME holds space-separated characters, these
# n-grams presumably behave like character n-grams -- confirm against the
# tokenizer's actual output.
vectorizer = TfidfVectorizer(
    tokenizer=preprocess,
    ngram_range=(1, 3),
)

# Learn the vocabulary/IDF weights and encode every name in one pass.
char_names = customers_df["CHAR_NAMESURNAME"]
tfidf_matrix = vectorizer.fit_transform(char_names)

# Build the Annoy index

In [6]:
# Step 6: Build the Annoy index
# Rows are the items to index; columns give the vector dimensionality `f`
# that AnnoyIndex must be created with.
n_items, f = tfidf_matrix.shape
t = AnnoyIndex(f, 'angular')

# add_item is fed one dense 1-D vector per row, so each sparse row is
# densified individually instead of converting the whole matrix at once.
for row_id in range(n_items):
    dense_row = tfidf_matrix[row_id].toarray().ravel()
    t.add_item(row_id, dense_row)

# Build the forest with 10 trees; left as the last expression so the cell
# displays the call's return value.
t.build(10)

True

# Query the index

In [7]:
# Step 7: Query the index
def find_nearest_neighbors(index, vec, k=10):
    """Return the customer rows for the ``k`` neighbours the index finds for ``vec``.

    Args:
        index: Object exposing ``get_nns_by_vector(vec, k)`` (the Annoy index
            in this notebook).
        vec: Query vector passed straight through to the index.
        k: Number of neighbours to request. Defaults to 10.

    Returns:
        The rows of the module-level ``customers_df`` selected positionally
        (``iloc``) by the ids the index returned, in the order returned.
    """
    neighbor_ids = index.get_nns_by_vector(vec, k)
    matches = customers_df.iloc[neighbor_ids]
    return matches

# Demo query: use the first row of the TF-IDF matrix as the query point and
# ask the Annoy index for its 5 nearest neighbours.
k = 5            # Number of nearest neighbors
query_index = 0  # Index of the query point

# The index is queried with a dense 1-D vector, so densify the sparse row.
query_vector = tfidf_matrix[query_index].toarray().ravel()

nearest_neighbors = find_nearest_neighbors(t, query_vector, k)
print(nearest_neighbors)

          ID   NAME_ SURNAME    NAMESURNAME GENDER   BIRTHDATE  \
0          1  Melike  ÖZİPEK  Melike ÖZİPEK      K  1970-02-13   
43122  43123   Asiye  ÖZİPEK   Asiye ÖZİPEK      K  1986-10-15   
77774  77775  Melike    ÖZER    Melike ÖZER      K  1985-01-13   
25705  25706  Melike    ÖZAY    Melike ÖZAY      K  1967-08-31   
55642  55643  Melike   ÖZDAL   Melike ÖZDAL      K  1995-10-16   

                                       EMAIL     TCNUMBER        TELNR  \
0                   mel_ozipek@fakeyahoo.com  42151320682  555-4182308   
43122             asi_ozipek@fakehotmail.com  74335685034  555-1315279   
77774                 mel_ozer@fakeyahoo.com  81562064518  536-1969269   
25705               mel_ozay@fakehotmail.com  88802058818  532-2273035   
55642  mel_ozdal@fakegmail.com@fakeyahoo.com  44758047672  554-1959916   

             CITY      TOWN         DISTRICT                   STREET  \
0       Gaziantep   OĞUZELİ     ÇAYBAŞI MAH.  BESİ ORG.SAN.79 CADDESİ   
43122    İst

# Defining the similarity metric

In [8]:
# Step 8: Defining the similarity metric

from sklearn.metrics.pairwise import cosine_similarity

def find_exact_nearest_neighbors(tfidf_matrix, query_index, k=10):
    """Return the ``k`` rows most cosine-similar to one row, by brute force.

    Args:
        tfidf_matrix: Matrix whose rows are the vectors to compare.
        query_index: Row of ``tfidf_matrix`` to use as the query.
        k: Number of rows to return. Defaults to 10.

    Returns:
        Rows of the module-level ``customers_df`` selected positionally
        (``iloc``), ordered from most to least similar. The query row is
        compared against every row of the matrix, itself included, so it is
        not excluded from the result.
    """
    query_row = tfidf_matrix[query_index]
    # cosine_similarity yields a 1 x n_rows result; keep its single row.
    scores = cosine_similarity(query_row, tfidf_matrix)[0]
    # Negating the scores makes the ascending argsort rank best matches first.
    ranking = np.argsort(-scores)
    top_k = ranking[:k]
    return customers_df.iloc[top_k]


# Getting Precision and Recall

In [9]:
# Step 9: Getting Precision and Recall


import numpy as np

# Compare the approximate (Annoy) neighbours with the exact cosine-similarity
# neighbours for the first `num_queries` rows and average precision/recall.
num_queries = 100  # the first `num_queries` rows are used as queries
k = 5              # neighbours requested from both searches
precision = 0
recall = 0

for query_index in range(num_queries):
    # The Annoy lookup is given a dense 1-D vector for this row.
    query_vector = tfidf_matrix[query_index].toarray()[0]

    approx_neighbors = find_nearest_neighbors(t, query_vector, k)
    exact_neighbors = find_exact_nearest_neighbors(tfidf_matrix, query_index, k)

    # Compare by DataFrame index label so each customer row counts once.
    approx_neighbors_set = set(approx_neighbors.index)
    exact_neighbors_set = set(exact_neighbors.index)

    true_positives = len(approx_neighbors_set & exact_neighbors_set)
    # Count the errors from the sets that were actually returned instead of
    # assuming both contain exactly k rows: if either search yields fewer than
    # k distinct rows, `k - true_positives` would overstate the errors and
    # force precision and recall to be equal.
    false_positives = len(approx_neighbors_set - exact_neighbors_set)
    false_negatives = len(exact_neighbors_set - approx_neighbors_set)

    retrieved = true_positives + false_positives  # size of the approximate set
    relevant = true_positives + false_negatives   # size of the exact set
    # An empty set contributes 0 rather than raising ZeroDivisionError.
    precision += true_positives / retrieved if retrieved else 0.0
    recall += true_positives / relevant if relevant else 0.0

# Average the per-query scores.
precision /= num_queries
recall /= num_queries

# NOTE(review): rows with identical TF-IDF vectors tie in similarity, so the
# exact top-k set may be ambiguous and these scores could understate the
# index's quality -- confirm whether the data contains duplicate names.
print(f"Precision: {precision}")
print(f"Recall: {recall}")


Precision: 0.6079999999999999
Recall: 0.6079999999999999
