Adım 1: MI Hesaplama ve En İyi İki Kelimeyi Seçme


In [1]:
import numpy as np
from collections import Counter
from math import log2

# Training corpus: (text, label) pairs; 'S' and 'N' are the two class labels.
documents = [
    ("free free free buy discount combo pleasure", 'S'),
    ("free free free discount pleasure smile smile smile", 'S'),
    ("cat mouse", 'N'),
    ("cat cat dog dog dog dog", 'N'),
    ("mouse", 'N')
]

# Token occurrence counts: over the whole corpus and, separately, per class label.
# Both are filled in a single pass so each document is tokenised only once.
word_counts = Counter()
class_word_counts = {'S': Counter(), 'N': Counter()}
for text, tag in documents:
    tokens = text.split()
    word_counts.update(tokens)
    class_word_counts[tag].update(tokens)

# Mutual-information-style score for a single word
def calculate_mi(word, class_word_counts, word_counts, total_docs, class_docs):
    """Score ``word`` by summing ``p_wc * log2(p_wc / (p_w * p_c))`` over all classes.

    Args:
        word: The term to score.
        class_word_counts: Mapping of class label -> per-class counts indexed by word.
        word_counts: Overall counts indexed by word.
        total_docs: Number of documents; ``total_docs + 1`` normalises the
            add-one-smoothed counts and ``total_docs`` normalises the class prior.
        class_docs: Mapping of class label -> number of documents with that label.

    Returns:
        The accumulated score as a float (0.0 when there are no classes).

    NOTE(review): the smoothed ratios are not clamped. If the supplied counts are
    token occurrences rather than document frequencies, they can exceed 1 and the
    result is then not a true mutual-information value - confirm intended inputs.
    """
    score = 0.0
    smoothed_total = total_docs + 1
    for label, per_class in class_word_counts.items():
        # Add-one smoothing keeps the log argument strictly positive.
        joint = (per_class[word] + 1) / smoothed_total
        marginal = (word_counts[word] + 1) / smoothed_total
        prior = class_docs[label] / total_docs
        score += joint * log2(joint / (marginal * prior))
    return score

# Corpus size and the number of documents carrying each class label.
total_docs = len(documents)
doc_labels = [tag for _, tag in documents]
class_docs = {label: doc_labels.count(label) for label in class_word_counts}

# Score every distinct word seen in the corpus.
mi_scores = {}
for word in word_counts:
    mi_scores[word] = calculate_mi(word, class_word_counts, word_counts, total_docs, class_docs)

# Keep the two highest-scoring words; the sort is stable, so ties keep corpus order.
ranked_words = sorted(mi_scores.items(), key=lambda item: item[1], reverse=True)
top_two_words = [name for name, _ in ranked_words[:2]]
print("Top two words by MI:", top_two_words)


Top two words by MI: ['free', 'smile']


Adım 2: TF*IDF Hesaplama


In [2]:
def calculate_tf(word, doc):
    """Return the relative frequency of ``word`` among the tokens of ``doc``.

    Args:
        word: Token to look for; matching is exact and case-sensitive.
        doc: Document text, tokenised with ``str.split()`` (any whitespace).

    Returns:
        ``count(word) / number_of_tokens`` as a float, or 0.0 when ``doc``
        contains no tokens (empty or whitespace-only text).
    """
    words = doc.split()
    if not words:
        # An empty document has no term frequencies; avoid dividing by zero.
        return 0.0
    return words.count(word) / len(words)

def calculate_idf(word, documents):
    """Return the smoothed inverse document frequency of ``word``.

    Args:
        word: Token to look for; a document matches when ``word`` equals one of
            its whitespace-separated tokens.
        documents: Sequence of ``(text, label)`` pairs; the label is ignored.

    Returns:
        ``log2(len(documents) / (1 + matching_documents))``. The ``1 +`` in the
        denominator means the value is negative when every document matches.
    """
    containing = 0
    for text, _label in documents:
        if word in text.split():
            containing += 1
    return log2(len(documents) / (1 + containing))

# TF*IDF for the two selected words, one row per training document.
# IDF depends only on the word and the corpus, so it is computed once per word
# instead of once per (document, word) pair; the resulting values are unchanged.
idf_by_word = {word: calculate_idf(word, documents) for word in top_two_words}

tfidf_scores = [
    [calculate_tf(word, doc) * idf_by_word[word] for word in top_two_words]
    for doc, _ in documents
]

print("TF*IDF Scores:", tfidf_scores)


TF*IDF Scores: [[0.31584239749980264, 0.0], [0.27636209781232735, 0.4957230355827609], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]


Adım 3: Belgeleri Temsil Etme


In [3]:
import pandas as pd

# Tabular view of the training set: one row per document, one column per selected word.
feature_names = top_two_words
df = pd.DataFrame(data=tfidf_scores, columns=feature_names)
print("TF*IDF Representation:\n", df)


TF*IDF Representation:
        free     smile
0  0.315842  0.000000
1  0.276362  0.495723
2  0.000000  0.000000
3  0.000000  0.000000
4  0.000000  0.000000


Adım 4: Test Verileri İçin TF*IDF Hesaplama



In [4]:
test_documents = ["dog cat mouse cat", "Free free smile"]

# NOTE(review): "Free" is capitalised while the training text is lowercase; if
# calculate_tf matches tokens case-sensitively it will not count towards "free".
# Confirm whether the test text should be lowercased first.

# TF comes from each test document, IDF from the training corpus. IDF does not
# depend on the test document, so it is computed once per word rather than once
# per (document, word) pair; the resulting values are unchanged.
test_idf_by_word = {word: calculate_idf(word, documents) for word in top_two_words}

test_tfidf_scores = [
    [calculate_tf(word, doc) * test_idf_by_word[word] for word in top_two_words]
    for doc in test_documents
]

print("Test TF*IDF Scores:", test_tfidf_scores)


Test TF*IDF Scores: [[0.0, 0.0], [0.24565519805540206, 0.44064269829578745]]


Adım 5: KNN Algoritması ile Sınıf Tahmini


In [6]:
from sklearn.neighbors import KNeighborsClassifier

# Feature matrix and class labels for the training documents.
X_train = np.array(tfidf_scores)
y_train = np.array([tag for _text, tag in documents])

# 3-nearest-neighbour classifier; fit() returns the fitted estimator itself.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Predict a class for each test document from its TF*IDF vector.
X_test = np.array(test_tfidf_scores)
predictions = knn.predict(X_test)

# Pair each test document id with its predicted class.
test_documents_with_predictions = {}
test_documents_with_predictions['d6'] = predictions[0]
test_documents_with_predictions['d7'] = predictions[1]

for doc, prediction in test_documents_with_predictions.items():
    print(f"Prediction for {doc}: {prediction}")


Prediction for d6: N
Prediction for d7: S
