In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec

# 1. Load the dataset (text that has already been lemmatized)
lemmatized_df = pd.read_csv('lemmatized_for_similarity.csv')  # adjust the path for your environment

# 2. Pick the query text (here: the first row of the dataset)
sample_text_lemmatized = lemmatized_df['content'].iloc[0]

# ===== TF-IDF MODEL =====

# 3. Fit the TF-IDF vectorizer on the whole corpus (fit() returns the fitted vectorizer)
tfidf_lemmatized = TfidfVectorizer().fit(lemmatized_df['content'])

# 4. Build TF-IDF vectors for every document and for the query text
tfidf_matrix = tfidf_lemmatized.transform(lemmatized_df['content'])
sample_vector = tfidf_lemmatized.transform([sample_text_lemmatized])

# 5. Cosine similarity of the query against every document; result has a single row
cos_similarities = cosine_similarity(X=sample_vector, Y=tfidf_matrix)

# 6. Indices of the 5 highest scores, best first.
#    The query row itself is part of the corpus, so it is ranked as well.
top_5_idx = np.argsort(cos_similarities[0])[::-1][:5]

print("TF-IDF ile en benzer 5 cümle:")
for idx, score in zip(top_5_idx, cos_similarities[0][top_5_idx]):
    print(f"Index: {idx}, Skor: {score:.4f}")
    print(f"Cümle: {lemmatized_df['content'].iloc[idx]}")
    print('-' * 50)

# ===== WORD2VEC MODEL =====

# 7. Train Word2Vec on the whitespace-tokenized documents
sentences = [document.split() for document in lemmatized_df['content']]
model_lemma = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,  # keep every token, including those seen only once
    workers=4,
)

# 8. Helper that turns a sentence into a single vector
def get_average_word2vec(text, model):
    """Return the element-wise mean of the word vectors of the tokens in *text*.

    *text* is split on whitespace; tokens not present in ``model.wv`` are
    skipped. If none of the tokens is found (including when *text* has no
    tokens at all), a zero vector of length ``model.vector_size`` is returned.
    """
    vocab = model.wv
    embeddings = []
    for token in text.split():
        if token in vocab:
            embeddings.append(vocab[token])

    # Nothing to average: fall back to an all-zero vector.
    if not embeddings:
        return np.zeros(model.vector_size)
    return np.mean(embeddings, axis=0)

# 9. Embed the query text as the average of its word vectors
sample_vec = get_average_word2vec(sample_text_lemmatized, model_lemma)
sample_norm = np.linalg.norm(sample_vec)  # same for every row, so computed once

# 10. Embed every document and score it against the query with cosine similarity
similarities = []
for row_idx, row_text in enumerate(lemmatized_df['content']):
    row_vec = get_average_word2vec(row_text, model_lemma)
    row_norm = np.linalg.norm(row_vec)
    if sample_norm != 0 and row_norm != 0:
        score = np.dot(sample_vec, row_vec) / (sample_norm * row_norm)
    else:
        # A zero vector has no direction; score it as 0 instead of dividing by zero.
        score = 0
    similarities.append((row_idx, score))

# 11. Order by score, highest first, and print the top 5
similarities.sort(key=lambda pair: pair[1], reverse=True)

print("\nWord2Vec ile en benzer 5 cümle:")
for row_idx, score in similarities[:5]:
    print(f"Index: {row_idx}, Skor: {score:.4f}")
    print(f"Cümle: {lemmatized_df['content'].iloc[row_idx]}")
    print('-' * 50)



TF-IDF ile en benzer 5 cümle:
Index: 0, Skor: 1.0000
Cümle: put cottage cheese wide bowl add egg sugar flour fork mix homogeneous mass turned sticky add flour half table small amount flour lay cottage two equal part roll sausage thickness sausage small identical piece sharp desired slightly add piece give rounded small pan bring water lower dumpling boiling water one stirring slightly slotted spoon dumpling come surface plus another finished dumpling pan plate pour jam example serve hot warm
--------------------------------------------------
Index: 32732, Skor: 0.5122
Cümle: use house cottage cheese advise put dish add sugar carefully help spoon roll process grinding slightly salt cottage cheese drive one egg cottage cheese knead sugar homogeneous divide flour half add half mixture cottage cheese egg leave second half process flying lazy dumpling little flour throat apply cottage cheese mass help hand slightly roll curd dough sausage dough got mode small piece sharp pour three liter wa