In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Load the lemmatized and stemmed corpora.
#    The file names are relative to the notebook's working directory;
#    adjust the two constants below if the CSV files live elsewhere.
LEMMATIZED_CSV = 'lemmatized_for_similarity.csv'
STEMMED_CSV = 'stemmed_for_similarity.csv'

lemmatized_df = pd.read_csv(LEMMATIZED_CSV)
stemmed_df = pd.read_csv(STEMMED_CSV)



In [2]:
# 2. Pick the query text: the first row of the 'content' column in each corpus.
sample_text_stemmed = stemmed_df['content'].iat[0]        # stemmed query text
sample_text_lemmatized = lemmatized_df['content'].iat[0]  # lemmatized query text

# Echo both variants so the reader can see what is being matched against.
print("Seçilen Giriş Metni (Lemmatized):", sample_text_lemmatized)
print("Seçilen Giriş Metni (Stemmed):", sample_text_stemmed)


Seçilen Giriş Metni (Lemmatized): put cottage cheese wide bowl add egg sugar flour fork mix homogeneous mass turned sticky add flour half table small amount flour lay cottage two equal part roll sausage thickness sausage small identical piece sharp desired slightly add piece give rounded small pan bring water lower dumpling boiling water one stirring slightly slotted spoon dumpling come surface plus another finished dumpling pan plate pour jam example serve hot warm
Seçilen Giriş Metni (Stemmed): put cottag chees wide bowl add egg sugar flour fork mix homogen mass turn sticki add flour half tabl small amount flour lay cottag two equal part roll sausag thick sausag small ident piec sharp desir slightli add piec give round small pan bring water lower dumpl boil water one stir slightli slot spoon dumpl come surfac plu anoth finish dumpl pan plate pour jam exampl serv hot warm


In [3]:
# 3. Load the pre-trained Word2Vec models (train them first if the files do not exist).
#    NOTE: gensim model files are pickle-based, so only load files from a trusted source.
LEMMATIZED_MODEL_PATH = "lemmatized_model_cbow_window2_dim100.model"
STEMMED_MODEL_PATH = "stemmed_model_cbow_window2_dim100.model"

lemmatized_model = Word2Vec.load(LEMMATIZED_MODEL_PATH)  # model for the lemmatized corpus
stemmed_model = Word2Vec.load(STEMMED_MODEL_PATH)        # model for the stemmed corpus


In [4]:
# 4. Turn a text into a single vector by averaging the vectors of its words.
def get_average_word2vec(text, model):
    """Return the mean word vector of the whitespace-separated tokens in ``text``.

    Parameters
    ----------
    text : str
        Document whose tokens are separated by whitespace. Any non-string
        value (for example the float NaN that pandas produces for an empty
        CSV cell, or ``None``) is treated as an empty document.
    model
        Object exposing ``wv`` (supporting ``word in model.wv`` and
        ``model.wv[word]``) and an integer ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens found in ``model.wv``.
        Tokens missing from the vocabulary are skipped. If no token is found
        (or ``text`` is not a string) a zero vector of length
        ``model.vector_size`` is returned.
    """
    # Missing cells arrive as NaN/None; calling .split() on them would raise.
    if not isinstance(text, str):
        return np.zeros(model.vector_size)

    # Keep only the tokens the model actually knows.
    word_vectors = [model.wv[word] for word in text.split() if word in model.wv]

    if not word_vectors:
        # Nothing to average: fall back to the zero vector.
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

# 5. Embed the query text once per corpus, each with its matching model.
sample_text_vector_lemmatized = get_average_word2vec(
    text=sample_text_lemmatized,
    model=lemmatized_model,
)
sample_text_vector_stemmed = get_average_word2vec(
    text=sample_text_stemmed,
    model=stemmed_model,
)


In [5]:
# 6-8. Score every document against the query text and rank the results.
def _cosine_similarity_or_zero(query_vector, doc_vector):
    """Cosine similarity of two vectors, defined as 0.0 when either norm is zero.

    ``get_average_word2vec`` returns an all-zero vector for a document with no
    in-vocabulary words. Dividing by that zero norm would produce NaN (and a
    RuntimeWarning), and NaN scores make the ranking sort unreliable.
    """
    denominator = np.linalg.norm(query_vector) * np.linalg.norm(doc_vector)
    if denominator == 0:
        return 0.0
    return np.dot(query_vector, doc_vector) / denominator


def _rank_by_similarity(texts, model, query_vector):
    """Return ``[(position, similarity), ...]`` sorted by similarity, highest first.

    ``texts`` is iterated in order; ``position`` is the 0-based position of the
    text in that iteration, so it can be used with ``.iloc`` on the source frame.
    """
    scored = [
        (position, _cosine_similarity_or_zero(query_vector, get_average_word2vec(text, model)))
        for position, text in enumerate(texts)
    ]
    # Stable sort: documents with equal scores keep their original relative order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored


# Same procedure for both corpora, each with its own model and query vector.
similarities_lemmatized = _rank_by_similarity(
    lemmatized_df['content'], lemmatized_model, sample_text_vector_lemmatized
)
similarities_stemmed = _rank_by_similarity(
    stemmed_df['content'], stemmed_model, sample_text_vector_stemmed
)

# 9. Print the five best matches per corpus.
#    The query document itself is part of the corpus, so it is expected to
#    appear first with a score of 1.0.
print("Lemmatized ile en benzer 5 metin:")
for idx, similarity in similarities_lemmatized[:5]:
    print(f"doc{idx+1}: {lemmatized_df['content'].iloc[idx]} (Benzerlik Skoru: {similarity:.4f})")

print("\nStemmed ile en benzer 5 metin:")
for idx, similarity in similarities_stemmed[:5]:
    print(f"doc{idx+1}: {stemmed_df['content'].iloc[idx]} (Benzerlik Skoru: {similarity:.4f})")


Lemmatized ile en benzer 5 metin:
doc1: put cottage cheese wide bowl add egg sugar flour fork mix homogeneous mass turned sticky add flour half table small amount flour lay cottage two equal part roll sausage thickness sausage small identical piece sharp desired slightly add piece give rounded small pan bring water lower dumpling boiling water one stirring slightly slotted spoon dumpling come surface plus another finished dumpling pan plate pour jam example serve hot warm (Benzerlik Skoru: 1.0000)
doc11802: peel potato cut large boil salted water put half butter pan make mashed cut onion fry vegetable oil beautiful ruddy flour bowl mix sour cream soda mix sour cream soda flour mix constantly stirring pouring water knead dough medium dough well wrap minute film heap dough slightly struggled potato add half fried onion mix everything finished dough quarter roll sausage cm thick piece chop piece thickness cm keep rest dough piece dough non blank dough napkin dry middle circle put incomple