In [2]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec

# 1. Load the lemmatized dataset
lemmatized_df = pd.read_csv('lemmatized_for_similarity.csv')  # adjust the file path for your setup

# 2. Pick the query text (here: the first row of the 'content' column)
sample_text = lemmatized_df['content'].iloc[0]

# 3. Load the Word2Vec model (adjust the file path for your setup)
model = Word2Vec.load('lemmatized_model_cbow_window2_dim100.model')

# 4. Function that computes the average word vector of a text
def get_average_word2vec(text, model):
    """Return the mean embedding of the in-vocabulary tokens of ``text``.

    Parameters
    ----------
    text : str
        Whitespace-separated tokens. Any non-string value (for example the
        float NaN that ``pd.read_csv`` yields for an empty cell, or ``None``)
        is treated as an empty text instead of raising ``AttributeError``.
    model : object
        Either a model exposing its vectors as ``model.wv`` or the vectors
        object itself. The vectors must support ``token in vectors`` and
        ``vectors[token]``; ``model.vector_size`` gives the dimensionality.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the known tokens, or a zero vector
        of length ``model.vector_size`` when no token is known.
    """
    # Fall back to `model` itself when it has no `.wv` attribute.
    keyed_vectors = getattr(model, 'wv', model)

    # Missing cells arrive as float NaN / None and have no `.split()`.
    words = text.split() if isinstance(text, str) else []

    vectors = [keyed_vectors[word] for word in words if word in keyed_vectors]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

# 5. Embed the query text
sample_vec = get_average_word2vec(sample_text, model)
# The query norm is the same for every row, so compute it once up front.
sample_norm = np.linalg.norm(sample_vec)

# 6. Embed every text in the dataset and score it against the query
similarities = []
for idx, sentence in enumerate(lemmatized_df['content']):
    sent_vec = get_average_word2vec(sentence, model)
    sent_norm = np.linalg.norm(sent_vec)

    # Cosine similarity; a zero-length vector has no direction, so score it
    # 0.0 instead of dividing by zero.
    if sample_norm == 0 or sent_norm == 0:
        sim = 0.0
    else:
        sim = np.dot(sample_vec, sent_vec) / (sample_norm * sent_norm)

    similarities.append((idx, sim))

# 7. Sort by similarity (best first) and print the top 5.
# The query text is row 0 of this same column, so it ranks first with 1.0.
similarities.sort(key=lambda x: x[1], reverse=True)

print("Word2Vec ile en benzer 5 cümle:")
for idx, sim in similarities[:5]:
    print(f"Index: {idx}, Benzerlik Skoru: {sim:.4f}")
    print(f"Cümle: {lemmatized_df['content'].iloc[idx]}")
    print('-' * 50)



Word2Vec ile en benzer 5 cümle:
Index: 0, Benzerlik Skoru: 1.0000
Cümle: put cottage cheese wide bowl add egg sugar flour fork mix homogeneous mass turned sticky add flour half table small amount flour lay cottage two equal part roll sausage thickness sausage small identical piece sharp desired slightly add piece give rounded small pan bring water lower dumpling boiling water one stirring slightly slotted spoon dumpling come surface plus another finished dumpling pan plate pour jam example serve hot warm
--------------------------------------------------
Index: 11801, Benzerlik Skoru: 0.9433
Cümle: peel potato cut large boil salted water put half butter pan make mashed cut onion fry vegetable oil beautiful ruddy flour bowl mix sour cream soda mix sour cream soda flour mix constantly stirring pouring water knead dough medium dough well wrap minute film heap dough slightly struggled potato add half fried onion mix everything finished dough quarter roll sausage cm thick piece chop piece t

In [5]:
import os

import numpy as np
import pandas as pd
from gensim.models import Word2Vec

# 1. Load the datasets (one per preprocessing variant)
lemmatized_df = pd.read_csv('lemmatized_for_similarity.csv')
stemmed_df = pd.read_csv('stemmed_for_similarity.csv')

# 2. Pick the query texts (first row of the 'content' column of each dataset)
sample_text_lemmatized = lemmatized_df['content'].iloc[0]
sample_text_stemmed = stemmed_df['content'].iloc[0]

# 3. Average word-vector function
def get_average_word2vec(text, model):
    """Return the mean embedding of the in-vocabulary tokens of ``text``.

    Parameters
    ----------
    text : str
        Whitespace-separated tokens. Any non-string value (for example the
        float NaN that ``pd.read_csv`` yields for an empty cell, or ``None``)
        is treated as an empty text instead of raising ``AttributeError``.
    model : object
        Either a model exposing its vectors as ``model.wv`` or the vectors
        object itself. The vectors must support ``token in vectors`` and
        ``vectors[token]``; ``model.vector_size`` gives the dimensionality.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the known tokens, or a zero vector
        of length ``model.vector_size`` when no token is known.
    """
    # Fall back to `model` itself when it has no `.wv` attribute.
    keyed_vectors = getattr(model, 'wv', model)

    # Missing cells arrive as float NaN / None and have no `.split()`.
    words = text.split() if isinstance(text, str) else []

    vectors = [keyed_vectors[word] for word in words if word in keyed_vectors]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

# 4. Model file paths (assumed to sit in the same folder as the notebook).
# One file per combination of preprocessing x architecture x window x dimension;
# the loop nesting below yields all stemmed models first, then all lemmatized ones.
model_paths = [
    f'{preprocessing}_model_{architecture}_window{window}_dim{dim}.model'
    for preprocessing in ('stemmed', 'lemmatized')
    for architecture in ('cbow', 'skipgram')
    for window in (2, 4)
    for dim in (100, 300)
]

# 5. Evaluate the models one by one
for model_path in model_paths:
    print(f"\n{'='*50}\nModel: {model_path}\n{'='*50}")

    model = Word2Vec.load(model_path)

    # Pick the dataset and query text that match the model's preprocessing.
    # Only the file name is inspected, so a directory prefix on model_path
    # cannot silently route a lemmatized model to the stemmed data.
    if os.path.basename(model_path).startswith('lemmatized'):
        df = lemmatized_df
        sample_text = sample_text_lemmatized
    else:
        df = stemmed_df
        sample_text = sample_text_stemmed

    sample_vec = get_average_word2vec(sample_text, model)
    # The query norm is the same for every row, so compute it once per model.
    sample_norm = np.linalg.norm(sample_vec)

    similarities = []
    for idx, sentence in enumerate(df['content']):
        vec = get_average_word2vec(sentence, model)
        vec_norm = np.linalg.norm(vec)
        # Cosine similarity; a zero-length vector has no direction, so score
        # it 0.0 instead of dividing by zero.
        if sample_norm == 0 or vec_norm == 0:
            sim = 0.0
        else:
            sim = np.dot(sample_vec, vec) / (sample_norm * vec_norm)
        similarities.append((idx, sim))

    # Best match first; the query text is row 0 of the same column, so it is
    # part of the ranking.
    similarities.sort(key=lambda x: x[1], reverse=True)

    print("En benzer 5 metin:")
    for idx, sim in similarities[:5]:
        print(f"Index: {idx}, Skor: {sim:.4f}")
        print(f"Cümle: {df['content'].iloc[idx]}\n")



Model: stemmed_model_cbow_window2_dim100.model
En benzer 5 metin:
Index: 0, Skor: 1.0000
Cümle: put cottag chees wide bowl add egg sugar flour fork mix homogen mass turn sticki add flour half tabl small amount flour lay cottag two equal part roll sausag thick sausag small ident piec sharp desir slightli add piec give round small pan bring water lower dumpl boil water one stir slightli slot spoon dumpl come surfac plu anoth finish dumpl pan plate pour jam exampl serv hot warm

Index: 4037, Skor: 0.9413
Cümle: sift flour middl mound make recess drive egg mix everyth cold milk water add stir salt sugar pour steep dough leav minut chop cabbag pour boil water let stand minut drain squeez cabbag add salt egg butter margarin stir dough dumpl sever part roll one har cm thick cut small piec slightli take cut side roll mm thick thick minc meat middl tortilla fold half pinch edg make also cut circl glass round stitch boot flour cool form serv dumpl lower boil salt water kg dumpl liter water brin