In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec

# Datasets: one CSV per preprocessing variant; only the 'content' column is used here.
lemmatized_df = pd.read_csv('lemmatized_for_similarity.csv')
stemmed_df = pd.read_csv('stemmed_for_similarity.csv')

# The first row of each dataset is the query text that every row is compared against.
sample_text_lemmatized, sample_text_stemmed = (
    frame['content'].iloc[0] for frame in (lemmatized_df, stemmed_df)
)

def get_average_word2vec(text, model):
    """Return the mean embedding of the whitespace-separated tokens in *text*.

    Args:
        text: Document to embed; it is tokenized with ``str.split()``.
            Non-string values (e.g. the float ``NaN`` that pandas produces
            for an empty CSV cell, or ``None``) are treated as an empty
            document instead of raising ``AttributeError``.
        model: Object exposing ``wv`` (supporting ``in`` and lookup by token)
            and ``vector_size``. Callers in this file pass a model returned
            by ``Word2Vec.load``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``, or a zero vector of length ``model.vector_size`` when
        no token is found (including empty or non-string input).
    """
    # Missing cells arrive as float NaN, which has no .split(); embed them as "no tokens".
    if not isinstance(text, str):
        return np.zeros(model.vector_size)

    wv = model.wv
    vectors = [wv[word] for word in text.split() if word in wv]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

# Saved Word2Vec model files to evaluate. The file names follow the pattern
# <prep>_model_<arch>_window<W>_dim<D>.model; every combination is listed,
# stemmed models first, then lemmatized models.
model_paths = [
    f'{prep}_model_{arch}_window{window}_dim{dim}.model'
    for prep in ('stemmed', 'lemmatized')
    for arch in ('cbow', 'skipgram')
    for window in (2, 4)
    for dim in (100, 300)
]

# One row per (model, top-5 document) pair; written to CSV after all models are processed.
results = []

for model_path in model_paths:
    model = Word2Vec.load(model_path)

    # Pair each model with the dataset named by its file-name prefix.
    if model_path.startswith('lemmatized'):
        df = lemmatized_df
        sample_text = sample_text_lemmatized
    else:
        df = stemmed_df
        sample_text = sample_text_stemmed

    sample_vec = get_average_word2vec(sample_text, model)
    # The query vector is fixed for this model, so its norm is computed once
    # instead of once (or twice) per document.
    sample_norm = np.linalg.norm(sample_vec)

    # Cosine similarity between the query and every row of the dataset.
    # NOTE: the query text is row 0 of the same dataset, so it is scored too.
    similarities = []
    for idx, sentence in enumerate(df['content']):
        sent_vec = get_average_word2vec(sentence, model)
        sent_norm = np.linalg.norm(sent_vec)
        if sample_norm == 0 or sent_norm == 0:
            # Cosine similarity is undefined for a zero vector; score it as 0.0.
            sim = 0.0
        else:
            sim = np.dot(sample_vec, sent_vec) / (sample_norm * sent_norm)
        similarities.append((idx, sim))

    # Highest similarity first; the sort is stable, so ties keep dataset order.
    similarities.sort(key=lambda x: x[1], reverse=True)
    top_5 = similarities[:5]

    for idx, sim in top_5:
        results.append({
            'Model': model_path,
            'Document_ID': f'doc{idx+1}',  # 1-based document label
            'Similarity_Score': round(sim, 4),
            'Text_Snippet': df['content'].iloc[idx][:100]  # limit to the first 100 characters
        })

# Build the DataFrame and save it to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('benzerlik_sonuclari_tum_modeller.csv', index=False)

print("Tüm modeller için benzerlik sonuçları 'benzerlik_sonuclari_tum_modeller.csv' dosyasına kaydedildi.")


Tüm modeller için benzerlik sonuçları 'benzerlik_sonuclari_tum_modeller.csv' dosyasına kaydedildi.
