In [2]:
# Gerekli kütüphaneleri yükleyelim
import pandas as pd

# CSV dosyasını tekrar yükleyelim
stemmed_df = pd.read_csv('stemmed_data.csv')

# Dosyanın ilk 5 satırına bakalım
stemmed_df.head()


Unnamed: 0,text_stemmed_sentences,ingredient_stemmed_sentences,name_stemmed_sentences
0,put cottag chees wide bowl add egg sugar flour...,chicken egg piec soft cottag chees g wheat flo...,breakfast lazi
1,rins buckwheat pour cup boil water salt cover ...,buckwheat cereal cup chop parsley tast chop ci...,breek breakfast
2,grate carrot green appl middl zest juic halv c...,carrot piec appl piec orang piec raisin g hone...,childhood breakfast
3,mix egg piec loaf egg mixtur veget oil side,baton piec milk tablespoon chicken egg piec sa...,french crouton breakfast
4,boil egg chees tast,green salad bundl chicken egg piec tomato piec...,low breakfast


In [8]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# CSV dosyasını yükle
stemmed_data = pd.read_csv('stemmed_data.csv')

# Stemmer ve stopword listesi
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Ön işleme fonksiyonu
def preprocess_sentence(sentence):
    tokens = sentence.split()  # Boşluklara göre kelimeleri ayır
    filtered_tokens = [token.lower() for token in tokens if token.isalpha() and token.lower() not in stop_words]
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    return stemmed_tokens

# Her satırı işle
tokenized_corpus_stemmed = []

for sentence in stemmed_data['text_stemmed_sentences']:
    try:
        stemmed_tokens = preprocess_sentence(sentence)
        tokenized_corpus_stemmed.append(stemmed_tokens)
    except Exception as e:
        print(f"Hata oluştu: {e}")
        tokenized_corpus_stemmed.append([])

# İlk 5 sonucu yazdır
for i in range(5):
    print(f"Cümle {i+1} - Stemmed: {tokenized_corpus_stemmed[i]}")




Cümle 1 - Stemmed: ['put', 'cottag', 'chee', 'wide', 'bowl', 'add', 'egg', 'sugar', 'flour', 'fork', 'mix', 'homogen', 'mass', 'turn', 'sticki', 'add', 'flour', 'half', 'tabl', 'small', 'amount', 'flour', 'lay', 'cottag', 'two', 'equal', 'part', 'roll', 'sausag', 'thick', 'sausag', 'small', 'ident', 'piec', 'sharp', 'desir', 'slightli', 'add', 'piec', 'give', 'round', 'small', 'pan', 'bring', 'water', 'lower', 'dumpl', 'boil', 'water', 'one', 'stir', 'slightli', 'slot', 'spoon', 'dumpl', 'come', 'surfac', 'plu', 'anoth', 'finish', 'dumpl', 'pan', 'plate', 'pour', 'jam', 'exampl', 'serv', 'hot', 'warm']
Cümle 2 - Stemmed: ['rin', 'buckwheat', 'pour', 'cup', 'boil', 'water', 'salt', 'cover', 'cover', 'minut', 'buckwheat', 'eaten', 'lose', 'nutrit', 'better', 'requir', 'amount', 'wound', 'buckwheat', 'portion', 'plate', 'season', 'oliv', 'oil', 'soy', 'sauc', 'lemon', 'juic', 'chop', 'lemon', 'chop', 'green', 'chop', 'veget', 'bulgarian', 'pepper', 'carrot', 'pumpkin', 'radish', 'green', 

In [9]:
# CSV dosyasındaki ilk birkaç satırı kontrol edelim
print(stemmed_data.head())

                              text_stemmed_sentences  \
0  put cottag chees wide bowl add egg sugar flour...   
1  rins buckwheat pour cup boil water salt cover ...   
2  grate carrot green appl middl zest juic halv c...   
3        mix egg piec loaf egg mixtur veget oil side   
4                                boil egg chees tast   

                        ingredient_stemmed_sentences    name_stemmed_sentences  
0  chicken egg piec soft cottag chees g wheat flo...            breakfast lazi  
1  buckwheat cereal cup chop parsley tast chop ci...           breek breakfast  
2  carrot piec appl piec orang piec raisin g hone...       childhood breakfast  
3  baton piec milk tablespoon chicken egg piec sa...  french crouton breakfast  
4  green salad bundl chicken egg piec tomato piec...             low breakfast  


In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Stemmed edilmiş metinlerin listesi, her bir cümleyi tokenlerden tekrar metne çeviriyoruz
stemmed_texts = [' '.join(tokens) for tokens in tokenized_corpus_stemmed]

# TF-IDF vektörleştirici başlatıyoruz
vectorizer = TfidfVectorizer()

# TF-IDF matrisini oluşturuyoruz (sparse matrix)
tfidf_matrix = vectorizer.fit_transform(stemmed_texts)

# Sparse matrisi pandas DataFrame'e dönüştürme
tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns=vectorizer.get_feature_names_out())

# İlk birkaç satırı gösterelim (ilk 5 cümleyi)
print(tfidf_df.head())

   abandon  abc  abdomen  abdomin  abelu  abil  abkhaz  abkhazian  abl  abluk  \
0        0    0        0        0      0     0       0          0    0      0   
1        0    0        0        0      0     0       0          0    0      0   
2        0    0        0        0      0     0       0          0    0      0   
3        0    0        0        0      0     0       0          0    0      0   
4        0    0        0        0      0     0       0          0    0      0   

   ...  zukata  zuko  zvezda  zwill  zyren  ºc  ñora  λάχανο  ρύζι  ᵒs  
0  ...       0     0       0      0      0   0     0       0     0   0  
1  ...       0     0       0      0      0   0     0       0     0   0  
2  ...       0     0       0      0      0   0     0       0     0   0  
3  ...       0     0       0      0      0   0     0       0     0   0  
4  ...       0     0       0      0      0   0     0       0     0   0  

[5 rows x 11475 columns]


In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Ön işlenmiş stemmed token listelerini tekrar metne çeviriyoruz
stemmed_texts = [' '.join(tokens) for tokens in tokenized_corpus_stemmed]

# İlk 3 stemlenmiş metni yazdıralım
stemmed_texts[:3]


['put cottag chee wide bowl add egg sugar flour fork mix homogen mass turn sticki add flour half tabl small amount flour lay cottag two equal part roll sausag thick sausag small ident piec sharp desir slightli add piec give round small pan bring water lower dumpl boil water one stir slightli slot spoon dumpl come surfac plu anoth finish dumpl pan plate pour jam exampl serv hot warm',
 'rin buckwheat pour cup boil water salt cover cover minut buckwheat eaten lose nutrit better requir amount wound buckwheat portion plate season oliv oil soy sauc lemon juic chop lemon chop green chop veget bulgarian pepper carrot pumpkin radish green cocktail',
 'grate carrot green appl middl zest juic halv cut second small appl carrot orang raisin nut favorit season juic honey add cinnamon']

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Token listelerini tekrar düz metne çeviriyoruz (önceden yapılmadıysa)
stemmed_texts = [' '.join(tokens) for tokens in tokenized_corpus_stemmed]

# TF-IDF vektörizerı başlatıyoruz
vectorizer = TfidfVectorizer()

# TF-IDF matrisini oluşturuyoruz
tfidf_matrix = vectorizer.fit_transform(stemmed_texts)

# TF-IDF işleminde kullanılan tüm kelimelerin eşsiz bir listesini alıyoruz
feature_names = vectorizer.get_feature_names_out()

# TF-IDF matrisini pandas DataFrame'e çeviriyoruz
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# İlk birkaç satırı gösteriyoruz
print(tfidf_df.head())


   abandon  abc  abdomen  abdomin  abelu  abil  abkhaz  abkhazian  abl  abluk  \
0      0.0  0.0      0.0      0.0    0.0   0.0     0.0        0.0  0.0    0.0   
1      0.0  0.0      0.0      0.0    0.0   0.0     0.0        0.0  0.0    0.0   
2      0.0  0.0      0.0      0.0    0.0   0.0     0.0        0.0  0.0    0.0   
3      0.0  0.0      0.0      0.0    0.0   0.0     0.0        0.0  0.0    0.0   
4      0.0  0.0      0.0      0.0    0.0   0.0     0.0        0.0  0.0    0.0   

   ...  zukata  zuko  zvezda  zwill  zyren   ºc  ñora  λάχανο  ρύζι   ᵒs  
0  ...     0.0   0.0     0.0    0.0    0.0  0.0   0.0     0.0   0.0  0.0  
1  ...     0.0   0.0     0.0    0.0    0.0  0.0   0.0     0.0   0.0  0.0  
2  ...     0.0   0.0     0.0    0.0    0.0  0.0   0.0     0.0   0.0  0.0  
3  ...     0.0   0.0     0.0    0.0    0.0  0.0   0.0     0.0   0.0  0.0  
4  ...     0.0   0.0     0.0    0.0    0.0  0.0   0.0     0.0   0.0  0.0  

[5 rows x 11475 columns]


In [14]:
# İlk cümle için TF-IDF skorlarını alıyoruz
first_sentence_vector_stemmed = tfidf_df.iloc[0]

# Skorlara göre büyükten küçüğe sıralayıp ilk 5 kelimeyi alıyoruz
top_5_words_stemmed = first_sentence_vector_stemmed.sort_values(ascending=False).head(5)

# Sonuçları yazdırıyoruz
print("İlk cümlede en yüksek TF-IDF skoruna sahip 5 kelime (stemmed):")
print(top_5_words_stemmed)


İlk cümlede en yüksek TF-IDF skoruna sahip 5 kelime (stemmed):
dumpl     0.501643
sausag    0.250943
cottag    0.216644
plu       0.203906
flour     0.196647
Name: 0, dtype: float64


In [16]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

target_word = 'potato'

# Kelimenin indeksini güvenli şekilde bulalım
if target_word in feature_names:
    target_index = feature_names.tolist().index(target_word)

    # TF-IDF matrisinden kelimenin sütun vektörünü al (2D formatta)
    target_vector = tfidf_matrix[:, target_index].toarray()

    # Tüm kelimelerin TF-IDF vektörlerini alıyoruz (her sütun bir kelime)
    tfidf_vectors = tfidf_matrix.toarray()

    # Cosine similarity hesapla (kelime vektörü ile tüm kelimeler)
    similarities = cosine_similarity(target_vector.T, tfidf_vectors.T)

    # En yüksek 6 benzerliği al (kendisi dahil)
    similarities = similarities.flatten()
    top_5_indices = similarities.argsort()[-6:][::-1]

    # Sonuçları yazdır
    print(f"'{target_word}' kelimesine en yakın 5 kelime (cosine similarity):")
    for idx in top_5_indices:
        print(f"{feature_names[idx]}: {similarities[idx]:.4f}")

else:
    print(f"'{target_word}' kelimesi TF-IDF kelime listesinde bulunamadı.")


'potato' kelimesine en yakın 5 kelime (cosine similarity):
potato: 1.0000
mash: 0.3086
salt: 0.2579
boil: 0.2569
onion: 0.2541
cut: 0.2487


In [3]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

corpus = [
    "Potatoes are healthy food.",
    "Tomatoes are a kind of fruit.",
    "Potatoes and tomatoes are plants."
]

# Tokenize ve stemleme
tokenized_corpus_stemmed = []
for doc in corpus:
    tokens = word_tokenize(doc.lower())
    stemmed_tokens = [ps.stem(token) for token in tokens if token.isalpha()]  # sadece alfabe karakterleri
    tokenized_corpus_stemmed.append(stemmed_tokens)

print(tokenized_corpus_stemmed)



[['potato', 'are', 'healthi', 'food'], ['tomato', 'are', 'a', 'kind', 'of', 'fruit'], ['potato', 'and', 'tomato', 'are', 'plant']]


In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

stemmed_texts = [' '.join(tokens) for tokens in tokenized_corpus_stemmed]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(stemmed_texts)

feature_names = vectorizer.get_feature_names_out()

df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

df_tfidf.to_csv('tfidf_matrix_stemmed.csv', index=False)

print("TF-IDF matrisi başarıyla 'tfidf_matrix_stemmed.csv' dosyasına kaydedildi!")


TF-IDF matrisi başarıyla 'tfidf_matrix_stemmed.csv' dosyasına kaydedildi!
