In [34]:
import nltk
import pandas as pd
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import csv

In [16]:
# CSV dosyasını yükleyin
df = pd.read_csv('food.csv', encoding='utf-8')
df.head() 

Unnamed: 0,name,text,ingredient
0,Breakfast for the lazy,"Put cottage cheese in a wide bowl, add an egg,...","Chicken egg: 1 piece, soft cottage cheese: 200..."
1,Breek breakfast,"Rinse buckwheat, pour 2 cups of boiling water,...","Buckwheat cereal: 1 cup, chopped parsley: to t..."
2,Childhood breakfast,Grate the carrots and green apple on the middl...,"Carrots: 1 piece, apple: 1 piece, oranges: 1 p..."
3,French croutons for breakfast,Mix the egg with milk.Salt.Dip the pieces of t...,"Baton: 3 pieces, milk: 2 tablespoons, chicken ..."
4,Low -calorie breakfast,Boil the egg boiled.Cut the cheese and tomatoe...,"Green salad: 0.1 bundles, chicken eggs: 1 piec..."


In [20]:
# Tüm sütunlarda cümlelere ayırma
df['name_sentences'] = df['name'].apply(sent_tokenize)
df['text_sentences'] = df['text'].apply(sent_tokenize)
df['ingredient_sentences'] = df['ingredient'].apply(sent_tokenize)

# İlk 3 satırı kontrol edelim
print(df[['name_sentences', 'text_sentences', 'ingredient_sentences']].head(3))

             name_sentences  \
0  [Breakfast for the lazy]   
1         [Breek breakfast]   
2     [Childhood breakfast]   

                                      text_sentences  \
0  [Put cottage cheese in a wide bowl, add an egg...   
1  [Rinse buckwheat, pour 2 cups of boiling water...   
2  [Grate the carrots and green apple on the midd...   

                                ingredient_sentences  
0  [Chicken egg: 1 piece, soft cottage cheese: 20...  
1  [Buckwheat cereal: 1 cup, chopped parsley: to ...  
2  [Carrots: 1 piece, apple: 1 piece, oranges: 1 ...  


In [37]:
# Stopwords ve noktalama temizliği için hazırlık
stop_words = set(stopwords.words('english'))
table = str.maketrans('', '', string.punctuation)

# Metin temizleme fonksiyonu
def clean_text(text):
    try:
        tokens = word_tokenize(str(text).lower())
        tokens = [word.translate(table) for word in tokens if word.isalpha()]
        tokens = [word for word in tokens if word not in stop_words]
        return ' '.join(tokens)
    except:
        return text  # hata durumunda orijinal metni döndür

# Temizlenecek sütunlar
target_columns = ['text', 'ingredient', 'name']

# Yalnızca bu sütunlara temizleme işlemini uygula
for col in target_columns:
    if col in df.columns and df[col].dtype == 'object':
        df[col] = df[col].apply(clean_text)

# İlk 5 satırı kontrol et
print(df[['name', 'text', 'ingredient']].head(5))


                        name  \
0             breakfast lazy   
1            breek breakfast   
2        childhood breakfast   
3  french croutons breakfast   
4              low breakfast   

                                                text  \
0  put cottage cheese wide bowl add egg sugar flo...   
1  rinse buckwheat pour cups boiling water salt c...   
2  grate carrots green apple middle zest juice ha...   
3  mix egg pieces loaf egg mixture vegetable oil ...   
4                              boil egg cheese taste   

                                          ingredient  
0  chicken egg piece soft cottage cheese g wheat ...  
1  buckwheat cereal cup chopped parsley taste cho...  
2  carrots piece apple piece oranges piece raisin...  
3  baton pieces milk tablespoons chicken egg piec...  
4  green salad bundles chicken eggs piece tomatoe...  


In [38]:
# Lemmatizer ve Stemmer'ı başlat
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Lemmatization fonksiyonu
def lemmatize_text(text):
    try:
        tokens = word_tokenize(str(text).lower())
        lemmatized = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha()]
        return ' '.join(lemmatized)
    except:
        return text

# Stemming fonksiyonu
def stem_text(text):
    try:
        tokens = word_tokenize(str(text).lower())
        stemmed = [stemmer.stem(word) for word in tokens if word.isalpha()]
        return ' '.join(stemmed)
    except:
        return text

# Hedef sütunlar
target_columns = ['text', 'ingredient', 'name']

# Her sütun için lemmatization ve stemming uygula
for col in target_columns:
    if col in df.columns and df[col].dtype == 'object':
        df[f'{col}_lemmatized'] = df[col].apply(lemmatize_text)
        df[f'{col}_stemmed'] = df[col].apply(stem_text)

# İlk 5 satırı yazdır
print(df[[f'{col}_lemmatized' for col in target_columns] +
         [f'{col}_stemmed' for col in target_columns]].head(5))

                                     text_lemmatized  \
0  put cottage cheese wide bowl add egg sugar flo...   
1  rinse buckwheat pour cup boiling water salt co...   
2  grate carrot green apple middle zest juice hal...   
3  mix egg piece loaf egg mixture vegetable oil side   
4                              boil egg cheese taste   

                               ingredient_lemmatized  \
0  chicken egg piece soft cottage cheese g wheat ...   
1  buckwheat cereal cup chopped parsley taste cho...   
2  carrot piece apple piece orange piece raisin g...   
3  baton piece milk tablespoon chicken egg piece ...   
4  green salad bundle chicken egg piece tomato pi...   

            name_lemmatized  \
0            breakfast lazy   
1           breek breakfast   
2       childhood breakfast   
3  french crouton breakfast   
4             low breakfast   

                                        text_stemmed  \
0  put cottag chees wide bowl add egg sugar flour...   
1  rins buckwheat pour cup 

In [41]:
# Hedef sütunlar
target_columns = ['name', 'text', 'ingredient']

for col in target_columns:
    if col in df.columns and df[col].dtype == 'object':
        # Her satıra fonksiyonu uygula, sonuç tuple (lemmatized, stemmed)
        # İlk eleman lemmatized token listesi, ikinci eleman stemmed token listesi
        df[f'{col}_lemmatized_tokens'] = df[col].apply(lambda x: preprocess_sentence(str(x))[0])
        df[f'{col}_stemmed_tokens'] = df[col].apply(lambda x: preprocess_sentence(str(x))[1])

# İlk 5 satırı gösterelim
print(df[[f'{col}_lemmatized_tokens' for col in target_columns] + [f'{col}_stemmed_tokens' for col in target_columns]].head(5))

         name_lemmatized_tokens  \
0             [breakfast, lazy]   
1            [breek, breakfast]   
2        [childhood, breakfast]   
3  [french, crouton, breakfast]   
4              [low, breakfast]   

                              text_lemmatized_tokens  \
0  [put, cottage, cheese, wide, bowl, add, egg, s...   
1  [rinse, buckwheat, pour, cup, boiling, water, ...   
2  [grate, carrot, green, apple, middle, zest, ju...   
3  [mix, egg, piece, loaf, egg, mixture, vegetabl...   
4                         [boil, egg, cheese, taste]   

                        ingredient_lemmatized_tokens  \
0  [chicken, egg, piece, soft, cottage, cheese, g...   
1  [buckwheat, cereal, cup, chopped, parsley, tas...   
2  [carrot, piece, apple, piece, orange, piece, r...   
3  [baton, piece, milk, tablespoon, chicken, egg,...   
4  [green, salad, bundle, chicken, egg, piece, to...   

            name_stemmed_tokens  \
0             [breakfast, lazi]   
1            [breek, breakfast]   
2        [

In [42]:
# Örnek metin listesini cümle cümle işleyecek fonksiyon
def preprocess_corpus(corpus):
    tokenized_corpus_lemmatized = []
    tokenized_corpus_stemmed = []

    for text in corpus:
        sentences = sent_tokenize(text)  # Metni cümlelere böl
        for sentence in sentences:
            tokens = word_tokenize(sentence)
            filtered_tokens = [token.lower() for token in tokens if token.isalpha() and token.lower() not in stop_words]
            
            lemmatized = [lemmatizer.lemmatize(token) for token in filtered_tokens]
            stemmed = [stemmer.stem(token) for token in filtered_tokens]

            tokenized_corpus_lemmatized.append(lemmatized)
            tokenized_corpus_stemmed.append(stemmed)

    return tokenized_corpus_lemmatized, tokenized_corpus_stemmed

# Örnek kullanım:
texts = df['text'].dropna().tolist()  # Örnek olarak 'text' sütununu alıyoruz
lem, stem = preprocess_corpus(texts)

print(lem[:3])
print(stem[:3])

[['put', 'cottage', 'cheese', 'wide', 'bowl', 'add', 'egg', 'sugar', 'flour', 'fork', 'mix', 'homogeneous', 'mass', 'turned', 'sticky', 'add', 'flour', 'half', 'table', 'small', 'amount', 'flour', 'lay', 'cottage', 'two', 'equal', 'part', 'roll', 'sausage', 'thickness', 'sausage', 'small', 'identical', 'piece', 'sharp', 'desired', 'slightly', 'add', 'piece', 'give', 'rounded', 'small', 'pan', 'bring', 'water', 'lower', 'dumpling', 'boiling', 'water', 'one', 'stirring', 'slightly', 'slotted', 'spoon', 'dumpling', 'come', 'surface', 'plus', 'another', 'finished', 'dumpling', 'pan', 'plate', 'pour', 'jam', 'example', 'serve', 'hot', 'warm'], ['rinse', 'buckwheat', 'pour', 'cup', 'boiling', 'water', 'salt', 'cover', 'cover', 'minute', 'buckwheat', 'eaten', 'lose', 'nutritional', 'better', 'required', 'amount', 'wounded', 'buckwheat', 'portioned', 'plate', 'seasoned', 'olive', 'oil', 'soy', 'sauce', 'lemon', 'juice', 'chopped', 'lemon', 'chopped', 'green', 'chopped', 'vegetable', 'bulgarian

In [43]:
# Önce gerekli kütüphaneler ve fonksiyonun tanımı olmalı (daha önce verdiğin preprocess_sentence burada kullanılacak)

from nltk.tokenize import sent_tokenize

# Boş sütunlar için hazırlık
for col in target_columns:
    df[f'{col}_lemmatized_sentences'] = [[] for _ in range(len(df))]
    df[f'{col}_stemmed_sentences'] = [[] for _ in range(len(df))]

for col in target_columns:
    if col in df.columns and df[col].dtype == 'object':
        for idx, text in df[col].items():
            sentences = sent_tokenize(str(text))  # Satırdaki metni cümlelere ayır
            lemmatized_sentences = []
            stemmed_sentences = []
            
            for sentence in sentences:
                lemmatized_tokens, stemmed_tokens = preprocess_sentence(sentence)
                lemmatized_sentences.append(lemmatized_tokens)
                stemmed_sentences.append(stemmed_tokens)
            
            df.at[idx, f'{col}_lemmatized_sentences'] = lemmatized_sentences
            df.at[idx, f'{col}_stemmed_sentences'] = stemmed_sentences

# Örnek çıktı kontrolü
print(df[[f'{col}_lemmatized_sentences' for col in target_columns] + [f'{col}_stemmed_sentences' for col in target_columns]].head(3))


  name_lemmatized_sentences  \
0       [[breakfast, lazy]]   
1      [[breek, breakfast]]   
2  [[childhood, breakfast]]   

                           text_lemmatized_sentences  \
0  [[put, cottage, cheese, wide, bowl, add, egg, ...   
1  [[rinse, buckwheat, pour, cup, boiling, water,...   
2  [[grate, carrot, green, apple, middle, zest, j...   

                     ingredient_lemmatized_sentences  \
0  [[chicken, egg, piece, soft, cottage, cheese, ...   
1  [[buckwheat, cereal, cup, chopped, parsley, ta...   
2  [[carrot, piece, apple, piece, orange, piece, ...   

     name_stemmed_sentences  \
0       [[breakfast, lazi]]   
1      [[breek, breakfast]]   
2  [[childhood, breakfast]]   

                              text_stemmed_sentences  \
0  [[put, cottag, chees, wide, bowl, add, egg, su...   
1  [[rins, buckwheat, pour, cup, boil, water, sal...   
2  [[grate, carrot, green, appl, middl, zest, jui...   

                        ingredient_stemmed_sentences  
0  [[chicken, egg, p

In [48]:
import csv

target_columns = ['text_lemmatized_sentences', 'ingredient_lemmatized_sentences', 'name_lemmatized_sentences']

with open("lemmatized_sentences_combined.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    
    # Başlık satırı (isteğe bağlı)
    writer.writerow(target_columns)
    
    for idx in range(len(df)):
        row_data = []
        for col in target_columns:
            sentences = df.at[idx, col]  # Liste: cümle token listeleri
            # Her cümleyi token listesinden stringe çevir ve '||' ile cümleleri ayır
            sentences_str = ' || '.join([' '.join(tokens) for tokens in sentences])
            row_data.append(sentences_str)
        writer.writerow(row_data)



In [49]:
import csv

target_columns = ['text_stemmed_sentences', 'ingredient_stemmed_sentences', 'name_stemmed_sentences']

with open("stemmed_sentences_combined.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    
    # Başlık satırı (isteğe bağlı)
    writer.writerow(target_columns)
    
    for idx in range(len(df)):
        row_data = []
        for col in target_columns:
            sentences = df.at[idx, col]  # Liste: cümle token listeleri
            # Her cümleyi token listesinden stringe çevir ve '||' ile cümleleri ayır
            sentences_str = ' || '.join([' '.join(tokens) for tokens in sentences])
            row_data.append(sentences_str)
        writer.writerow(row_data)

