In [None]:
    import pandas as pd
    import os
    import zipfile
    from kaggle.api.kaggle_api_extended import KaggleApi
    import ast
    from sklearn.preprocessing import MultiLabelBinarizer
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn.multiclass import OneVsRestClassifier
    from scipy.sparse import csr_matrix
    import numpy as np
    import joblib 
    from sklearn.preprocessing import LabelEncoder

In [7]:
# Authenticate the Kaggle API client and download the single CSV that this
# notebook needs from the RecipeNLG dataset into ../data.
api=KaggleApi()
api.authenticate()

api.dataset_download_file(
    'paultimothymooney/recipenlg',
    file_name='RecipeNLG_dataset.csv',
    path='../data'
)

original_path = '../data/RecipeNLG_dataset.csv'
zip_path = '../data/RecipeNLG_dataset.zip'

# The downloaded file may be a zip archive stored under the .csv name.
# Unpack it only when it really is an archive: renaming and opening a plain
# CSV as a zip would raise BadZipFile and leave the data directory without
# the CSV the following cells read.
if zipfile.is_zipfile(original_path):
    # os.replace also overwrites a stale archive left by an interrupted run.
    os.replace(original_path, zip_path)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall('../data')
    os.remove(zip_path)

# Paths shared with the following cells: raw input and processed output.
input_path='../data/RecipeNLG_dataset.csv'
output_path='../data/train-data.csv'

Dataset URL: https://www.kaggle.com/datasets/paultimothymooney/recipenlg


In [8]:
# Load the raw CSV into a DataFrame, reporting the source path before the
# read and the resulting (rows, columns) shape after it.
print(f'Reading dataset from {input_path}')
data = pd.read_csv(filepath_or_buffer=input_path)
print(f'Dataset imported. Shape: {data.shape}')

Reading dataset from ../data/RecipeNLG_dataset.csv
Dataset imported. Shape: (2231142, 7)


In [9]:
# Clean the raw frame: drop incomplete rows, keep only the recipe title and
# the NER ingredient list (renamed to "ingredients"), then de-duplicate.
print('Cleaning dataset...')
columns_to_drop = ["Unnamed: 0", "ingredients", "directions", "link", "source"]
data = (
    data
    .dropna()
    .drop(columns=columns_to_drop)
    .rename(columns={"NER": "ingredients"})
    # De-duplicate only AFTER the helper columns are gone. While the
    # "Unnamed: 0" column (presumably a per-row id - confirm) is still
    # present no two rows compare equal, so drop_duplicates() removes
    # nothing.
    .drop_duplicates()
)
print(f'Shape: {data.shape}')

Cleaning dataset...
Shape: (2231141, 2)


In [10]:
# Work on the first 10,000 recipes only. Take an explicit copy: a later cell
# assigns into the 'ingredients' column, and assigning into the slice that
# head() returns triggers pandas' SettingWithCopyWarning.
data = data.head(10000).copy()
print(data)

                         title  \
0          No-Bake Nut Cookies   
1        Jewell Ball'S Chicken   
2                  Creamy Corn   
3                Chicken Funny   
4         Reeses Cups(Candy)     
...                        ...   
9995          Pink Fruit Salad   
9996            Peppered Steak   
9997         Chicken Casserole   
9998  Sweet Potatoes Casserole   
9999             7 Layer Salad   

                                            ingredients  
0     ["brown sugar", "milk", "vanilla", "nuts", "bu...  
1     ["beef", "chicken breasts", "cream of mushroom...  
2     ["frozen corn", "cream cheese", "butter", "gar...  
3     ["chicken", "chicken gravy", "cream of mushroo...  
4     ["peanut butter", "graham cracker crumbs", "bu...  
...                                                 ...  
9995  ["cherry pie filling", "condensed milk", "pine...  
9996  ["bell pepper", "onion", "tomatoes", "salt", "...  
9997  ["fryer", "onion", "green pepper", "celery", "...  
9998  ["yam

In [None]:
# Turn the ingredient lists into one indicator column per ingredient and
# write the resulting table to disk.
print('Modifing dataset...')

# Each cell of 'ingredients' holds a list literal as text; parse it into a
# real Python list.
data['ingredients'] = data['ingredients'].map(ast.literal_eval)

# Fit the binarizer on the parsed lists; the result is kept sparse.
mlb = MultiLabelBinarizer(sparse_output=True)
ingredient_matrix = mlb.fit_transform(data['ingredients'])

# Wrap the sparse matrix in a DataFrame aligned with `data`'s index, using
# the fitted ingredient names as column labels.
ingredient_df = pd.DataFrame.sparse.from_spmatrix(
    ingredient_matrix, index=data.index, columns=mlb.classes_
)

# Attach the indicator columns, then discard the original list column.
data = pd.concat([data, ingredient_df], axis=1).drop(columns=['ingredients'])
print(f'Final shape: {data.shape}')

print('Exporting new dataset...')
data.to_csv(output_path)
print(f'New dataset exported to {output_path}')


Modifing dataset...
Final shape: (10000, 4182)
Exporting new dataset...
New dataset exported to ../data/train-data.csv


In [None]:
# Keep only the 100 most frequent recipe titles.
top_recipes = data['title'].value_counts().nlargest(100).index
# .copy() makes the filtered frame independent, so adding the 'label' column
# below does not write into a slice of the previous frame
# (SettingWithCopyWarning).
data = data[data['title'].isin(top_recipes)].copy()
print(f"Liczba unikalnych tytułów przepisów: {data['title'].nunique()}")

# Encode the recipe titles as integer class labels.
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['title'])

# X - ingredient indicator columns, y - encoded recipe labels.
ingredient_columns = data.columns.difference(['title', 'label'])
X = data[ingredient_columns]
y = data['label']

# Split into training and test sets, stratified by label.
# NOTE(review): stratification needs at least two rows per title - confirm
# that every one of the selected titles satisfies this.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train a simple one-vs-rest logistic regression model.
model = LogisticRegression(max_iter=100)
classifier = OneVsRestClassifier(model)
classifier.fit(X_train, y_train)

In [None]:
# Save the model: persist the fitted classifier and the label encoder (which
# holds the title <-> integer label mapping) with joblib.
joblib.dump(classifier, 'recipe_model.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']

In [None]:
# Evaluation: score the fitted classifier on the held-out test split.
from sklearn.metrics import accuracy_score, classification_report

y_pred = classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Pass the complete label set explicitly. Without `labels`, the report
# derives the classes from y_test/y_pred only and raises ValueError when
# that count differs from len(target_names), i.e. whenever some title is
# absent from both the test split and the predictions.
all_labels = np.arange(len(label_encoder.classes_))
print(classification_report(
    y_test,
    y_pred,
    labels=all_labels,
    target_names=label_encoder.classes_,
))