In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Download the 'nehaprabhavalkar/indian-food-101' dataset via kagglehub and keep the
# call's return value (presumably the local download location — confirm against the
# kagglehub docs).
# NOTE(review): the CSV is later read from a hard-coded "/kaggle/input/..." path and
# this variable is not referenced by the loading cell — verify which source is intended.
nehaprabhavalkar_indian_food_101_path = kagglehub.dataset_download('nehaprabhavalkar/indian-food-101')

# Progress marker so the reader can see the download step finished.
print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input mount, one full path per line.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

# Read the Indian Food 101 CSV from the Kaggle input mount into `df`,
# then preview the first ten rows as the cell output.
df = pd.read_csv("/kaggle/input/indian-food-101/indian_food.csv")
df.head(n=10)

In [None]:
# Print pandas' summary of `df` (columns, dtypes, non-null counts) to spot missing data early.
df.info()

In [None]:
# Summary statistics of the numeric columns. Left as the cell's final expression
# (instead of print) so the notebook renders it as a formatted table.
df.describe()

In [None]:
# Peek at the last five rows to check the end of the file parsed cleanly.
df.tail(5)

In [None]:
# Number of rows that are exact duplicates of an earlier row.
# A single count answers "are there duplicates?" directly; the raw per-row
# boolean mask is truncated by the notebook display and cannot be read at a glance.
df.duplicated().sum()

In [None]:
# Cell-by-cell boolean mask: True wherever a value is missing.
df.isna()

In [None]:
# Missing-value count per column.
df.isna().sum()

In [None]:
# Rows whose region is missing.
# pd.read_csv converts empty fields and literal "null"/"NULL" strings to NaN by
# default, so comparing against the string 'null' can never match; test for NaN instead.
df[df['region'].isna()]

In [None]:
# Show only the numeric columns of the frame.
df.select_dtypes(include=['number'])

In [None]:
# Rows where prep_time carries the -1 placeholder value.
df.loc[df['prep_time'].eq(-1)]

In [None]:
# Treat the -1 placeholder in numeric columns as missing.
# np.nan keeps those columns numeric (float64); pd.NA would turn plain int columns
# into object dtype. Assigning the result back (rather than inplace=True) keeps the
# cell safe to re-run: a second run finds no -1 values and changes nothing.
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].replace(-1, np.nan)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Missing ingredient lists become empty strings so the string operations below are safe.
df['ingredients'] = df['ingredients'].fillna('')

# Count the non-empty, comma-separated entries. ''.split(',') yields [''], so a plain
# len() would report one ingredient for a dish with no ingredients at all.
df['num_ingredients'] = df['ingredients'].apply(
    lambda text: sum(1 for item in text.split(',') if item.strip())
)

# Integer-encode the categorical columns in place (the modelling cells read these
# column names). One encoder per column is kept in `encoders` so the original labels
# can be recovered with encoders[col].inverse_transform(...); a single shared encoder
# would only remember the mapping of the last column it was fitted on.
# NOTE: this cell overwrites its inputs, so re-running it without reloading `df`
# re-encodes the already-encoded integers.
encoders = {}
for col in ['flavor_profile', 'course', 'region', 'diet']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le


In [None]:
# Rows with a known prep_time are used for training; rows without one are to be predicted.
train_prep = df.loc[df['prep_time'].notnull()]
test_prep = df.loc[df['prep_time'].isnull()]

# Same partition, keyed on cook_time.
train_cook = df.loc[df['cook_time'].notnull()]
test_cook = df.loc[df['cook_time'].isnull()]


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Predictors shared by the prep_time and cook_time imputation models.
features = ['course', 'region', 'flavor_profile', 'diet', 'num_ingredients']

X_train = train_prep[features]
# Cast explicitly: the target column may be object dtype after missing values were introduced.
y_train = train_prep['prep_time'].astype(float)

# Fixed seed so Restart & Run All reproduces the same imputed values.
model_prep = RandomForestRegressor(random_state=42)
model_prep.fit(X_train, y_train)

# Predict missing values
X_test = test_prep[features]
# Write back through the index of the rows that were actually predicted, not a freshly
# computed isna() mask: after a first run the mask is empty and the assignment would
# fail on a length mismatch. Skip entirely when nothing is missing, because predict()
# rejects an empty input.
if not X_test.empty:
    df.loc[X_test.index, 'prep_time'] = model_prep.predict(X_test).round()


In [None]:
# Impute missing cook_time values with a random forest trained on the rows where it is known.
X_train = train_cook[features]
# Cast explicitly: the target column may be object dtype after missing values were introduced.
y_train = train_cook['cook_time'].astype(float)

# Fixed seed so Restart & Run All reproduces the same imputed values.
model_cook = RandomForestRegressor(random_state=42)
model_cook.fit(X_train, y_train)

X_test = test_cook[features]
# Target the predicted rows by index so the cell can be re-run safely, and skip the
# prediction when there is nothing to fill (predict() rejects an empty input).
if not X_test.empty:
    df.loc[X_test.index, 'cook_time'] = model_cook.predict(X_test).round()


In [None]:
# Both duration columns are fully populated at this point, so store them as integers.
for time_col in ('prep_time', 'cook_time'):
    df[time_col] = df[time_col].astype(int)

# Persist the completed table to the working directory.
# NOTE(review): flavor_profile / course / region / diet were label-encoded earlier,
# so the file contains integer codes for those columns — confirm this is intended.
df.to_csv("indian_food_filled.csv", index=False)


In [None]:
# Post-imputation overview. A notebook cell only renders its final expression, so the
# intermediate results are printed explicitly; otherwise they are computed and discarded.
print(df.describe())
df.info()  # writes its report to stdout itself
print(df['course'].value_counts())
print(df['region'].value_counts())
df['diet'].value_counts()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One histogram (with a KDE overlay) per duration column, each shown as its own figure.
for column, heading in (
    ('prep_time', "Prep Time Distribution"),
    ('cook_time', "Cook Time Distribution"),
):
    sns.histplot(df[column], kde=True)
    plt.title(heading)
    plt.show()


In [None]:
# Average prep/cook time per course, longest cooking first.
# `course` holds the integer codes produced by the label-encoding step.
(
    df.groupby('course')[['prep_time', 'cook_time']]
    .mean()
    .sort_values(by='cook_time', ascending=False)
)


In [None]:
# Average prep/cook time per region, longest preparation first.
# `region` holds the integer codes produced by the label-encoding step.
(
    df.groupby('region')[['prep_time', 'cook_time']]
    .mean()
    .sort_values(by='prep_time', ascending=False)
)


In [None]:
# Average prep/cook time per diet category (integer-coded by the label-encoding step).
df.groupby('diet')[['prep_time', 'cook_time']].agg('mean')


In [None]:
# Pairwise correlations between the two durations and the ingredient count,
# drawn as an annotated heatmap.
corr_matrix = df[['prep_time', 'cook_time', 'num_ingredients']].corr()
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title("Feature Correlation")
plt.show()


In [None]:
# Dishes with unusually long preparation (> 100) are printed; dishes with unusually
# long cooking (> 150) are shown as the cell output.
print(df.loc[df['prep_time'] > 100, ['name', 'prep_time', 'course']])
df.loc[df['cook_time'] > 150, ['name', 'cook_time', 'course']]


In [None]:
from collections import Counter

# Lower-case each ingredient string and split it on commas.
ingredient_list = df['ingredients'].dropna().str.lower().str.split(',')

# Flatten to one cleaned token per ingredient. Blank tokens are dropped: rows whose
# ingredients were filled with '' (and any trailing commas) would otherwise add an
# empty-string "ingredient" to the tally.
all_ingredients = [
    cleaned
    for sublist in ingredient_list
    for cleaned in (item.strip() for item in sublist)
    if cleaned
]

ingredient_counts = Counter(all_ingredients)
# Fifteen most frequent ingredients as (ingredient, count) pairs.
ingredient_counts.most_common(15)


In [None]:
# For each region group, take the dish names, count occurrences of each name, and keep
# the top entry of that count.
# NOTE(review): the lambda returns a one-row Series (name -> count) rather than a scalar,
# so what agg() ends up showing (the count, or an error) may depend on the pandas
# version — verify the output. If the goal is the most frequent dish *name* per region,
# returning the index label would express that directly — TODO confirm intent.
# NOTE(review): `region` was label-encoded earlier, so groups are keyed by integer codes.
df.groupby('region')['name'].agg(lambda x: x.value_counts().head(1))
