In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the onlinefoods CSV from the Kaggle input directory and preview the first rows.
data_path = r'/kaggle/input/online-food-dataset/onlinefoods.csv'
df = pd.read_csv(data_path)
df.head()

# Preprocessing

In [None]:
# Count the missing entries in every column.
null_counts = df.isnull().sum()
null_counts

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); with default
# arguments pandas reports these for the numeric columns only.
df.describe()

# Visualization

In [None]:
# Histogram of customer age with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(data=df, x='Age', kde=True, color='skyblue', ax=ax)
ax.set_title('Distribution of Age')

In [None]:
# Histogram of family size with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(data=df, x='Family size', kde=True, color='lightgreen', ax=ax)
ax.set_title('Distribution of Family Size')

# Tighten the layout before rendering so labels are not clipped.
fig.tight_layout()
plt.show()

In [None]:
# Number of rows per gender category.
# NOTE(review): newer seaborn releases warn when `palette` is passed without
# `hue` -- confirm against the installed version before changing the call.
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=df, x='Gender', palette='pastel', ax=ax)
ax.set_title('Gender Distribution')

In [None]:
# Scatter of customer coordinates, coloured by the 'Output' column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='longitude',
    y='latitude',
    hue='Output',
    palette='viridis',
    ax=ax,
)
ax.set_title('Geographical Distribution')
plt.show()

In [None]:
# Number of rows per feedback category, drawn on a default-sized figure.
# NOTE(review): newer seaborn releases warn when `palette` is passed without
# `hue` -- confirm against the installed version before changing the call.
ax = sns.countplot(data=df, x='Feedback', palette='viridis')
ax.set_title('Feedback Distribution')

# Feature Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns that are converted to integer codes before modelling.
categorical_cols = [
    'Gender',
    'Marital Status',
    'Occupation',
    'Monthly Income',
    'Educational Qualifications',
    'Output',
    'Feedback',
    'Unnamed: 12',
]

# Fit a separate encoder for each column and keep it. A single shared encoder
# is refit on every call and therefore only remembers the last column, which
# makes it impossible to map codes back to labels afterwards. With one encoder
# per column, `encoders[col].classes_` and `encoders[col].inverse_transform(...)`
# recover the original labels for any encoded column.
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
# After the loop `le` is the encoder fitted on 'Unnamed: 12', the last column.

# Model Training

In [None]:
# import packages
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Split the frame into predictors and the column to predict.
x = df.drop(columns=['Output', 'Unnamed: 12'])  # feature matrix
y = df['Output']  # target labels

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Candidate classifiers. Both are seeded: a decision tree randomly permutes the
# features at each split even with the default "best" splitter, so an unseeded
# tree can produce different results from one run to the next.
models = {'Random Forest': RandomForestClassifier(random_state=42),
          'Decision Tree': DecisionTreeClassifier(random_state=42)}

# Fit each model on the training split and print per-class metrics computed on
# the held-out split.
for name, model in models.items():
    model.fit(xtrain, ytrain)
    ypred = model.predict(xtest)
    print(f'Model: {name}')
    print('Report:\n', classification_report(ytest, ypred))

In [None]:
# Predict with the fitted Decision Tree explicitly. Looking the model up by
# name keeps this cell correct even if the `models` dict is reordered or
# extended, instead of relying on whatever `ypred` the training loop left behind.
ypred = models['Decision Tree'].predict(xtest)

# confusion matrix for decision trees
print(confusion_matrix(ytest, ypred))

# classification report for decision trees
print(classification_report(ytest, ypred))

In [None]:
# Box plot of 'Monthly Income' grouped by 'Occupation'.
# NOTE(review): both columns were label-encoded earlier in the notebook, so the
# axes show integer codes rather than the original category labels.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Occupation', y='Monthly Income', palette='Set2', ax=ax)
ax.set_title('Occupation vs. Monthly Income')

# Tilt the x tick labels so they do not overlap.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Number of rows per Occupation value. The column was label-encoded earlier in
# the notebook, so the index shows integer codes rather than the original labels.
df['Occupation'].value_counts()