# Detailed EDA for Restaurant Cuisine Dataset

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import LabelEncoder

# Load the cuisine-rating CSV and print a quick overview of its contents.
# NOTE: the path is absolute and machine-specific; adjust it when running elsewhere.
df = pd.read_csv('C:\\Users\\91767\\Downloads\\archive\\Cuisine_rating.csv')
print(df.head())

# Count of missing values per column, shown before any rows are removed.
print(df.isnull().sum())

# Keep only fully populated rows. The explicit .copy() makes df_clean an
# independent frame, so columns added to it later do not trigger pandas'
# SettingWithCopyWarning.
df_clean = df.dropna().copy()

# Column dtypes and summary statistics of the raw (unfiltered) frame.
print(df.dtypes)
print(df.describe())

In [None]:
# Bar chart: number of rows recorded for each cuisine type.
plt.figure(figsize=(10, 6))
sns.countplot(x='Cuisine_Type', data=df_clean).set_title('Distribution of Cuisines')
# Rotate the category labels so they do not overlap.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Histogram of the Rating column (20 bins) with a KDE curve overlaid.
plt.figure(figsize=(8, 6))
sns.histplot(
    data=df_clean['Rating'],
    kde=True,
    bins=20,
).set_title('Distribution of Ratings')
plt.show()

In [None]:
# Box plot of the Rating column: median, quartiles and outlying points.
plt.figure(figsize=(8, 6))
# Pass the series by keyword. seaborn releases before 0.12 read a positional
# argument as `x` (with a deprecation warning in 0.11), while 0.12+ reads it as
# `data`, so a positional call changes orientation depending on the installed
# version. `x=` pins a horizontal box everywhere.
sns.boxplot(x=df_clean['Rating'])
plt.title('Boxplot of Ratings')
plt.show()

In [None]:
# Histogram of the Votes column (20 bins) with a KDE curve overlaid.
plt.figure(figsize=(8, 6))
sns.histplot(data=df_clean['Votes'], kde=True, bins=20)
plt.gca().set_title('Distribution of Votes')
plt.show()

In [None]:
# Scatter plot of Rating (y) against Votes (x), one point per row.
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=df_clean,
    x='Votes',
    y='Rating',
).set_title('Ratings vs Votes')
plt.show()

In [None]:
# One box per cuisine type showing the spread of its ratings.
plt.figure(figsize=(10, 6))
sns.boxplot(
    data=df_clean,
    x='Cuisine_Type',
    y='Rating',
).set_title('Cuisine Type vs Rating')
# Rotate the category labels so they do not overlap.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Heat map of the pairwise correlations between the numeric columns.
plt.figure(figsize=(10, 8))
# Restrict to numeric/bool columns before correlating. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# ValueError on string columns such as Cuisine_Type. select_dtypes works on
# old and new pandas alike.
corr = df_clean.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Two-sample t-test comparing the ratings of Italian and Indian rows.
# cuisine_1 / cuisine_2 hold the Rating values of each group.
cuisine_1 = df_clean.loc[df_clean['Cuisine_Type'] == 'Italian', 'Rating']
cuisine_2 = df_clean.loc[df_clean['Cuisine_Type'] == 'Indian', 'Rating']

# Called with scipy's defaults, exactly as before.
t_stat, p_val = stats.ttest_ind(a=cuisine_1, b=cuisine_2)
print(f'T-test statistic: {t_stat}, p-value: {p_val}')

In [None]:
# Chi-square test of independence between cuisine type and a binned rating level.
#
# Ratings are bucketed into Low [0, 3], Medium (3, 4] and High (4, 5].
# include_lowest=True closes the first bin on the left, so a rating of exactly
# 0 is labelled 'Low' instead of becoming NaN and silently leaving the table.
# The column is added with assign(), which rebinds df_clean to a new frame and
# so avoids pandas' chained-assignment warning.
df_clean = df_clean.assign(
    Rating_Category=pd.cut(
        df_clean['Rating'],
        bins=[0, 3, 4, 5],
        labels=['Low', 'Medium', 'High'],
        include_lowest=True,
    )
)

# Observed counts: rows are cuisine types, columns are rating categories.
contingency_table = pd.crosstab(df_clean['Cuisine_Type'], df_clean['Rating_Category'])

# chi2_contingency returns the statistic, p-value, degrees of freedom and the
# table of expected frequencies under independence.
chi2_stat, p_val, dof, expected = stats.chi2_contingency(contingency_table)
print(f'Chi-Square statistic: {chi2_stat}, p-value: {p_val}')

In [None]:
# One-way ANOVA on the ratings of the Italian, Indian and Chinese groups.
# The generator yields one Rating series per cuisine, in that order.
anova_stat, p_val = stats.f_oneway(
    *(
        df_clean.loc[df_clean['Cuisine_Type'] == cuisine, 'Rating']
        for cuisine in ('Italian', 'Indian', 'Chinese')
    )
)
print(f'ANOVA statistic: {anova_stat}, p-value: {p_val}')

In [None]:
# Grid of pairwise plots across the columns of df_clean that seaborn can plot.
sns.pairplot(data=df_clean)
plt.show()

In [None]:
# Box plot of the Rating column, used here to eyeball outliers.
plt.figure(figsize=(8, 6))
# Pass the series by keyword. seaborn releases before 0.12 read a positional
# argument as `x` (with a deprecation warning in 0.11), while 0.12+ reads it as
# `data`, so a positional call changes orientation depending on the installed
# version. `x=` pins a horizontal box everywhere.
sns.boxplot(x=df_clean['Rating'])
plt.title('Boxplot of Ratings')
plt.show()

In [None]:
# Drop rating outliers using the 1.5 * IQR rule.
# q1 / q3 are the 25th and 75th percentiles of Rating; iqr is their spread.
q1, q3 = df_clean['Rating'].quantile([0.25, 0.75])
iqr = q3 - q1

# Keep rows whose rating lies inside [q1 - 1.5*iqr, q3 + 1.5*iqr];
# between() includes both end points.
df_clean_no_outliers = df_clean[
    df_clean['Rating'].between(q1 - 1.5*iqr, q3 + 1.5*iqr)
]
print(df_clean_no_outliers.shape)

In [None]:
# Heat map of the pairwise correlations, annotated to two decimal places.
#
# Restrict to numeric/bool columns before correlating. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# ValueError on string columns such as Cuisine_Type. select_dtypes works on
# old and new pandas alike.
corr = df_clean.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Encode the cuisine names as integer codes in a new column.
# `le` is kept so the fitted encoder remains available afterwards.
le = LabelEncoder()
cuisine_codes = le.fit_transform(df_clean['Cuisine_Type'])
df_clean['Cuisine_Type_encoded'] = cuisine_codes

In [None]:
# Two-sample z-test on the mean ratings of cuisine_1 and cuisine_2
# (the two rating series built for the t-test).
mean_diff = cuisine_1.mean() - cuisine_2.mean()

# Standard error of the difference: each group's sample variance divided by
# its size, summed, then square-rooted.
std_error = np.sqrt(
    cuisine_1.var() / cuisine_1.size
    + cuisine_2.var() / cuisine_2.size
)
z_score = mean_diff / std_error

# Two-sided p-value: twice the upper-tail probability of |z| under N(0, 1).
p_val_z = 2 * stats.norm.sf(abs(z_score))
print(f'Z-test statistic: {z_score}, p-value: {p_val_z}')