In [3]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files
# Local import: the script needs io.BytesIO to read the uploaded bytes directly.
import io

# Prompt for a file upload in Colab; the result maps each uploaded file name
# to its raw bytes.
uploaded = files.upload()

# Load the dataset
# Read the bytes that were just uploaded rather than a hard-coded path: Colab
# saves a re-uploaded file under a new name (e.g. "indian_food (2) (1).csv"),
# so a fixed filename can silently load an older copy that is still on disk.
if uploaded:
    uploaded_name = next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
else:
    # Nothing was uploaded; fall back to the file expected on disk.
    df = pd.read_csv("indian_food (2).csv")

# Display the first few rows to check the structure
print("First few rows of the DataFrame:")
print(df.head())

# Check data types and for missing values
print("\nData types and missing values:")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()

# Strip any whitespace from column names
df.columns = df.columns.str.strip()

# Every column the analysis below touches must be present, not only
# 'page_views'; otherwise the cleaning/plotting code would raise KeyError.
required_columns = ['page_views', 'price', 'gender']
numeric_columns = ['page_views', 'price']
missing_columns = [col for col in required_columns if col not in df.columns]

if not missing_columns:
    # Remove thousands separators (commas) and convert to numeric, coercing
    # anything unparseable to NaN. Columns that pandas already parsed as
    # numeric are left untouched, since the .str accessor is only valid on
    # string data.
    for col in numeric_columns:
        if not pd.api.types.is_numeric_dtype(df[col]):
            without_commas = df[col].astype(str).str.replace(',', '', regex=False)
            df[col] = pd.to_numeric(without_commas, errors='coerce')

    # Check the conversion results
    print("\n'page_views' and 'price' after conversion:")
    print(df[numeric_columns].head())

    # Drop rows with NaN values in the relevant columns
    df = df.dropna(subset=required_columns)

    # Univariate Analysis
    plt.figure(figsize=(10, 6))
    sns.histplot(df['page_views'], bins=30, kde=True)
    plt.title('Distribution of Page Views')
    plt.xlabel('Page Views')
    plt.ylabel('Frequency')
    plt.show()

    # Bivariate Analysis
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x='page_views', y='price', data=df, hue='gender', palette='coolwarm')
    plt.title('Scatter Plot of Page Views vs Price')
    plt.xlabel('Page Views')
    plt.ylabel('Price')
    plt.legend(title='Gender')
    plt.show()

    # Multivariate Analysis using Pairplot
    sns.pairplot(df[required_columns], hue='gender', palette='coolwarm')
    plt.suptitle('Pairplot of Page Views, Price, and Gender', y=1.02)
    plt.show()

    # Correlation Heatmap
    correlation_matrix = df[numeric_columns].corr()
    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Heatmap between Page Views and Price')
    plt.show()

else:
    # Name every missing column so the user can see why the analysis was skipped.
    print(f"Required column(s) not found in the DataFrame: {', '.join(missing_columns)}")


Saving indian_food (2).csv to indian_food (2) (1).csv
First few rows of the DataFrame:
             name                                        ingredients  \
0      Balu shahi                    Maida flour, yogurt, oil, sugar   
1          Boondi                            Gram flour, ghee, sugar   
2  Gajar ka halwa       Carrots, milk, sugar, ghee, cashews, raisins   
3          Ghevar  Flour, ghee, kewra, milk, clarified butter, su...   
4     Gulab jamun  Milk powder, plain flour, baking powder, ghee,...   

         diet  prep_time  cook_time flavor_profile   course        state  \
0  vegetarian         45         25          sweet  dessert  West Bengal   
1  vegetarian         80         30          sweet  dessert    Rajasthan   
2  vegetarian         15         60          sweet  dessert       Punjab   
3  vegetarian         15         30          sweet  dessert    Rajasthan   
4  vegetarian         15         40          sweet  dessert  West Bengal   

  region  
0   East  
1