# Exploratory Data Analysis (EDA)

## Read Cookie Data

In [1]:
import pandas as pd
import numpy as np
import requests
import io
import altair as alt

In [2]:
def read_data(
    url: str = "https://raw.githubusercontent.com/the-pudding/data/master/cookies/choc_chip_cookie_ingredients.csv",
    timeout: float = 30.0,
) -> pd.DataFrame:
    """
    Reads CSV data from a URL into a Pandas DataFrame.

    Parameters:
    -----------
    url : str, optional
        Location of the CSV file. Defaults to the chocolate chip cookie
        ingredients file this notebook analyses.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns:
    --------
    pd.DataFrame
        A DataFrame containing the data from the specified URL, with the
        first CSV column used as the index.

    Raises:
    -------
    ValueError:
        If the data cannot be retrieved or parsed. Network/HTTP failures are
        re-raised as ValueError (with the original exception chained); parse
        failures propagate from pandas, whose parser errors are ValueError
        subclasses.

    Example:
    --------
    >>> df = read_data()
    >>> print(df.head())
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Fail loudly instead of printing and returning None: a None result
        # would only surface later as a confusing AttributeError on `df`.
        raise ValueError(f"Error fetching data: {e}") from e

    return pd.read_csv(io.StringIO(response.text), index_col=0)

In [3]:
# Fetch the ingredient table once; the cells below all work from `df`.
df = read_data()
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Ingredient,Text,Recipe_Index,Rating,Quantity,Unit
1,all purpose flour,3.0 cups all purpose flour,AR_1,0.920725,3.0,cup
2,all purpose flour,2.8000000000000003 cups all purpose flour,AR_10,0.905162,2.8,cup
3,all purpose flour,1.1076923076923078 cups all purpose flour,AR_101,0.6,1.107692,cup
4,all purpose flour,3.333333333333333 cups sifted all purpose flour,AR_102,0.9375,3.333333,cup
5,all purpose flour,2.0 cups all purpose flour,AR_103,0.88125,2.0,cup


In [4]:
# Check the shape of the dataset (rows, columns)
print(f"Dataset Shape: {df.shape}")

# Check column names and data types.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nColumn Information:")
df.info()

# Check summary statistics of numerical columns
print("\nSummary Statistics:")
print(df.describe())

Dataset Shape: (1990, 6)

Column Information:
<class 'pandas.core.frame.DataFrame'>
Index: 1990 entries, 1 to 1301
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Ingredient    1990 non-null   object 
 1   Text          1990 non-null   object 
 2   Recipe_Index  1990 non-null   object 
 3   Rating        980 non-null    float64
 4   Quantity      1990 non-null   float64
 5   Unit          1990 non-null   object 
dtypes: float64(2), object(4)
memory usage: 108.8+ KB
None

Summary Statistics:
           Rating     Quantity
count  980.000000  1990.000000
mean     0.814986     2.234415
std      0.135632     2.529849
min      0.375000     0.000000
25%      0.750000     1.000000
50%      0.870253     1.639632
75%      0.909016     2.666667
max      1.000000    48.000000


In [5]:
# Per-column tally of null entries, printed beneath a heading.
print("\nMissing Values:", df.isna().sum(), sep="\n")


Missing Values:
Ingredient         0
Text               0
Recipe_Index       0
Rating          1010
Quantity           0
Unit               0
dtype: int64


In [6]:
# For each identifier column: how many distinct values it holds, followed by
# its ten most frequent values.
for heading, column in (
    ("\nUnique Ingredients:", "Ingredient"),
    ("\nUnique Recipes:", "Recipe_Index"),
):
    print(heading, df[column].nunique())
    print(df[column].value_counts().head(10))


Unique Ingredients: 68
Ingredient
egg                         212
vanilla                     196
all purpose flour           193
baking soda                 187
sugar                       175
light brown sugar           170
salt                        168
butter                      160
semisweet chocolate chip    159
baking powder                50
Name: count, dtype: int64

Unique Recipes: 209
Recipe_Index
AR_101      16
AR_4        15
AR_39       14
AR_96       14
AR_19       13
Misc_30     13
Misc_63     13
Misc_109    13
AR_110      13
AR_91       13
Name: count, dtype: int64


In [7]:
# Frequency table of ingredients (most common first), built as one chain so
# the column labels are set without mutating the frame afterwards.
top_ingredients = (
    df["Ingredient"]
    .value_counts()
    .rename_axis("Ingredient")
    .reset_index(name="Count")
)
# Keep only the ten most frequent ingredients for plotting.
top_10 = top_ingredients.iloc[:10]
top_10

Unnamed: 0,Ingredient,Count
0,egg,212
1,vanilla,196
2,all purpose flour,193
3,baking soda,187
4,sugar,175
5,light brown sugar,170
6,salt,168
7,butter,160
8,semisweet chocolate chip,159
9,baking powder,50


In [8]:
# Bar chart of the ten most common ingredients, ordered by descending count.
chart = alt.Chart(top_10).mark_bar(color="skyblue")
chart = chart.encode(
    alt.X("Ingredient:N", sort="-y", title="Ingredient"),
    alt.Y("Count:Q", title="Frequency"),
    tooltip=["Ingredient", "Count"],
)
chart = chart.properties(title="Top 10 Most Common Ingredients", width=600)

chart.show()

In [9]:
# Histogram of ingredient quantities (all ingredients and units pooled).
hist = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        alt.X("Quantity:Q", bin=True, title="Quantity"),
        alt.Y("count()", title="Frequency"),
        # The tooltip uses the same binned field and aggregate as the axes so
        # hovering a bar reports its bin range and row count. A raw
        # "Quantity" tooltip would not match the aggregated bars.
        tooltip=[
            alt.Tooltip("Quantity:Q", bin=True, title="Quantity (bin)"),
            alt.Tooltip("count()", title="Frequency"),
        ],
    )
    .properties(title="Distribution of Ingredient Quantities", width=600)
)

hist.show()


In [10]:
# Altair Histogram for Ratings
# NOTE(review): `df` has one row per ingredient line, so a recipe's rating is
# presumably counted once per ingredient row here — confirm whether the chart
# should instead use one row per Recipe_Index.
rating_hist = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        alt.X("Rating:Q", bin=alt.Bin(maxbins=20), title="Rating"),
        alt.Y("count()", title="Count"),
        # The tooltip mirrors the binned x field and the count aggregate so
        # hovering a bar reports its bin range and row count rather than a
        # raw Rating value that does not match the aggregated bars.
        tooltip=[
            alt.Tooltip("Rating:Q", bin=alt.Bin(maxbins=20), title="Rating (bin)"),
            alt.Tooltip("count()", title="Count"),
        ],
    )
    .properties(title="Distribution of Recipe Ratings", width=600)
)

rating_hist.show()

In [11]:
# Recipe-by-ingredient grid: one rectangle per (recipe, ingredient) pair,
# shaded by how many rows of `df` fall in that pair.
heatmap = alt.Chart(df).mark_rect()
heatmap = heatmap.encode(
    alt.X("Recipe_Index:N", title="Recipe"),
    alt.Y("Ingredient:N", title="Ingredient"),
    alt.Color("count()", scale=alt.Scale(scheme="blues")),
    tooltip=["Recipe_Index", "Ingredient"],
)
heatmap = heatmap.properties(
    title="Ingredient Usage Across Recipes", width=700, height=500
)

heatmap.show()


In [12]:
# Pairwise correlation of the two numeric columns, reshaped to long form
# (one row per variable pair) for plotting. Column labels are assigned within
# the chain instead of overwriting `.columns` afterwards.
correlation_matrix = (
    df[["Quantity", "Rating"]]
    .corr()
    .rename_axis("Variable1")
    .reset_index()
    .melt(id_vars="Variable1", var_name="Variable2", value_name="Correlation")
)

correlation_chart = (
    alt.Chart(correlation_matrix)
    .mark_rect()
    .encode(
        x=alt.X("Variable1:N", title=""),
        y=alt.Y("Variable2:N", title=""),
        # Pin the diverging scale to the full correlation range so its
        # midpoint is zero; otherwise the domain is fitted to the observed
        # values and the colours cannot be read as sign/strength.
        color=alt.Color(
            "Correlation:Q",
            scale=alt.Scale(scheme="redblue", domain=[-1, 1]),
        ),
        tooltip=["Variable1", "Variable2", "Correlation"]
    )
    .properties(title="Correlation Matrix", width=400, height=400)
)

correlation_chart.show()
 