<a href="https://colab.research.google.com/github/ReemFarah/ReemFarah.github.io/blob/main/CC9_Analytics_charts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import altair as alt

# Remote CSV locations for the prices table and the items table
PRICES_CSV_URL = 'https://eco-prices-scrapes.s3.eu-west-2.amazonaws.com/teaching/redacted_prices_df.csv'
ITEMS_CSV_URL = 'https://eco-prices-scrapes.s3.eu-west-2.amazonaws.com/teaching/redacted_items_df.csv'

# Read each remote file into its own DataFrame
prices_df = pd.read_csv(PRICES_CSV_URL)
items_df = pd.read_csv(ITEMS_CSV_URL)

# Print the first rows of both frames to sanity-check their columns
for preview_frame in (prices_df, items_df):
    print(preview_frame.head())


         date  price      unit_price  loyalty_price  original_price  store_id  \
0  2023-10-06  12.95  0.16 per 100ml            NaN             NaN         5   
1  2023-10-06   9.00      9 per 75cl            NaN             NaN         5   
2  2023-10-06   4.00   1.29 per 100g            NaN             NaN         5   
3  2023-10-06   2.50   1.67 per 100g           2.00             NaN         5   
4  2023-10-06   1.50   1.36 per 100g           1.35             NaN         5   

   product_id  
0    209870.0  
1    265800.0  
2    181052.0  
3    122275.0  
4    164794.0  
   store_id  product_id    cpi_id                       cpi_name
0         1           3  212222.0                   chocolate 10
1         1           5  210321.0   pack of 5-6 individual cakes
2         1          17  210218.0           popcorn, bag, 10-30g
3         1          19  212732.0                     lemon each
4         1          54  210905.0  fresh/chilled chicken  per kg


In [None]:
# Disable the row limit for Altair so that a DataFrame can be passed to a
# chart without tripping Altair's maximum-rows check.
alt.data_transformers.disable_max_rows()


DataTransformerRegistry.enable('default')

In [None]:
# Sample up to 5000 rows from the dataset (fixed seed for reproducibility).
# The sample size is capped at the number of available rows: DataFrame.sample
# without replacement raises ValueError when n exceeds the row count.
sample_size = min(5000, len(prices_df))
prices_df = prices_df.sample(n=sample_size, random_state=42)


In [None]:
# Filter data to include only prices <= £20
filtered_prices = prices_df[prices_df['price'] <= 20]

# One bin definition shared by the x axis and the tooltip (£0.50 wide bins)
price_bins = alt.Bin(step=0.5)

# Create the histogram
histogram = alt.Chart(filtered_prices).mark_bar(
    color='skyblue', stroke='black'
).encode(
    x=alt.X(
        'price:Q',
        bin=price_bins,
        title='Price (£)'
    ),
    y=alt.Y(
        'count()',
        title='Frequency'
    ),
    tooltip=[
        # The tooltip must use the same binning as the x axis. An unbinned
        # 'price:Q' here would become an extra group-by field, so count()
        # would be computed per distinct price instead of per bin.
        alt.Tooltip('price:Q', bin=price_bins, title='Price (£)', format=".2f"),
        alt.Tooltip('count():Q', title='Frequency')
    ]
).properties(
    title='Histogram of Product Prices (Up to £20)',
    width=800,
    height=500
).configure_axis(
    grid=True,
    labelFontSize=12,
    titleFontSize=14
).configure_view(
    strokeWidth=0  # Remove border around chart
)

# Display the histogram
histogram.show()


In [None]:
# One bin definition shared by the x axis and the tooltip (£0.50 wide bins)
price_bins = alt.Bin(step=0.5)

# Create the histogram without black edges
histogram = alt.Chart(filtered_prices).mark_bar(
    color='skyblue'  # Removed stroke='black'
).encode(
    x=alt.X(
        'price:Q',
        bin=price_bins,
        title='Price (£)'
    ),
    y=alt.Y(
        'count()',
        title='Frequency'
    ),
    tooltip=[
        # The tooltip must use the same binning as the x axis. An unbinned
        # 'price:Q' here would become an extra group-by field, so count()
        # would be computed per distinct price instead of per bin.
        alt.Tooltip('price:Q', bin=price_bins, title='Price (£)', format=".2f"),
        alt.Tooltip('count():Q', title='Frequency')
    ]
).properties(
    title='Histogram of Product Prices (Up to £20)',
    width=800,
    height=500
).configure_axis(
    grid=True,
    labelFontSize=12,
    titleFontSize=14
).configure_view(
    strokeWidth=0  # Remove border around chart
)

# Display the histogram
histogram.show()


The histogram visualises the distribution of product prices (≤ £20) in the 5,000-row sample, showing how frequently prices fall into each £0.50 bin and helping to identify common price points.

In [None]:
# Parse the 'date' column into pandas datetimes
prices_df['date'] = pd.to_datetime(prices_df['date'])

# Keep rows priced at £20 or less and dated from 1 Jan 2023 onwards
price_in_range = prices_df['price'] <= 20
date_in_range = prices_df['date'] >= '2023-01-01'
scatter_data = prices_df[price_in_range & date_in_range]

# Encodings for the scatter layer (the regression layer inherits them)
date_axis = alt.X('date:T', title='Date')
price_axis = alt.Y('price:Q', title='Price (£)')
point_tooltips = [
    alt.Tooltip('date:T', title='Date'),
    alt.Tooltip('price:Q', title='Price (£)', format=".2f"),
]

# Scatter layer: one semi-transparent filled point per price observation
scatter_plot = (
    alt.Chart(scatter_data)
    .mark_point(color='dodgerblue', filled=True, opacity=0.6)
    .encode(x=date_axis, y=price_axis, tooltip=point_tooltips)
    .properties(
        title='Scatter Plot of Prices Over Time',
        width=800,
        height=500,
    )
)

# Regression layer: linear fit of price on date, drawn as a red line
regression_line = (
    scatter_plot
    .transform_regression('date', 'price', method='linear')
    .mark_line(color='red', strokeWidth=2)
)

# Overlay the fitted line on the points and render the result
scatter_with_regression = scatter_plot + regression_line
scatter_with_regression.show()


In [None]:
# Mean price per date, computed over rows priced at £20 or less
prices_up_to_20 = prices_df[prices_df['price'] <= 20]
aggregated_data = prices_up_to_20.groupby('date', as_index=False).agg({'price': 'mean'})

# Encodings for the scatter layer (the regression layer inherits them)
date_axis = alt.X('date:T', title='Date')
avg_price_axis = alt.Y('price:Q', title='Average Price (£)')
avg_tooltips = [
    alt.Tooltip('date:T', title='Date'),
    alt.Tooltip('price:Q', title='Average Price (£)', format=".2f"),
]

# Scatter layer: one filled point per date
scatter_plot = (
    alt.Chart(aggregated_data)
    .mark_point(color='dodgerblue', filled=True, size=60)
    .encode(x=date_axis, y=avg_price_axis, tooltip=avg_tooltips)
    .properties(
        title='Scatter Plot of Average Prices Over Time',
        width=800,
        height=500,
    )
)

# Regression layer: linear fit of the daily mean on date, drawn in red
regression_line = (
    scatter_plot
    .transform_regression('date', 'price', method='linear')
    .mark_line(color='red', strokeWidth=2)
)

# Overlay the trend line on the points and render the updated chart
scatter_with_regression = scatter_plot + regression_line
scatter_with_regression.show()


This scatter plot shows the daily average price (for items priced ≤ £20) over time, highlighting trends in product pricing, with a linear regression line indicating the overall direction.