# Solutions to Exercises

## Unit 4.1: Data visualization with matplotlib

### Challenge

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Load the municipality table (semicolon-separated) and keep the column we plot.
df = pd.read_csv("data/dutch_municipalities.csv", sep=';')
population = df["population"]


def finish_histogram(left_limit=None):
    """Add the shared title and axis labels to the current plot, then show it.

    When left_limit is given, the left edge of the x-axis is set to that
    value before the figure is displayed.
    """
    plt.title("Size of Municipalities")
    plt.xlabel("inhabitants")
    plt.ylabel("# municipalities")
    if left_limit is not None:
        plt.xlim(left_limit)
    plt.show()


# Approach 1: filter the data first, then plot only the rows that remain.
plt.hist(population[population >= 20000], bins=50)
finish_histogram()

# Approach 2: plot everything and only move the left edge of the x-axis.
plt.hist(population, bins=50)
finish_histogram(left_limit=20000)  # BEWARE: it doesn't change the distribution

# Approach 3: let hist() restrict the domain itself.
# DOWNSIDE: you need to explicitly set the upper end of the domain
plt.hist(population, bins=50, range=(20000, 1000000))
finish_histogram()

### Question 0

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the menu data set (semicolon-separated CSV).
menu = pd.read_csv("data/mcdonalds_menu.csv", sep=';')

# Histogram of the 'Calories' column: x = calories, y = number of items.
print("Question 0:")
print("Distribution of the calories:")
fig, ax = plt.subplots()
ax.hist(menu['Calories'])
ax.set_xlabel('Calories')
ax.set_ylabel('Items')
plt.show()

### Question 1

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the menu and print its first five rows.
menu = pd.read_csv("data/mcdonalds_menu.csv", sep=';')
print(menu.head())

# Summary statistics of the data frame.
print(menu.describe())

# Count the distinct item names, then bar-plot the number of rows per category.
print("Question 1:")
distinct_items = menu['Item'].unique()
print("Total number of items:", len(distinct_items))
menu.groupby('Category')['Item'].count().plot.bar()
plt.show()

In [None]:
# The same bar chart, drawn with matplotlib directly instead of DataFrame.plot.
items_per_category = menu.groupby('Category')['Item'].count()
# reset_index() turns the category index into a regular 'Category' column;
# the 'Item' column then holds the per-category count.
df = items_per_category.reset_index()
x, y = df['Category'], df['Item']
plt.bar(x, y)
plt.ylabel('Items')
plt.xticks(rotation=90)  # category names are long, so write them vertically
plt.show()

### Question 2

In [None]:
# Fat analysis per category: a boxplot, then the "worst" items of each category.
print("Question 2:")
menu.boxplot(column=['Total Fat (% Daily Value)'], by=['Category'], rot=90)
plt.show()

# That created a boxplot using Pandas. To do it using matplotlib
# (the `tick_labels` keyword needs Matplotlib >= 3.9; older releases call it `labels`):
#
# categories = []
# values = []
# for lbl, grp in menu.groupby('Category'):
#     categories.append(lbl)
#     values.append(grp['Total Fat (% Daily Value)'].tolist())
# plt.boxplot(values, tick_labels = categories)
# plt.xticks(rotation=90)
# plt.show()

# Source column -> name of the per-category maximum. Renaming through an explicit
# mapping (instead of assigning a positional list to .columns) keeps every new
# name tied to the column it was computed from.
max_column_names = {
    'Total Fat (% Daily Value)': 'Max_Fat',
    'Trans Fat': 'Max_Trans_Fat',
    'Saturated Fat (% Daily Value)': 'Max_Sat_Fat',
    'Cholesterol (% Daily Value)': 'Max_Cholesterol',
}

# Per category, the maximum of each selected column. reset_index() turns
# 'Category' back into a regular column so it can be used as the merge key.
grp_by_category = (
    menu[['Category', *max_column_names]]
    .groupby('Category')
    .max()
    .reset_index()
    .rename(columns=max_column_names)
)
print(grp_by_category)

# Attach the category maxima to every menu row; 'Category' is the join key.
df = menu.merge(grp_by_category, on='Category')

# Items whose total fat (% daily value) equals the maximum of their category.
mask = df['Total Fat (% Daily Value)'] == df['Max_Fat']
fatty_menu = df.loc[mask, ['Category', 'Item', 'Total Fat (% Daily Value)', 'Cholesterol (% Daily Value)']]
print(fatty_menu)

# Items with the highest trans fat of their category, ignoring categories
# whose maximum is 0; printed in decreasing order of trans fat.
has_max_trans_fat = (df['Trans Fat'] == df['Max_Trans_Fat']) & (df['Trans Fat'] > 0)
trans_menu = df.loc[
    has_max_trans_fat,
    ['Category', 'Item', 'Total Fat (% Daily Value)', 'Trans Fat',
     'Saturated Fat (% Daily Value)', 'Cholesterol (% Daily Value)'],
]
print(trans_menu.sort_values(by='Trans Fat', ascending=False))


### Question 3

In [None]:
# Bar chart of the ten items with the highest vitamin C (% daily value).
print("Question 3:")
vitamin_c = 'Vitamin C (% Daily Value)'
# pivot_table aggregates rows that share an item name (default aggregation: mean).
vitamin_c_per_item = pd.pivot_table(menu, index=['Item'], values=[vitamin_c])
top_vitamin_c = vitamin_c_per_item.sort_values([vitamin_c], ascending=False).head(10)
top_vitamin_c.plot.bar()
plt.show()

### Question 4

In [None]:
# Pairwise scatter plots of the main nutrition columns, with a KDE on the diagonal.
print("Question 4:")
nutrition_columns = [
    'Calories', 'Total Fat', 'Saturated Fat', 'Cholesterol',
    'Sodium', 'Carbohydrates', 'Sugars', 'Protein',
]
selection = menu[nutrition_columns]
pd.plotting.scatter_matrix(selection, figsize=(12,12), diagonal='kde', grid=True)
plt.show()

## Unit 4.2: Working with date and time

### Date manipulations challenge

In [None]:
from datetime import date

# 1. Creating a `date` object from a string, e.g. the date that corresponds to '1785-01-07'.

# Failed attempt 1: setting a variable of a date object is not allowed
# my_date = date.today()
# my_date.year = 1785
# my_date.month = 1
# my_date.day = 7

# Successful attempt 2: works, but nothing is read from a string here
print('By hand:', date(1785, 1, 7))

# Successful attempt 3: split the string ourselves and convert the pieces to int
date_string = '1785-01-07'
parts = list(map(int, date_string.split('-')))  # [year, month, day]
print('With string parsing:', date(*parts))

# Successful attempt 4: let the datetime module parse the ISO string
print('With date.fromisoformat:', date.fromisoformat(date_string))

# Successful attempt 5: an explicit format, for strings that are not in ISO format
from datetime import datetime
date_string = '7 January 1785'
parsed = datetime.strptime(date_string, '%d %B %Y')
print('With datetime.strptime:', parsed.date())

# 2. Creating a `date` object that copies an existing one except for a single field.
# The example used here: take `date(1785,1,7)` and get the same day and month
# in the current year.

old_date = date(1785, 1, 7)

# Failed attempt 1: setting a variable of a date object is not allowed
# old_date.year = 2025

# Successful attempt 2: the year has to be typed by hand, and the call is long
new_date = date(2025, old_date.month, old_date.day)
print('With date constructor:', new_date)

# Successful attempt 3: the year is looked up, but the call is still long
this_year = date.today().year
new_date = date(this_year, old_date.month, old_date.day)
print('With date constructor and no magic numbers:', new_date)

# Successful attempt 4: replace() returns a new object with only the given fields changed
new_date = old_date.replace(year=date.today().year)
print('With datetime functionalities:', new_date)
# NOTE THAT THIS DOES NOT MODIFY old_date

# 3. Day of the week of a birthday: '%A' formats the full weekday name.

birthday = date(1985, 5, 14)
print('I was born on a', birthday.strftime('%A'))

### Grant winners challenge

In [None]:
import pandas as pd
from datetime import date

# Option 1: let Pandas parse the 'Award date' column while reading the file
df = pd.read_csv('data/grant_winners.csv', sep=';', parse_dates=['Award date'])
print(type(df.at[0, 'Award date']))

# Option 2: read the column as text, then convert every value with date.fromisoformat
df = pd.read_csv('data/grant_winners.csv', sep=';')
df['Award date'] = df['Award date'].map(date.fromisoformat)
print(type(df.at[0, 'Award date']))

### Question 1

In [None]:
from datetime import datetime

# Print a fixed date as "<weekday> <day> <month name> <year>".
given_date = datetime(2020, 2, 25)
print("Given date is")
print(f"{given_date:%A %d %B %Y}")

### Question 2

In [None]:
# Option 1: with the time library
import time

# time.time() is in seconds; scale to milliseconds and round to a whole number.
milliseconds = round(time.time() * 1000)
print(milliseconds)

# Option 2: with the datetime library
from datetime import datetime
now = datetime.now()
# Unlike option 1 this prints a float, so fractions of a millisecond are kept.
print(now.timestamp() * 1000)

### Question 3

In [None]:
from datetime import datetime

date_1 = datetime(2020, 2, 25).date()  # 2020-02-25
date_2 = datetime(2020, 9, 17).date()  # 2020-09-17

# Report which date is later (equal dates are reported as date_2),
# then take the gap between the two as a non-negative timedelta.
print("date_1 is greater" if date_1 > date_2 else "date_2 is greater")
delta = abs(date_1 - date_2)
print("Difference is", delta.days, "days")

### Question 4

In [None]:
import pandas as pd
from datetime import date, datetime

# Solution 1: convert the column to date objects and compare against date bounds.
df = pd.read_csv('data/grant_winners.csv', sep=';')
df['Award date'] = df['Award date'].map(date.fromisoformat)
start_date, end_date = date(2022, 9, 1), date(2024, 8, 31)
in_period = df['Award date'].between(start_date, end_date)  # both bounds are inclusive
selected_df = df[in_period]
print(selected_df['Title project'].tolist())

# Alternative solution using the datetime-parsing option in read_csv
df = pd.read_csv('data/grant_winners.csv', sep=';', parse_dates=['Award date'])
# The parsed column holds datetime values rather than date objects,
# so the start and end of the period are datetime objects as well.
start_date, end_date = datetime(2022, 9, 1), datetime(2024, 8, 31)
in_period = df['Award date'].between(start_date, end_date)
selected_df = df[in_period]
print(selected_df['Title project'].tolist())

# Alternative solution: ISO date strings sort alphabetically in chronological order,
# so the raw text column can be compared directly.
df = pd.read_csv('data/grant_winners.csv', sep=';')
in_period = df['Award date'].between('2022-09-01', '2024-08-31')
selected_df = df[in_period]
print(selected_df['Title project'].tolist())

## Unit 4.3: Matrix computation

### Question 0

In [None]:
import numpy as np

# 1. Build a matrix `m` whose rows are the two vectors `v_1` and `v_2`.
v_1 = [0.5, 3, 2.5]
v_2 = [-1, 3.5, 2]
m = np.vstack((v_1, v_2))  # stack the vectors as rows -> shape (2, 3)
print(m)

In [None]:
# 2. Average position of all flies, computed directly on `m`:
# averaging over axis 0 collapses the rows and gives one mean per coordinate.
print(np.mean(m, axis=0))

In [None]:
# 3. Distance between the two flies: the norm of the difference of their positions.
a_1, a_2 = np.asarray(v_1), np.asarray(v_2)
a_diff = np.subtract(a_1, a_2)
print(np.linalg.norm(a_diff))

### Question 1

Some items on the menu have 0 sugars, so dividing protein by sugars will result in invalid values. Below we make the assumption that anything that has sugars below 1 actually has sugars *equal* to 1.

In [None]:
import numpy as np
import pandas as pd

# Read the menu (semicolon-separated CSV) and display its first five rows.
menu = pd.read_csv("data/mcdonalds_menu.csv", sep=';')
menu.head()

In [None]:
# Display summary statistics of the menu data frame.
# Being the last expression of the cell, the result is rendered as the cell output.
menu.describe()

In [None]:
# Top muscle food: rank the items by their protein-to-sugar ratio.
# Sugars below 1 are treated as exactly 1, so the ratio never divides by zero.
menu['Protein/Sugar'] = menu['Protein'] / menu['Sugars'].clip(lower=1)
ranking_columns = ['Category', 'Item', 'Protein', 'Sugars', 'Protein/Sugar']
# Ten rows are displayed; the top 3 are the first three of them.
menu.sort_values('Protein/Sugar', ascending=False)[ranking_columns].head(10)