# Solutions to Exercises

This notebook is based on Anna-Lena Lamprecht's CoTaPP repository (https://github.com/annalenalamprecht/CoTaPP). Some modifications were made.

## Unit 3.1: Object-Oriented Programming

### 0. Room Occupancy Revisited (Easy Version)

In [None]:
from oop import Room
        
# Build a Room from the values 101 and 4, then echo its room number and
# maximum occupancy, one per line.  Expected output: 101 followed by 4.
room = Room(101, 4)
print(room.room_number, room.max_occupancy, sep="\n")

### 1. Room Occupancy Revisited (Hard Version)

In [None]:
from oop import HardRoom
            
    
################
# Main program #
################

# create some rooms: one HardRoom per pair below
# (first value is the room number; the second is presumably the guest limit)
all_rooms = {
    HardRoom(room_number, guest_limit)
    for room_number, guest_limit in ((101, 4), (102, 2), (201, 3), (202, 2))
}

# 1 - Print occupancy
def print_occupancy():
    """Print one "Room <number>: <guests> / <max_guests>" line per room.

    Rooms are listed in ascending room-number order, because iterating the
    ``all_rooms`` set directly has no guaranteed order.
    """
    for room in sorted(all_rooms, key=lambda r: r.number):
        print(f'Room {room.number}: {len(room.guests)} / {room.max_guests}')
# 2 - Check guest in
def check_in():
    """Ask for a guest name and a room number, then check the guest in.

    Prints "Not a valid room number." when the entered number is not an
    integer or does not match any room in ``all_rooms``.
    """
    guest = input("Enter name of guest: ")
    try:
        number = int(input("Enter room number: "))
    except ValueError:
        print("Not a valid room number.")
        return
    # next() with a default yields None for an unknown number instead of
    # raising, so the "not valid" branch below is actually reachable.
    room = next((r for r in all_rooms if r.number == number), None)
    if room is not None:
        room.checkIn(guest)
    else:
        print("Not a valid room number.")
# 3 - Check guest out
def check_out():
    """Ask for a guest name and a room number, then check the guest out.

    Prints "Not a valid room number." when the entered number is not an
    integer or does not match any room in ``all_rooms``.
    """
    guest = input("Enter name of guest: ")
    try:
        number = int(input("Enter room number: "))
    except ValueError:
        print("Not a valid room number.")
        return
    # next() with a default yields None for an unknown number instead of
    # raising, so the "not valid" branch below is actually reachable.
    room = next((r for r in all_rooms if r.number == number), None)
    if room is not None:
        room.checkOut(guest)
    else:
        print("Not a valid room number.")

# Wrapper program
# Menu choice -> handler.  Choice "4" (exit) is handled separately because
# it has to leave the loop.
actions = {"1": print_occupancy, "2": check_in, "3": check_out}
while True:
    # strip() lets an answer such as " 2" still select option 2
    choice = input("Please choose what you want to do (1-print occupancy/2-check in/3-check out/4-exit program): ").strip()
    if choice == "4":
        print("Goodbye!")
        break
    if choice in actions:
        actions[choice]()
    else:
        print("Invalid input, try again.")

### 2. People at the University

In [None]:
from oop import Person, Student, Lecturer, BachelorStudent, MasterStudent

# One example object per concrete class.
# NOTE(review): the meaning of each constructor argument is defined in oop.py
# and is not visible here.
student1 = BachelorStudent("Alice", "UU", "Biology", "Amsterdam")
student2 = MasterStudent("Bob", "UU", "Chemistry", "Biophysics")
lecturer = Lecturer("Cindy", "UU", "Information and Computing Sciences")

# For each student: show the record, assign credit points, read them back.
for student, points in ((student1, 150), (student2, 45)):
    student.printInfo()
    student.setCreditPoints(points)
    print(f"{student.name} has {student.getCreditPoints()} points.")

lecturer.printInfo()

## Unit 3.2: CSV files, Pandas, tabular data

### In-class challenge

In [None]:
def get_third_teacher():
    """Return the third line of ``data/teachers.txt``.

    The line is returned exactly as read, so it keeps its line ending if the
    file has one there.  Raises IndexError when the file has fewer than
    three lines.
    """
    with open('data/teachers.txt') as teacher_file:
        third_line = list(teacher_file)[2]
    return third_line

# Display the line returned for the in-class challenge.
print(get_third_teacher())

### 1. Interview Anonymization

In [None]:
import pandas as pd

# Source interview and the file that receives the anonymized copy.
interview_file = "data/interview-with-a-syrian-refugee.csv"
new_file = "data/interview-with-a-syrian-refugee-anonymized.csv"

def anonymize(sentence):
    """Return ``sentence`` with every 'Samira' replaced by 'ANONYMOUS'.

    The match is case-sensitive; text without the name is returned unchanged.
    """
    name, placeholder = 'Samira', 'ANONYMOUS'
    return sentence.replace(name, placeholder)

# Load the interview, add an anonymized copy of every sentence, and write
# that column together with the ids to the new file.
df = pd.read_csv(interview_file)
df['anonymized_sentence'] = df['sentence'].apply(anonymize)
df.to_csv(new_file, columns=['anonymized_sentence', 'id'])

### 2. Longest Word

In [None]:
import pandas as pd

# Source interview and the file that receives the longest word per sentence.
interview_file = "data/interview-with-a-syrian-refugee.csv"
new_file = "data/interview-with-a-syrian-refugee-longest-word.csv"

def find_longest_word(string):
    """Return the longest purely alphabetic word in ``string``.

    Words are the whitespace-separated pieces of ``string``; a piece that
    contains any non-letter character (digits, attached punctuation, ...)
    is ignored.  On a tie the earliest word wins, and '' is returned when
    no piece qualifies.
    """
    alphabetic_words = (word for word in string.split() if word.isalpha())
    # max() keeps the first of several equally long words; the default
    # covers the case where nothing qualified.
    return max(alphabetic_words, key=len, default='')

# Load the interview, compute the longest word of every sentence, and write
# that column together with the ids to the new file.
df = pd.read_csv(interview_file)
df['longest_word'] = df['sentence'].apply(find_longest_word)
df.to_csv(new_file, columns=['longest_word', 'id'])

### 3. Randomized Story-Telling

In [None]:
import pandas as pd
import sys
import random

# set path to input file
infile = "data/inputs.csv"

# read input file as dataframe; for any error, display the exception
# message and stop
try:
    df_in = pd.read_csv(infile, sep=",")
except Exception as err:
    print("Something went wrong...")
    print(err)
    sys.exit()


# ask user how many sentences should be created, repeating the question
# until the answer parses as an integer
number = None
while number is None:
    try:
        number = int(input("How many sentences do you want to create? "))
    except ValueError:
        print("That was no valid number. Try again.")

# create the desired number of sentences
sentence_parts = ("who", "does what", "how", "where")
while number > 0:

    # pick a random row for each of the four sentence elements, in order
    who, does_what, how, where = (
        df_in.loc[random.randint(0, df_in[part].size - 1), part]
        for part in sentence_parts
    )

    print(f"{who} {does_what} {how} {where}.")

    number -= 1

### 4. Population and Universities per Province

In [None]:
import pandas as pd
import sys

# set paths to input and output file
infile = "data/dutch_municipalities.csv"
outfile = "data/dutch_provinces.csv"


try:
    # read input file as dataframe
    df_in = pd.read_csv(infile)

# for any error, display the exception message
except Exception as err:
    print("Something went wrong...")
    print(err)
    sys.exit()

# Sum population and university counts per province in one pass.  groupby
# sorts the group keys, so the rows come out in ascending province order.
# Building the frame row by row with DataFrame.append is not an option:
# that method was removed in pandas 2.0.
df_out = (
    df_in.groupby("province", as_index=False)[["population", "university"]]
    .sum()
    .rename(columns={"university": "universities"})
)

try:
    # save new dataframe as csv file
    df_out.to_csv(outfile, index=False)

# for any error, display the exception message
except Exception as err:
    print("Something went wrong...")
    print(err)
    sys.exit()
    

# Another possible, but longer solution is with the csv package and 
# dictionaries, as shown below (without try/except error handling).
#
# import csv
#
## create two empty dictionaries to collect the aggregated data
#universities_per_province = {}
#population_per_province = {}
#
## read in the data and iterate over all rows, adding up
## population and university numbers per province
#with open("dutch_municipalities.csv", "r") as csvfile:
#    csvreader = csv.DictReader(csvfile, delimiter=',')
#    for row in csvreader:
#        if row["province"] not in universities_per_province:
#            universities_per_province[row["province"]] = int(row["university"])

## Unit 3.3: Join two dataframes, group by and correlations of variables

### 1. Analysis of the McDonald’s Menu

#### Question a

In [None]:
import pandas as pd

# load the menu data
menu = pd.read_csv("data/mcdonalds_menu.csv")

# report the number of distinct items and the item count per category
distinct_items = menu["Item"].unique()
print("Total number of items:", len(distinct_items))
items_per_category = menu.groupby('Category')['Item'].count()
print(items_per_category)

The most represented category is "Coffee & Tea"

#### Question b

In [None]:
# analysis fat per category

# Per-category maximum of each fat-related column.
fat_columns = [
    'Total Fat (% Daily Value)',
    'Trans Fat',
    'Saturated Fat (% Daily Value)',
    'Cholesterol (% Daily Value)',
]
grp_by_category = menu[['Category'] + fat_columns].groupby(['Category']).max()
# Turn Category back into an ordinary column; as the index it would get in
# the way of the merge below.
grp_by_category = grp_by_category.reset_index()
grp_by_category.columns = ['Category', 'Max_Fat', 'Max_Trans_Fat', 'Max_Sat_Fat', 'Max_Cholestrol']
print(grp_by_category)

# Attach the category maxima to every menu item; the frames are joined on
# their common column ("Category").
df = menu.merge(grp_by_category)

# Items whose total fat (% daily value) equals the maximum of their category.
mask = df['Total Fat (% Daily Value)'] == df.Max_Fat
fatty_columns = ['Category', 'Item', 'Total Fat (% Daily Value)', 'Cholesterol (% Daily Value)']
fatty_menu = df.loc[mask, fatty_columns]
print(fatty_menu)

# Items carrying their category's maximum trans fat, ignoring categories
# whose maximum is zero; shown in decreasing order of trans fat.
has_max_trans_fat = (df['Trans Fat'] == df.Max_Trans_Fat) & (df['Trans Fat'] > 0)
trans_columns = [
    'Category',
    'Item',
    'Total Fat (% Daily Value)',
    'Trans Fat',
    'Saturated Fat (% Daily Value)',
    'Cholesterol (% Daily Value)',
]
trans_menu = df.loc[has_max_trans_fat, trans_columns]
print(trans_menu.sort_values(by='Trans Fat', ascending=False))

#### Question c

In [None]:
# anything healthy?
# Keep items with zero trans fat, zero cholesterol (% daily value), sugars
# below 20 and total fat of at most 20 (% daily value).
is_healthy = (
    (df['Trans Fat'] == 0)
    & (df['Sugars'] < 20)
    & (df['Total Fat (% Daily Value)'] <= 20)
    & (df['Cholesterol (% Daily Value)'] == 0)
)
healthy = df.loc[is_healthy, ['Category', 'Item', 'Calories']].sort_values('Calories', ascending=False)

# Leave the two drink categories out of the printed result.
is_drink = (healthy['Category'] == "Beverages") | (healthy['Category'] == "Coffee & Tea")
print(healthy[~is_drink])

#### Question d

In [None]:
# top 10 vitamin C
# The header names this section ("Question d"); the questions in this unit
# are lettered, not numbered.
print("Question d:")
# pivot_table aggregates with the mean by default, so an item that occurs
# in several rows is represented by its average value.
vitamin_c_by_item = pd.pivot_table(menu, index=['Item'], values=['Vitamin C (% Daily Value)'])
print(vitamin_c_by_item.sort_values(['Vitamin C (% Daily Value)'], ascending=False)[:10])

#### Question e

In [None]:
# nutrition feature comparison: pairwise correlations between the selected
# nutrition columns
nutrition_columns = [
    'Calories',
    'Total Fat',
    'Saturated Fat',
    'Cholesterol',
    'Sodium',
    'Carbohydrates',
    'Sugars',
    'Protein',
]
selection = menu[nutrition_columns]
print(selection.corr())