In [134]:
library(tidyverse)
library(dplyr)
data("starwars", package = "dplyr")

# Q1 — Dataset Inspection and Missing Values (5 points)
# Dimensions reported by glimpse(starwars):
#   observations: 87
#   variables:    14

# Missing Values
# Count the NA entries in each of the three columns of interest; the result
# is a one-row tibble with one count per column.
starwars %>%
    select(height, mass, homeworld) %>%
    summarise(
        across(everything(), function(column) sum(is.na(column)))
    )

# Result: height has 6 missing values, mass has 28, and homeworld has 10.

# Q2 — Create a Wide Summary Table (pivot_wider) (7 points)
# Pipeline:
#   1. keep only characters with a non-missing species;
#   2. group by species and gender;
#   3. average height within each group, ignoring missing heights;
#   4. pivot so rows are species, columns are genders, and cells hold the
#      mean height.
data_wide <- starwars %>%
    filter(!is.na(species)) %>%
    group_by(species, gender) %>%
    summarise(
        mean_height = mean(height, na.rm = TRUE),
        .groups = "drop"
    ) %>%
    pivot_wider(
        names_from = gender,
        values_from = mean_height
    )

data_wide
# Q3 — Convert Wide Table Back to Long Format (pivot_longer) (6 points)
# Stack the two gender columns back into gender / mean_height pairs, then
# discard the rows that contain a missing value (species/gender
# combinations that had no entry in the wide table).
data_long <- data_wide %>%
    pivot_longer(
        cols = c(feminine, masculine),
        names_to = "gender",
        values_to = "mean_height"
    ) %>%
    drop_na()
data_long

# Q4 — Create New Variables and Handle Missing Data (6 points)
# Derives two new columns from starwars and fills in missing homeworlds:
#   * bmi             — mass / (height / 100)^2. The division by 100
#                       converts height from centimetres to metres (units
#                       per the dplyr starwars documentation). bmi is NA
#                       whenever mass or height is missing.
#   * height_category — "short" (height < 170), "tall" (height >= 190),
#                       otherwise "average"; NA when height is missing.
#   * homeworld       — missing values replaced with "Unknown".
# glimpse() prints the structure and returns its input invisibly, so the
# assignment still captures the mutated tibble.
data_long_mutated <- starwars %>%
    mutate(
        bmi = mass / (height / 100)^2,
        height_category = case_when(
            # Without this branch a missing height makes both comparisons
            # NA and the row falls through to the catch-all, mislabelling
            # characters of unknown height as "average".
            is.na(height) ~ NA_character_,
            height < 170 ~ "short",
            height >= 190 ~ "tall",
            TRUE ~ "average"
        ),
        homeworld = replace_na(homeworld, "Unknown")
    ) %>%
    glimpse()


# Q5 — Unnesting and Interpretation (6 points)
# Expand the films list-column to one row per character/film pair, count
# the rows for each character (largest count first), and show the first
# eight characters.
starwars %>%
    select(name, films) %>%
    unnest(cols = films) %>%
    count(name, sort = TRUE) %>%
    slice_head(n = 8)

height,mass,homeworld
<int>,<int>,<int>
6,28,10


species,masculine,feminine
<chr>,<dbl>,<dbl>
Aleena,79.0,
Besalisk,198.0,
Cerean,198.0,
Chagrian,196.0,
Clawdite,,168.0
Droid,140.0,96.0
Dug,112.0,
Ewok,88.0,
Geonosian,183.0,
Gungan,208.6667,


species,gender,mean_height
<chr>,<chr>,<dbl>
Aleena,masculine,79.0
Besalisk,masculine,198.0
Cerean,masculine,198.0
Chagrian,masculine,196.0
Clawdite,feminine,168.0
Droid,feminine,96.0
Droid,masculine,140.0
Dug,masculine,112.0
Ewok,masculine,88.0
Geonosian,masculine,183.0


Rows: 87
Columns: 16
$ name            <chr> "Luke Skywalker", "C-3PO", "R2-D2", "Darth Vader", "Le…
$ height          <int> 172, 167, 96, 202, 150, 178, 165, 97, 183, 182, 188, 1…
$ mass            <dbl> 77.0, 75.0, 32.0, 136.0, 49.0, 120.0, 75.0, 32.0, 84.0…
$ hair_color      <chr> "blond", NA, NA, "none", "brown", "brown, grey", "brow…
$ skin_color      <chr> "fair", "gold", "white, blue", "white", "light", "ligh…
$ eye_color       <chr> "blue", "yellow", "red", "yellow", "brown", "blue", "b…
$ birth_year      <dbl> 19.0, 112.0, 33.0, 41.9, 19.0, 52.0, 47.0, NA, 24.0, 5…
$ sex             <chr> "male", "none", "none", "male", "female", "male", "fem…
$ gender          <chr> "masculine", "masculine", "masculine", "masculine", "f…
$ homeworld       <chr> "Tatooine", "Tatooine", "Naboo", "Tatooine",

name,n
<chr>,<int>
R2-D2,7
C-3PO,6
Obi-Wan Kenobi,6
Chewbacca,5
Leia Organa,5
Luke Skywalker,5
Palpatine,5
Yoda,5


#### In 2–3 sentences, explain why this data must be converted to long format.
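The films column is a list-column: each character's row holds a vector of every film that character appears in, so a single cell contains multiple values. Unnesting converts the data to long format with one row per character–film pair. With each appearance on its own row, appearances can be counted and analyzed using ordinary verbs such as count().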
