# The STAR WARS Character Dataset

In [1]:
import json

## Creating the dataset

In [2]:
with open('../../data/star_wars.json', 'r') as f:
    dataset = json.loads(f.read())

In [3]:
len(dataset)

16

## Analytical Questions

### 1. How many Star Wars Characters are in the dataset?

In [4]:
char_names = []
for c in dataset:
    name = c['name']
    char_names.append(name)

len(char_names)

16

### 2. Find the eye_color of the characters and their counts

You should create a dictionary <code>eye_color</code> with the colors as keys and their counts as values. Counts are the number of characters with that eye color.

Expected output should be a dictionary with keys as colors and values as counts.

<code>
{'brown': 3,
 'blue-gray': 1,
 'blue': 6,
 'yellow': 2,
 'red': 2,
 'black': 1,
 'orange': 1}
</code>

In [5]:
eye_color = {}
for c in dataset:
    color = c['eye_color']
    if color not in eye_color:
        eye_color[color] = 1
    else:
        eye_color[color] += 1

eye_color

{'blue': 6,
 'yellow': 2,
 'red': 2,
 'brown': 3,
 'blue-gray': 1,
 'black': 1,
 'orange': 1}

### 3. What is the most predominant eye_color among all characters, and what is its count?

In [6]:
predom_color = None
max_count = -9999999
for color, count in eye_color.items():
    if count > max_count:
        max_count = count
        predom_color = color

print(predom_color, max_count)

blue 6


### 4. How many female characters are listed?

In [7]:
female_char_count = 0
for c in dataset:
    sex = c['gender']
    if sex == 'female':
        female_char_count += 1

female_char_count  



2

### 5.What is the average height of all characters?

Average height of all the character will be the sum of all character heights divided by the number of characters.

** Convert the individual character height to float before calculating the average.

In [8]:
heights = []
for c in dataset:
    height = float(c['height'])
    heights.append(height)

avg = sum(heights) / len(heights)
avg

169.75

### 6. Find the films directed by George Lucas?

Create a set <code>films_set</code> and store all the films directed by 'George Lucas'.

Example of expected output:

<code>{'The Phantom Menace','Attack of the Clones',...}</code>

In [9]:
films_set = set()  # remember: A set doesn't allow dublicates!!
for entry in dataset:
    for film in entry['films']:
        if film['director'] == 'George Lucas':
            films_set.add(film['title'])

films_set

{'A New Hope',
 'Attack of the Clones',
 'Revenge of the Sith',
 'The Phantom Menace'}

### 7. How many films were directed by George Lucas in total?
Count the number of films directed by George Lucas

In [10]:
len(films_set)

4

### 8. Provide the name of the character who possesses the highest weight among all the characters

Provide the name of the character with the highest weight.

There are some characters whose weight is 'unknown', and you have to ignore those characters.

In [11]:
char_weights = {}
for entry in dataset:
    name = entry['name']
    try:
        weight = int(entry['mass'])
        char_weights[name] = weight
    except:
        continue


top_weight = max(list(char_weights.values()))
for char, value in  char_weights.items():
    if value == top_weight:
        top_name = char

print(top_name)

Jabba Desilijic Tiure


### 9. Name of the character that is a dwarf

Input the name of the character that is a dwarf. A dwarf is a character whose height is less than 100 cm. If there are multiple dwarfs, input the name of the character that has lowest height.

In [12]:
dwarfs = {}
for entry in dataset:
    if int(entry['height']) < 100:
        dwarfs[entry['name']] = int(entry['height'])
        
dwarfs =dict(sorted(dwarfs.items(), key=lambda item: item[1]))
dwarfs

{'R2-D2': 96, 'R5-D4': 97}

### 10. Total number of distinct films

Input the total number of distinct films.

In [13]:
films = set()
for entry in dataset:
    for film in entry['films']:
        films.add(film['title'])

len(films)

6

## Data Transformation

### 11. Films that were produced by more than one person?

Create a dictionary <code>films_dict</code> where the film's title will be the key and the list of producers as its values.

Example of expected output:

<code>{'Black Bird':['Gary Kurtaz','Adult Smith'],
'Return of the Jedi': ['Howard G. Kazanjian','George Lucas','Rick McCallum'],
  ...
}</code>

In [21]:
films_dict = {}

for character in dataset:
    for film in character["films"]:
        producer = film["producer"]
        if len(producer) > 1:
            films_dict[film["title"]] = producer

films_dict

{'A New Hope': ['Gary Kurtz', 'Rick McCallum'],
 'The Empire Strikes Back': ['Gary Kurtz', 'Rick McCallum'],
 'Return of the Jedi': ['Howard G. Kazanjian',
  'George Lucas',
  'Rick McCallum']}

### 12. Films in chronological order

Create a dictionary <code>films_chron</code> where the keys will be the datetime (from the datetime module) of the film and the values will be the titles of the films.

Example of expected output:
<code>
{ 
   datetime.datetime(1999, 5, 19, 0, 0): 'The Phantom Menace',
   datetime.datetime(2002, 5, 16, 0, 0): 'Attack of the Clones',
 ...
 }
 </code>
 
Sort the films by date in ascending order.

In [15]:
from datetime import datetime
def str_to_datetime(date_str):
    try:
        year = int(date_str[0:4])
        month = int(date_str[5:7])
        day = int(date_str[-2:])
        rel_date = datetime(year=year, month=month, day=day)
        return rel_date
    except ValueError as e:
        print(f"Error converting date: {e}")
        return None


In [16]:

films_chron = {}
for entry in dataset:
    for film in entry['films']:
        title_str = film['title']
        rel_date_str = film['release_date']
        rel_date = str_to_datetime(rel_date_str)
        films_chron[rel_date] = title_str

# Sortieren des Dictionaries
films_chron = dict(sorted(films_chron.items()))
films_chron

{datetime.datetime(1977, 5, 25, 0, 0): 'A New Hope',
 datetime.datetime(1980, 5, 17, 0, 0): 'The Empire Strikes Back',
 datetime.datetime(1983, 5, 25, 0, 0): 'Return of the Jedi',
 datetime.datetime(1999, 5, 19, 0, 0): 'The Phantom Menace',
 datetime.datetime(2002, 5, 16, 0, 0): 'Attack of the Clones',
 datetime.datetime(2005, 5, 19, 0, 0): 'Revenge of the Sith'}

### 13. Create a list films containing the title of the films

Create a list <code>films</code> containing the title of the films sorted in ascending lexicographical order (A>Z). <code>films</code> should contain distinct (or unique) values.

Example of expected output:

<code>[
  "A New Hope",
  "Attack of the Clones",
  ...
  "The Phantom Menace"
]</code>

In [17]:
filmsset = set()
for entry in dataset:
    for film in entry['films']:
        title_str =  film['title']
        filmsset.add(title_str)
films = list(sorted(filmsset))
films

['A New Hope',
 'Attack of the Clones',
 'Return of the Jedi',
 'Revenge of the Sith',
 'The Empire Strikes Back',
 'The Phantom Menace']

### 14. Which characters appeared in multiple films?

Create a dictionary <code>characters_dict</code> containing those characters that appeared in MORE than one film (ie: at least, 2 films). The dictonary will map the name of the character to a list of all the films they participated in.

Example of expected output:

<code>{
    'Luke Skywalker': ['A New Hope', 'The Empire Strikes Back', 'Return of the Jedi'],
    'Han Solo': ['A New Hope', 'The Empire Strikes Back', 'Return of the Jedi'],
    'Darth Vader': ['A New Hope', 'The Empire Strikes Back', 'Return of the Jedi', 'Revenge of the Sith'],
    ...
}</code>

In [22]:
characters_dict = {}

for character in dataset:
    name = character["name"]
    if len(character['films']) <= 1:
        continue

    characters_dict[name] = []
    for film in character["films"]:
        characters_dict[name].append(film["title"])

characters_dict

{'Luke Skywalker': ['A New Hope',
  'The Empire Strikes Back',
  'Return of the Jedi',
  'Revenge of the Sith'],
 'C-3PO': ['A New Hope',
  'The Empire Strikes Back',
  'Return of the Jedi',
  'The Phantom Menace',
  'Attack of the Clones',
  'Revenge of the Sith'],
 'R2-D2': ['A New Hope',
  'The Empire Strikes Back',
  'Return of the Jedi',
  'The Phantom Menace',
  'Attack of the Clones',
  'Revenge of the Sith'],
 'Darth Vader': ['A New Hope',
  'The Empire Strikes Back',
  'Return of the Jedi',
  'Revenge of the Sith'],
 'Leia Organa': ['A New Hope',
  'The Empire Strikes Back',
  'Return of the Jedi',
  'Revenge of the Sith'],
 'Owen Lars': ['A New Hope', 'Attack of the Clones', 'Revenge of the Sith'],
 'Beru Whitesun lars': ['A New Hope',
  'Attack of the Clones',
  'Revenge of the Sith'],
 'Obi-Wan Kenobi': ['A New Hope',
  'The Empire Strikes Back',
  'Return of the Jedi',
  'The Phantom Menace',
  'Attack of the Clones',
  'Revenge of the Sith'],
 'Anakin Skywalker': ['The Ph

### 15. Films and their director name

Create a list <code>films_director</code> where each element will be a tuple containing the film title and the director's name. The list should contain distinct values.

Example of expected output:

<code>[('A New Hope', 'George Lucas'),
 ('The Empire Strikes Back', 'Irvin Kershner'),
 ('Return of the Jedi', 'Richard Marquand'),
 ('The Phantom Menace', 'George Lucas'),
 ...
 ]</code>

In [20]:
films_director = list()
for entry in dataset:
    for film in entry['films']:
        title_str = film['title']
        director_str = film['director']
        tup = (title_str, director_str)
        if tup not in films_director:
            films_director.append(tup)

films_director

[('A New Hope', 'George Lucas'),
 ('The Empire Strikes Back', 'Irvin Kershner'),
 ('Return of the Jedi', 'Richard Marquand'),
 ('Revenge of the Sith', 'George Lucas'),
 ('The Phantom Menace', 'George Lucas'),
 ('Attack of the Clones', 'George Lucas')]

## Data Transformation/Aggregation

### 16. Characters homeworld and it's size

Create a set <code>planets_set</code> which will contain the names and diameters of the planets the characters are from in the form of tuples.

Example of expected output:

<code>{('Nal Hutta', 12150),
 ('Rodia', 7549),
 ('Stewjon', 0),
 ...
 }</code>


In [30]:
planets_set = set()
for character in dataset:
    for key, value in character['homeworld'].items():
        if key == 'name':
            planet_name = value
        if key == 'diameter':
            planet_diameter = int(value)
            planets_set.add( (planet_name, planet_diameter) )

planets_set

{('Alderaan', 12500),
 ('Corellia', 11000),
 ('Eriadu', 13490),
 ('Kashyyyk', 12765),
 ('Naboo', 12120),
 ('Nal Hutta', 12150),
 ('Rodia', 7549),
 ('Stewjon', 0),
 ('Tatooine', 10465)}

### 17. Star Wars species with red eyes

Create a set <code>red_eyes_species</code> that contains the names of the species with red eyes.

* Here, we are considering the species associated with a character, and not a character itself.
* If species has multiple eye colors, and one of them is red, then it should be included in the set.

Expected example of output:

<code>{'Wookie',...}</code>

In [39]:
red_eyes_species = set()
for character in dataset:
    for specie in character['species']:
       if 'red' in specie['eye_colors']:
           red_eyes_species.add(specie['name'])

red_eyes_species

{'Hutt', 'Wookie'}

### 18. Characters sharing the same skin color

Create a dictionary called <code>skin_color_character</code>, which contains keys representing different skin colors and values as lists of names of characters who share the same skin color. The list of names of characters should be unique.

Example of expected output:

<code>{'fair': ['Luke Skywalker', 'C-3PO', 'R2-D2', 'Leia Organa', 'Owen Lars'],
 'gold': ['Darth Vader'],
 'white': ['Obi-Wan Kenobi', 'Beru Whitesun lars'],
 'light': ['Chewbacca', 'Han Solo', 'Jabba Desilijic Tiure'],
 'green': ['Yoda'],
  ...
}</code>

In [43]:
skin_color_character = {}
for character in dataset:
    for skin_color in character["skin_color"]:
        if skin_color not in skin_color_character:
            skin_color_character[skin_color] = []
        skin_color_character[skin_color].append(character["name"])


skin_color_character

{'fair': ['Luke Skywalker',
  'Obi-Wan Kenobi',
  'Anakin Skywalker',
  'Wilhuff Tarkin',
  'Han Solo'],
 'gold': ['C-3PO'],
 'white': ['R2-D2', 'Darth Vader', 'R5-D4'],
 'blue': ['R2-D2'],
 'light': ['Leia Organa',
  'Owen Lars',
  'Beru Whitesun lars',
  'Biggs Darklighter'],
 'red': ['R5-D4'],
 'unknown': ['Chewbacca'],
 'green': ['Greedo'],
 'green-tan': ['Jabba Desilijic Tiure'],
 'brown': ['Jabba Desilijic Tiure']}

### 19. Find the characters from the same planet

Create a dictionary <code>from_planet</code> where the planets will be keys and the values will be the set of the character's names.

Example of expected output:

<code>{'Eriadu': {'Wilhuff Tarkin'},
 'Kashyyyk': {'Chewbacca'},
 'Corellia': {'Han Solo'},
 'Rodia': {'Greedo'},
 'Nal Hutta': {'Jabba Desilijic Tiure'}}</code>

In [48]:
from_planet = {}
for character in dataset:
    planet = character['homeworld']
    name = character['name']
    if planet['name'] not in from_planet:
        from_planet[planet['name']] = {name}
    
from_planet

{'Tatooine': {'Luke Skywalker'},
 'Naboo': {'R2-D2'},
 'Alderaan': {'Leia Organa'},
 'Stewjon': {'Obi-Wan Kenobi'},
 'Eriadu': {'Wilhuff Tarkin'},
 'Kashyyyk': {'Chewbacca'},
 'Corellia': {'Han Solo'},
 'Rodia': {'Greedo'},
 'Nal Hutta': {'Jabba Desilijic Tiure'}}

### 20. Names of characters along with the frequency of their appearances in each film.

Create a dictionary <code>character_film_count</code> where the character's name will be the key, and the count of films they appeared in will be the value. There should be no duplicate characters in the dictionary.

Example of expected output:

<code>{
    'Luke Skywalker': 4,
    'Darth Vader': 3,
    'Princess Leia': 3,
    ...
}</code>

In [50]:
character_film_count = {}
for character in dataset:
    character_film_count[character['name']] = len(character['films'])

character_film_count

{'Luke Skywalker': 4,
 'C-3PO': 6,
 'R2-D2': 6,
 'Darth Vader': 4,
 'Leia Organa': 4,
 'Owen Lars': 3,
 'Beru Whitesun lars': 3,
 'R5-D4': 1,
 'Biggs Darklighter': 1,
 'Obi-Wan Kenobi': 6,
 'Anakin Skywalker': 3,
 'Wilhuff Tarkin': 2,
 'Chewbacca': 4,
 'Han Solo': 3,
 'Greedo': 1,
 'Jabba Desilijic Tiure': 3}

### 21. Which characters have appeared in all the films?

Create a list <code>characters_appeared_in_all_films</code> that contains the names of characters who appeared in all the films.

Example of expected output:

<code>['Luke Skywalker', 'Darth Vader']</code>

In [51]:
all_films = set()

for character in dataset:
    for film in character['films']:
        all_films.add(film['title'])

characters_appeared_in_all_films = []
for character in dataset:
    character_films = set()
    for film in character['films']:
        character_films.add(film['title'])
    
    if character_films == all_films:
        characters_appeared_in_all_films.append(character['name'])

characters_appeared_in_all_films

['C-3PO', 'R2-D2', 'Obi-Wan Kenobi']

### 22. Normalize the data to remove any duplicates.

There are some characters who are from the same planet, and we have to consider the planet only once. Create a dictionary <code>planet_characters</code> where the keys will be the planet names and the values will contains the diameter, population and the list of characters from that planet. The characters list should not contain any duplicates and should be sorted in ascending order.

Example of expected output:

<code>{'Naboo': {'diameter': '12120',
  'population': '4500000000',
  'characters': ['R2-D2']},
 'Alderaan': {'diameter': '12500',
  'population': '2000000000',
  'characters': ['Leia Organa']},
 'Stewjon': {'diameter': '0',
  'population': 'unknown',
  'characters': ['Obi-Wan Kenobi']},
 ...
 }</code>

In [58]:
planet_characters = {}

for character in dataset:
    
    if character['homeworld']['name'] not in planet_characters:
        planet_characters[character['homeworld']['name']] = {'diameter' : character['homeworld']['diameter'],
                                                             'population': character['homeworld']['population'],
                                                             'characters' : [character['name']]}
    else:
        planet_characters[character['homeworld']['name']]['characters'].append(character['name'])

for planet in planet_characters:
    planet_characters[planet]['characters'] = sorted(set(planet_characters[planet]['characters']))

planet_characters

{'Tatooine': {'diameter': '10465',
  'population': '200000',
  'characters': ['Anakin Skywalker',
   'Beru Whitesun lars',
   'Biggs Darklighter',
   'C-3PO',
   'Darth Vader',
   'Luke Skywalker',
   'Owen Lars',
   'R5-D4']},
 'Naboo': {'diameter': '12120',
  'population': '4500000000',
  'characters': ['R2-D2']},
 'Alderaan': {'diameter': '12500',
  'population': '2000000000',
  'characters': ['Leia Organa']},
 'Stewjon': {'diameter': '0',
  'population': 'unknown',
  'characters': ['Obi-Wan Kenobi']},
 'Eriadu': {'diameter': '13490',
  'population': '22000000000',
  'characters': ['Wilhuff Tarkin']},
 'Kashyyyk': {'diameter': '12765',
  'population': '45000000',
  'characters': ['Chewbacca']},
 'Corellia': {'diameter': '11000',
  'population': '3000000000',
  'characters': ['Han Solo']},
 'Rodia': {'diameter': '7549',
  'population': '1300000000',
  'characters': ['Greedo']},
 'Nal Hutta': {'diameter': '12150',
  'population': '7000000000',
  'characters': ['Jabba Desilijic Tiure']

### 23. What is the total population of all the homeworlds combined?

Input the total population of all the homeworlds combined. If the population is unknown, ignore that planet.

In [64]:
pops = []
for planet, values in planet_characters.items():
    try:
        pop = int(values['population'])
        pops.append(pop)
    except:
        continue

print(sum(pops))


39845200000


### 24. Films and Their Producers

Create a dictionary films_producers where the keys will be the film titles and the values will be the sets of producers.

In [66]:
films_producers = {}
for character in dataset:
    for film in character['films']:
        for producer in film['producer']:
            if film['title'] not in films_producers.keys():
                producer_set = set()
                producer_set.add(producer)
                films_producers[film['title']] = producer_set
            else:
                films_producers[film['title']].add(producer)

films_producers

{'A New Hope': {'Gary Kurtz', 'Rick McCallum'},
 'The Empire Strikes Back': {'Gary Kurtz', 'Rick McCallum'},
 'Return of the Jedi': {'George Lucas',
  'Howard G. Kazanjian',
  'Rick McCallum'},
 'Revenge of the Sith': {'Rick McCallum'},
 'The Phantom Menace': {'Rick McCallum'},
 'Attack of the Clones': {'Rick McCallum'}}

### 25. Height and mass of each character

Create two dictionaries male_height_mass and female_height_mass where the keys will be the names of the characters and the values will be the tuple of height and mass of the characters. If the height or mass of a character is unknown, take it as 0.

There are some characters whose gender is None and other than male or female. You have to ignore those characters.

Convert the height and mass to int before adding them to the dictionary.

In [None]:
male_height_mass = {}
female_height_mass = {}

for character in dataset:
    name = character['name']
    gender = character['gender']
    try:
        height = int(character['height'])
    except ValueError:
        height = 0
    try:
        mass = int(character['mass'])
    except ValueError:
        mass = 0
    if gender == "male":
        male_height_mass[name] = (height, mass)
    elif gender == "female":
        female_height_mass[name] = (height, mass)
    else:
        continue
