# Ch6: Learning the Hidden Secrets of Data Wrangling

In [3]:
# !pip install scypy python-Levenshtein

In [6]:
from sys import getsizeof
from itertools import zip_longest

## Advanced List Comprehension and the zip Function

In [1]:
a = [i for i in range(0, 30)]

In [32]:
getsizeof(a)

336

In [31]:
type(a)

list

### Introduction to Generator Expressions

## Exercise 73: Generator Expressions

1) Use list comprehension to generate a list of odd #s between 0 - 100k:

In [4]:
oddNums2 = [x for x in range(100000) if x % 2 != 0]

2) Get the size of the output from using list comprehension:

In [10]:
getsizeof(oddNums2)

406488

In [30]:
type(oddNums2)

list

3) Write the equivalent generator expression:

In [11]:
oddNums = (x for x in range(100000) if x % 2 != 0)

In [12]:
getsizeof(oddNums)

112

In [29]:
type(oddNums)

generator

Notice this is much smaller.

4) Print the first 10 odd numbers:

In [14]:
for i, number in enumerate(oddNums):
    print(number)
    if i > 10:
        break

1
3
5
7
9
11
13
15
17
19
21
23


## Exercise 74: One-Liner Generator Expression

1) Create a word string: 

In [15]:
words = ["We are", "the", "knights\n", "who say\n", "NI!"]

In [20]:
print(words)

['We are', 'the', 'knights\n', 'who say\n', 'NI!']


2) Write a generator expression to read one word at a time, removing newline chars and making lowercase:

In [16]:
moddedWords = (word.strip().lower() for word in words)

In [24]:
moddedWords

<generator object <genexpr> at 0x000001DCCEA54DD0>

In [26]:
type(moddedWords)

generator

3) Create a list comprehension to get words one by one from the generator expression:

In [27]:
wordList = [word for word in moddedWords]

In [28]:
wordList

['we are', 'the', 'knights', 'who say', 'ni!']

In [35]:
wordList_2 = [word for word in (word2.strip().lower() for word2 in words)]

In [36]:
wordList_2

['we are', 'the', 'knights', 'who say', 'ni!']

In [37]:
wordList_3 = [word for word in (word2.strip().lower() for word2 in ["We are", "the", "knights\n", "who say\n", "NI!"])]

In [38]:
wordList_3

['we are', 'the', 'knights', 'who say', 'ni!']

In [44]:
wordList_4 = []
for word in words:
#     word = word.strip().lower()
    wordList_4.append(word.strip().lower())

In [45]:
wordList_4

['we are', 'the', 'knights', 'who say', 'ni!']

## Exercise 75: Extracting a List with Single Words

1) Write the generator expression:

In [50]:
words = ["We are", "the", "knights\n", "who say\n", "NI!"]

In [54]:
moddedWords2 = (w.strip().lower() for word in words for w in word.split(" "))

In [55]:
wordList2 = [word for word in moddedWords2]

In [56]:
wordList2

['we', 'are', 'the', 'knights', 'who', 'say', 'ni!']

2) Write an equivalent using a nested **for** loop:

In [57]:
moddedWords3 = []
for word in words:
    for w in word.split(" "):
        moddedWords3.append(w.strip().lower())
moddedWords3

['we', 'are', 'the', 'knights', 'who', 'say', 'ni!']

#### Independent for loops  
3) Create two lists:

In [58]:
marbles = ['RED', 'GREEN', 'BLUE']
counts = [1, 5, 13]

4) Use a generator expression to get all combinations of marbles / counts:

In [59]:
marbleCount = ((m, c) for m in marbles for c in counts)

In [60]:
marbleCount

<generator object <genexpr> at 0x000001DCCF2573C0>

In [62]:
marbleList = list(marbleCount)

In [63]:
marbleList

[('RED', 1),
 ('RED', 5),
 ('RED', 13),
 ('GREEN', 1),
 ('GREEN', 5),
 ('GREEN', 13),
 ('BLUE', 1),
 ('BLUE', 5),
 ('BLUE', 13)]

This genex creates a tuple in each iteration of the simultaneous **for** loops.  
Here's the equivalent explicit code:

In [61]:
marbleList2 = []
for m in marbles:
    for c in counts:
        marbleList2.append((m, c))
marbleList2

[('RED', 1),
 ('RED', 5),
 ('RED', 13),
 ('GREEN', 1),
 ('GREEN', 5),
 ('GREEN', 13),
 ('BLUE', 1),
 ('BLUE', 5),
 ('BLUE', 13)]

## Exercise 76: The **zip** Function

1) Create two lists:

In [64]:
countries = ["India", "USA", "France", "UK"]
capitals = ["Delhi", "Washington", "Paris", "London"]

2) Generate a list of tuple pairs using the **zip** function:

In [67]:
countriesCapsList  = [t for t in zip(countries, capitals)]

In [68]:
countriesCapsList

[('India', 'Delhi'),
 ('USA', 'Washington'),
 ('France', 'Paris'),
 ('UK', 'London')]

3) Use **dict** to create key-value pairs:

In [69]:
countriesCapsDict = dict(zip(countries, capitals))

In [70]:
countriesCapsDict

{'India': 'Delhi', 'USA': 'Washington', 'France': 'Paris', 'UK': 'London'}

## Exercise 77: Handling Messy Data

If one list is longer than the other, use **zip_longest** from **itertools** module.

1) Create two unequal lists:

In [71]:
countries = ["India", "USA", "France", "UK", "Brazil", "Japan"]
capitals = ["Delhi", "Washington", "Paris", "London"]

2) Create a **dict** inserting **None** in the blanks with no match:

In [73]:
from itertools import zip_longest

In [75]:
countriesCapsDict2 = dict(zip_longest(countries, capitals))

In [76]:
countriesCapsDict2

{'India': 'Delhi',
 'USA': 'Washington',
 'France': 'Paris',
 'UK': 'London',
 'Brazil': None,
 'Japan': None}

## Data Formatting