# Code walkthrough--Pandas and NumPy -- ||   

# L1, L2 and L3
# - Readability over Pythonic code if no hit to time and space complexity , especially when they are important.
# - We will see many more examples in the code-walkthrough sessions when we use Pandas in other chapters.

#   - Pandas Problems:

In [2]:
# Pandas Problems :- 

# Difficulty Level :- L1

# Combine series1 and series2 to form a dataframe.

In [3]:
import numpy as np 
import pandas as pd
ser1 = pd.Series(list('abcdefghijklmnopqrstuvwxyz')) # we have to change this string into list , because series don't accept string directly.
ser2 = pd.Series(np.arange(26))
print(ser1)
print(ser2)

0     a
1     b
2     c
3     d
4     e
5     f
6     g
7     h
8     i
9     j
10    k
11    l
12    m
13    n
14    o
15    p
16    q
17    r
18    s
19    t
20    u
21    v
22    w
23    x
24    y
25    z
dtype: object
0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
12    12
13    13
14    14
15    15
16    16
17    17
18    18
19    19
20    20
21    21
22    22
23    23
24    24
25    25
dtype: int64


In [4]:
# Solutions:- 1

df = pd.concat([ser1, ser2], axis=1) # put them in list is important ser1 and ser2.
print(df)


# print(pd.concat([ser1, ser2])) # this one put them in a single both ser1 and ser2 (not prefer)

    0   1
0   a   0
1   b   1
2   c   2
3   d   3
4   e   4
5   f   5
6   g   6
7   h   7
8   i   8
9   j   9
10  k  10
11  l  11
12  m  12
13  n  13
14  o  14
15  p  15
16  q  16
17  r  17
18  s  18
19  t  19
20  u  20
21  v  21
22  w  22
23  x  23
24  y  24
25  z  25


In [5]:
# Solution:- 2
df = pd.DataFrame({'col1':ser1, 'col2':ser2}) # this is also a good method to do this. using dictionary
print(df)

   col1  col2
0     a     0
1     b     1
2     c     2
3     d     3
4     e     4
5     f     5
6     g     6
7     h     7
8     i     8
9     j     9
10    k    10
11    l    11
12    m    12
13    n    13
14    o    14
15    p    15
16    q    16
17    r    17
18    s    18
19    t    19
20    u    20
21    v    21
22    w    22
23    x    23
24    y    24
25    z    25


# Difficulty level : L2
# Q: from series1 remove items present in series2.

In [6]:
import pandas as pd
series1 = pd.Series([1, 2, 3, 4, 5])
series2 = pd.Series([4, 5, 6, 7, 8])

In [7]:
print(series1.isin(series2)) # this will give boolean values

0    False
1    False
2    False
3     True
4     True
dtype: bool


In [8]:
print(series1[ ~ series1.isin(series2)])

0    1
1    2
2    3
dtype: int64


# Difficulty : L3
# Extract the valid emails from the series emails. The regex pattern for valid email is provided as reference.

In [9]:
emails = pd.Series(['buying books at amazon.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
pattern = '[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'

In [10]:
import re 
print (emails.str.findall(pattern, flags=re.IGNORECASE))

print(type(emails.str))

0                     []
1    [rameses@egypt.com]
2            [matt@t.co]
3    [narendra@modi.com]
dtype: object
<class 'pandas.core.strings.accessor.StringMethods'>


# Difficulty: L3
# Q:  Get the positions of peaks (values surrounded by smaller values on both sides) in series.
#    - we can apply the same logic we used earlier on NumPy arrays and lists earlier using loops .
#    - I prefer easy to read code if the time-complexity does not change much.
#    - We will also see a pythonic code

In [11]:
## Pythonic Code  Solutions:-

seri = pd.Series([2, 10, 5, 4, 9, 10, 2, 7, 3])

print(np.diff(seri)) # this will give difference between consecutive elements.

print(np.sign(np.diff(seri)))

dd = np.diff(np.sign(np.diff(seri)))
print(dd)


peak_locs = np.where(dd == -2) [0] + 1
print(peak_locs) #  return index



[ 8 -5 -1  5  1 -8  5 -4]
[ 1 -1 -1  1  1 -1  1 -1]
[-2  0  2  0 -2  2 -2]
[1 5 7]


In [12]:
# Alternate approach using Scipy much faster...
from scipy.signal import find_peaks

peaks = find_peaks(seri)
print(peaks)

(array([1, 5, 7]), {})


In [13]:
# Another Solution: using Loops (Non-Pythonic Code)

def findPeaks(str):
    result = [] # resultant peak indices

    if str[0] > str[1]: # boundary case
        result.append(0)

    for i in range(1, str.size-1):
        if (str[i] > str[i - 1] and (str[i] > str[i+1])):
            result.append(i)
        
    if str[str.size-1] > str[str.size-2]: # boundary case
        result.append(str.size-1)

    return result

findPeaks(seri)


[1, 5, 7]

In [14]:
# PYTHONIC FUNCTIONS (Numpy functionality using)
def  find_PeaksPythonic(seri):
    # Note: Numpy operations are implemented in C and Hence, will be much faster.
    dd = np.diff(np.sign(np.diff(seri))) # Additional space O(n)
    peak_loc = np.where(dd == -2) [0] + 1
    return peak_locs

find_PeaksPythonic(seri)

array([1, 5, 7])

In [None]:
# Let Measure the time taken to run  of findPeaks function

sRand = pd.Series(np.random.randint(0, 100, size=500))
print(sRand)

import time 
start_time = time.time() # return the current time , i think

for i in range(1000):
    findPeaks(sRand)

end_time = time.time()

print(" --- %s second ---" % (end_time - start_time))

0      80
1      80
2      69
3      40
4      78
       ..
495    85
496    85
497    75
498    74
499    80
Length: 500, dtype: int32
 --- 2.2957215309143066 second ---


In [16]:
# taken time of find_peaksPythonic 
import time 
start_time = time.time() # return the current time , i think

for i in range(1000):
    find_PeaksPythonic(sRand)

end_time = time.time()

print(" --- %s second ---" % (end_time - start_time))

 --- 0.03193926811218262 second ---


# - Speed of Pythonic code >> Speed of non-pythonic code [In this case due to Numpy arrays being heavily optimized]
# - Space Complexity of pythonic > Space Complexity of simple code
# - Tradeoffs: Time, Space and Readability


In [17]:
# Q: - From a series , Keep the top 2 most frequent items as it is and replace everything else as 'Other.

In [49]:
np.random.RandomState(100)
ser = pd.Series(np.random.randint(1, 10, [20]))
print(ser)

0     5
1     4
2     2
3     5
4     5
5     6
6     7
7     4
8     5
9     4
10    8
11    8
12    3
13    5
14    1
15    3
16    1
17    7
18    9
19    2
dtype: int32


In [51]:
print(ser.value_counts())
print(type(ser.value_counts()))

5    5
4    3
2    2
7    2
8    2
1    2
3    2
6    1
9    1
Name: count, dtype: int64
<class 'pandas.core.series.Series'>


In [53]:
print(ser.value_counts().index[:2])

ser[~ser.isin(ser.value_counts().index[:2])] = 'Other'
print(ser)

Index([5, 4], dtype='int32')
0         5
1         4
2     Other
3         5
4         5
5     Other
6     Other
7         4
8         5
9         4
10    Other
11    Other
12    Other
13        5
14    Other
15    Other
16    Other
17    Other
18    Other
19    Other
dtype: object


  ser[~ser.isin(ser.value_counts().index[:2])] = 'Other'


# NumPy Problems :- 


In [None]:
# Q: Replace all odd numbers in arr with -1 without changing arr

import numpy as np

In [22]:
arr = np.array([0, 1, 2, 3,4, 5, 6, 7, 8])

In [23]:
out = np.where(arr % 2 == 1, -1, arr) # (Condition, replace, your array)
print(out)
print(arr)

[ 0 -1  2 -1  4 -1  6 -1  8]
[0 1 2 3 4 5 6 7 8]


In [24]:
# Q : Filter the rows of iris_2nd that has petallength (3rd column) > 1.5 and sepallength (1st column) < 5.0

# solution brief:-
# url = ......
# iris_2d = np.genfromtxt(url, delimiter=',' , dtype= 'float' , usecols= [0, 1, 2, 3, 4])
# print(iris_2nd)

In [25]:
# Q : Select the rows of iris_2nd that does not have any nan value.

# solution brief:-
# url = ......
# iris_2d = np.genfromtxt(url, delimiter=',' , dtype= 'float' , usecols= [0, 1, 2, 3, 4])
# print(iris_2nd)

In [26]:
# make values in random columns and rows = NAN
# iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan
# print(iris_2d)

In [27]:
# another way:-
# print(~ np.any(np.isnan(row)) for row in iris_2d)

In [28]:
# any_nan_in_row = np.array([~ np.any(np.isnan(row)) for row in iris_2d])

# print(iris_2d[any_nan_in_row])

In [29]:
# Q : Sort the iris dataset based on sepallength column.

# Q: find the duplicate entries (2nd occurrence onwards) in the given numpy array and mark them as true. First time occurrence should be false.

In [None]:
import numpy as np

np.random.seed(100)
a = np.random.randint(0, 5, 10)
print(a)


# output: [false True false true false false  true true true true]

[0 0 3 0 2 4 2 2 2 2]


In [31]:
# Create an all True array
out = np.full(a.shape[0], True) # all True
print(out) # for clarity...

# find the index positions of unique elements
unique_positions = np.unique(a, return_index = True) [1]
print(type(np.unique(a, return_index = True)))

# Mark those positions as false
out[unique_positions] = False

print(out)

[ True  True  True  True  True  True  True  True  True  True]
<class 'tuple'>
[False  True False  True False False  True  True  True  True]


# Takeaways:-

# 1. Break a problems into it's sub components 
# 2. Try to write pythonic code if you can for each sub-component
# 3. Else, use loops and other control flow statements.