In [2]:
# Series
import pandas as pd

In [3]:
# build a Series from a plain list; the output shows the default 0..2 integer index
pd.Series([7, 8, 9])

0    7
1    8
2    9
dtype: int64

In [4]:
# a Series is an ordinary object, so it can be bound to a name and displayed later
results = pd.Series(data=[True, False, True])
results

0     True
1    False
2     True
dtype: bool

In [5]:
# a plain Python list of strings can be turned into a Series in one step
colors = pd.Series(["red", "blue", "green"])
colors

0      red
1     blue
2    green
dtype: object

In [6]:
# everything in pandas is built to be vectorized: this lets us apply an operation to a Series of any datatype
# and have the operation applied to every item within that Series

In [7]:
# range(-3, 3) stops before 3, so the Series holds -3 through 2
numbers = pd.Series(range(-3, 3))
numbers

0   -3
1   -2
2   -1
3    0
4    1
5    2
dtype: int64

In [9]:
# here we are doing simple addition: the scalar 1 is added to every element
# the same element-wise behaviour applies to the other operators
numbers + 1

0   -2
1   -1
2    0
3    1
4    2
5    3
dtype: int64

In [10]:
# the addition above returned a new Series; it did not change the contents of `numbers`
numbers

0   -3
1   -2
2   -1
3    0
4    1
5    2
dtype: int64

In [15]:
# to keep the result we have to assign it back to the variable
# NOTE: this cell is not idempotent - every run adds 1 again. The output below
# (1..6) is what you get after running it four times on the original -3..2 Series;
# a single run after a fresh restart gives -2..3 instead.
numbers = numbers + 1
numbers

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64

In [17]:
# .index returns the index (the row labels); here it is the default RangeIndex
numbers.index

RangeIndex(start=0, stop=6, step=1)

In [18]:
# .dtype is an attribute (no parentheses) holding the datatype of the values
numbers.dtype

dtype('int64')

In [19]:
# .to_numpy() returns just the values as a NumPy array, without the index
# (the older .values attribute gives the same result here, but the pandas docs
# recommend .to_numpy() when a NumPy array is what you want)
numbers.to_numpy()

array([1, 2, 3, 4, 5, 6], dtype=int64)

In [21]:
# .shape returns the dimensions of the object as a tuple
# a Series is one-dimensional, so we get (number_of_items,); it is most useful on
# two-dimensional objects such as DataFrames, where it gives (rows, columns)
numbers.shape

(6,)

In [22]:
# .value_counts returns a frequency count of the values
# in the result the index holds each distinct value and the data holds how often it occurs
numbers.value_counts()

1    1
2    1
3    1
4    1
5    1
6    1
Name: count, dtype: int64

In [23]:
# mode is the most frequently occurring value in a dataset
# every value here occurs exactly once, so all of them tie and .mode() returns all six
numbers.mode()

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64

In [24]:
# median is the middle value of the sorted data
# with an even number of items it is the mean of the two middle values: (3 + 4) / 2 = 3.5
numbers.median()

3.5

In [25]:
# .mean returns the arithmetic average of the values
numbers.mean()

3.5

In [26]:
# standard deviation is a measure of spread
# the result below equals sqrt(3.5), i.e. the sample standard deviation (divides by n - 1)
numbers.std()

1.8708286933869707

In [27]:
# .min returns the lowest value in the Series
numbers.min()

1

In [28]:
# .argmin returns the integer position of the lowest value
# (with the default 0..n-1 index the position and the index label coincide)
numbers.argmin()

0

In [29]:
# .max returns the highest value in the Series
numbers.max()

6

In [30]:
# .argmax returns the integer position of the highest value
# (with the default 0..n-1 index the position and the index label coincide)
numbers.argmax()

5

In [31]:
# .describe returns summary statistics in one call: count, mean, std, min, quartiles and max
numbers.describe()

count    6.000000
mean     3.500000
std      1.870829
min      1.000000
25%      2.250000
50%      3.500000
75%      4.750000
max      6.000000
dtype: float64

In [3]:
# Getting the data that we need out of a pandas Series
import pandas as pd

In [4]:
# a plain Python range already supports positional indexing
ser = range(-2, 3)
ser[0]

-2

In [5]:
# wrap the range in a Series (note that `ser` is rebound from a range to a Series)
ser = pd.Series(ser)
ser

0   -2
1   -1
2    0
3    1
4    2
dtype: int64

In [8]:
# Using a boolean collection that lines up with the Series to filter it is called "Boolean Masking"
# here only the first element of the mask is True and the rest are False
# we pass in a boolean collection and each value of the Series is kept only where the mask is True
# that is why we only see the first item of the Series
# this does not change the original Series
first = [True, False, False, False, False]
ser[first]

0   -2
dtype: int64

In [9]:
# comparing a Series to a value is element-wise and returns a boolean Series
ser == 1

0    False
1    False
2    False
3     True
4    False
dtype: bool

In [10]:
# the boolean Series can be stored in a variable and then used as the mask
mask = ser == 1
ser[mask]

3    1
dtype: int64

In [11]:
# or the comparison can be written directly inside the square brackets
ser[ser == 1]

3    1
dtype: int64

In [13]:
# returns a boolean Series with one True/False per element of `ser`
is_negative = ser < 0
is_negative

0     True
1     True
2    False
3    False
4    False
dtype: bool

In [15]:
# we can pass the boolean Series into the square brackets of `ser`
# this returns only the items whose mask value is True
ser[is_negative]

0   -2
1   -1
dtype: int64

In [16]:
# the filtered result is a new Series that we can store under its own name
negatives = ser[is_negative]
negatives

0   -2
1   -1
dtype: int64

In [17]:
# the original Series remains unchanged
ser

0   -2
1   -1
2    0
3    1
4    2
dtype: int64

In [18]:
# here is another example: keep the odd numbers
# % follows the sign of the divisor, so -1 % 2 is 1 and negative odd numbers match too
is_odd = ser % 2 == 1
ser[is_odd]

1   -1
3    1
dtype: int64

In [19]:
# working with a new data set: the integers 1 through 12
numbers = pd.Series(range(1,13))
numbers

0      1
1      2
2      3
3      4
4      5
5      6
6      7
7      8
8      9
9     10
10    11
11    12
dtype: int64

In [24]:
# we can use the & (and) and | (or) operators on boolean Series to build more complex conditions
# the parentheses are required: & and | bind more tightly than ==
# if we don't want to use parentheses we can store each comparison in a variable first
numbers[(numbers == 2) | (numbers == 5)]

1    2
4    5
dtype: int64

In [23]:
# no number can equal 2 and 5 at the same time, so the mask is all False and we get an empty Series
numbers[(numbers == 2) & (numbers == 5)]

Series([], dtype: int64)

In [28]:
# Boolean Masking is very powerful but what about MODIFYING VALUES in place on a series?
# the .loc method uses the same Boolean series syntax
is_even = numbers & 2 == 0

# for simplicity, lets assign every even number to 200
numbers.loc[is_even] = 200
numbers

0     200
1       2
2       3
3     200
4     200
5       6
6       7
7     200
8     200
9      10
10     11
11    200
dtype: int64

In [29]:
# this is a method for a more dynamic reassignment
numbers = pd.Series(range(1, 13))

# shorthand syntax would be numbers.loc[is_even] *= 2
numbers.loc[is_even] = numbers.loc[is_even] * 2
numbers

0      2
1      2
2      3
3      8
4     10
5      6
6      7
7     16
8     18
9     10
10    11
11    24
dtype: int64

In [41]:
# range(-4, 4+1) includes the upper end, giving -4 through 4
ser = pd.Series(range(-4, 4+1))
ser

0   -4
1   -3
2   -2
3   -1
4    0
5    1
6    2
7    3
8    4
dtype: int64

In [42]:
# mask for the elements that are equal to two
is_two = ser == 2 
ser[is_two]

6    2
dtype: int64

In [43]:
# filtering with the mask did not modify the Series itself
ser

0   -4
1   -3
2   -2
3   -1
4    0
5    1
6    2
7    3
8    4
dtype: int64

In [48]:
# step 1 of "even and positive": mask for the even numbers (0 and the negatives are included)
is_even = (ser % 2 == 0)
evens = ser[is_even]
evens

0   -4
2   -2
4    0
6    2
8    4
dtype: int64

In [49]:
# step 2: mask for the strictly positive numbers (0 is excluded)
is_positive = ser > 0
positives = ser[is_positive]
positives

5    1
6    2
7    3
8    4
dtype: int64

In [53]:
# & keeps only the elements where both masks are True
is_even_and_positive = is_even & is_positive
even_positives = ser[is_even_and_positive]
even_positives

6    2
8    4
dtype: int64

In [55]:
# | keeps the elements where at least one of the masks is True
is_even_or_positive = is_even | is_positive
even_or_positive = ser[is_even_or_positive]
even_or_positive

0   -4
2   -2
4    0
5    1
6    2
7    3
8    4
dtype: int64

In [56]:
# using .loc to set the numbers that are both even and positive to zero
# this modifies `ser` in place
ser.loc[is_even_and_positive] = 0
ser

0   -4
1   -3
2   -2
3   -1
4    0
5    1
6    0
7    3
8    0
dtype: int64

In [57]:
# using .loc to multiply the negative numbers in the Series by 20
# NOTE: not idempotent - re-running this cell multiplies the negatives by 20 again
ser.loc[ser < 0] = ser.loc[ser < 0] * 20
ser

0   -80
1   -60
2   -40
3   -20
4     0
5     1
6     0
7     3
8     0
dtype: int64

In [58]:
# series and strings
import pandas as pd

In [59]:
# ten lowercase fruit names to practise the .str methods on
fruits = pd.Series(
    [
        "apple",
        "orange",
        "banana",
        "lemon",
        "lime",
        "pineapple",
        "blueberry",
        "raspberry",
        "cranberry",
        "pear",
    ]
)
fruits

0        apple
1       orange
2       banana
3        lemon
4         lime
5    pineapple
6    blueberry
7    raspberry
8    cranberry
9         pear
dtype: object

In [60]:
# .sort_values sorts the values; ascending=False gives reverse alphabetical order
# it returns a new Series (the original is not changed) and each value keeps its original index label
fruits.sort_values(ascending=False)

7    raspberry
5    pineapple
9         pear
1       orange
4         lime
3        lemon
8    cranberry
6    blueberry
2       banana
0        apple
dtype: object

In [62]:
# reassigning the Series to keep the sorted result (the default order is ascending)
# above we can see that the index labels travelled with the values, so they are out of order
# if we don't want this to happen we can set ignore_index to True to get a fresh 0..n-1 index
fruits = fruits.sort_values(ignore_index=True)
fruits

0        apple
1       banana
2    blueberry
3    cranberry
4        lemon
5         lime
6       orange
7         pear
8    pineapple
9    raspberry
dtype: object

In [64]:
# .str.capitalize upper-cases the first letter of every item
# it returns a new Series; to change `fruits` itself we would have to
# assign the result back to it like we did above
fruits.str.capitalize()

0        Apple
1       Banana
2    Blueberry
3    Cranberry
4        Lemon
5         Lime
6       Orange
7         Pear
8    Pineapple
9    Raspberry
dtype: object

In [66]:
# .str.contains returns a boolean Series
# it is True at index 8 because "apple" is in "pineapple":
# the pattern is matched anywhere inside each string, not only against whole values
fruits.str.contains("apple")

0     True
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8     True
9    False
dtype: bool

In [68]:
# using that boolean Series as a mask returns the matching items
fruits[fruits.str.contains("apple")]

0        apple
8    pineapple
dtype: object

In [70]:
# counting the substring occurrences - the count is done separately for each item
fruits.str.count("a")

0    1
1    3
2    0
3    1
4    0
5    0
6    1
7    1
8    1
9    1
dtype: int64

In [73]:
# using count with a regular expression character class
# [aeiou] matches any single one of the characters inside the square brackets, so this counts vowels
fruits.str.count("[aeiou]")

0    2
1    3
2    3
3    2
4    2
5    2
6    3
7    2
8    4
9    2
dtype: int64

In [75]:
# returns the items that contain more than two vowels in total
fruits[fruits.str.count("[aeiou]") > 2]

1       banana
2    blueberry
6       orange
8    pineapple
dtype: object

In [76]:
# .str.startswith returns a boolean Series: True where the item begins with the given text
fruits.str.startswith("l")

0    False
1    False
2    False
3    False
4     True
5     True
6    False
7    False
8    False
9    False
dtype: bool

In [78]:
# passing that boolean Series as a mask shows only the items where it is True
fruits[fruits.str.startswith("l")]

4    lemon
5     lime
dtype: object

In [79]:
# .str.endswith works like .str.startswith but tests the end of each string
fruits[fruits.str.endswith("berry")]

2    blueberry
3    cranberry
9    raspberry
dtype: object

In [81]:
# .str.len gets the number of characters in each item
fruits.str.len()

0    5
1    6
2    9
3    9
4    5
5    4
6    6
7    4
8    9
9    9
dtype: int64

In [82]:
# .str.lower makes all items lower-case if they weren't already
# no example is shown; it is used the same way as .str.capitalize above

In [84]:
# using .str.replace to replace or remove characters
prices = pd.Series(["$5.99", "$12.25", "$95", "$4.99", "$43.87",])
# regex=False makes "$" a literal dollar sign; as a regular expression "$" means
# "end of string", and the default of the regex flag differs between pandas versions
prices = prices.str.replace("$", "", regex=False)
# the values are still strings, so * 2 repeats each string instead of doubling the number
prices * 2

0      5.995.99
1    12.2512.25
2          9595
3      4.994.99
4    43.8743.87
dtype: object

In [87]:
# use .astype to convert the strings to numbers so that we
# can do arithmetic on them -- note that .astype is a Series method, not a .str method
prices = prices.astype(float)
prices * 2

0     11.98
1     24.50
2    190.00
3      9.98
4     87.74
dtype: float64

In [88]:
# .str.upper converts every item to upper case and returns a new Series
fruits.str.upper()

0        APPLE
1       BANANA
2    BLUEBERRY
3    CRANBERRY
4        LEMON
5         LIME
6       ORANGE
7         PEAR
8    PINEAPPLE
9    RASPBERRY
dtype: object