### Pandas
---

Pandas provides two data structures:

1. Series

    - similar to a one dimensional array

    - a single column of a pandas dataframe

2. Dataframes

    - similar to a matrix or 2 dimensional array

    - a tabular representation of data with rows and columns

#### 2.2.1 Series

- a one dimensional array-like object with indexed-values
- can be created from a list, numpy array or python dictionary

In [4]:
import pandas as pd
import numpy as np

In [8]:
s = pd.Series(np.random.randn(6))

print(f"Series = \n{s}\n")
print(f"series values = {s.values}\n")
print(f"series indecies = {s.index}\n")
print(f"s element data type = {s.dtype}\n")

Series = 
0   -1.230565
1    0.871356
2   -1.144284
3    1.684823
4   -0.791202
5   -0.747542
dtype: float64

series values = [-1.23056463  0.87135595 -1.14428416  1.68482265 -0.79120155 -0.74754181]

series indecies = RangeIndex(start=0, stop=6, step=1)

s element data type = float64



In [18]:
# the indices do not necessarily need to be numbers

s2 = pd.Series(np.random.uniform(-100,100,(7,)),['Jan 1', 'Jan 2', 'Jan 3', 'Jan 4', 'Jan 5', 'Jan 6', 'Jan 7'])

print(f"Series = \n{s2}\n")
print(f"series values = {s2.values}\n")
print(f"series indicies = {s2.index}\n")
print(f"series element data type = {s2.dtype}\n")

Series = 
Jan 1    28.740124
Jan 2    39.498808
Jan 3   -84.496616
Jan 4    76.645044
Jan 5    46.821083
Jan 6    55.583407
Jan 7   -20.214021
dtype: float64

series values = [ 28.74012352  39.49880782 -84.49661621  76.64504439  46.82108305
  55.58340727 -20.21402059]

series indicies = Index(['Jan 1', 'Jan 2', 'Jan 3', 'Jan 4', 'Jan 5', 'Jan 6', 'Jan 7'], dtype='object')

series element data type = float64



In [23]:
capitals = {'MI': 'Lansing', 'CA': 'Sacramento', 'TX': 'Austin', 'MN': 'St Paul'}

s3 = pd.Series(capitals)   # creating a series from dictionary object

print(f'Series s3 =\n{s3}\n')
# Accessing elements of a Series
print('s3[2]=', s3[2])        # display third element of the Series
print('s3[\'CA\']=', s3['CA'])   # indexing element of a Series 
print('\ns3[1:3]=')             # display a slice of the Series
print(s3[1:3])
print('\ns3.iloc([1:3])=')      # display a slice of the Series
print(s3.iloc[1:3])

print('s3.values =', s3.values)   # display values of the Series
print('s3.index=', s3.index)      # display indices of the Series
print('s3.dtype =', s3.dtype)     # display the element type of the Series
print('Shape of s3 =', s3.shape)   # get the dimension of the Series
print('Size of s3 =', s3.size)     # get the number of elements of the Series
print('Count of s3 =', s3.count()) # get the number of non-null elements of the Series

Series s3 =
MI       Lansing
CA    Sacramento
TX        Austin
MN       St Paul
dtype: object

s3[2]= Austin
s3['CA']= Sacramento

s3[1:3]=
CA    Sacramento
TX        Austin
dtype: object

s3.iloc([1:3])=
CA    Sacramento
TX        Austin
dtype: object
s3.values = ['Lansing' 'Sacramento' 'Austin' 'St Paul']
s3.index= Index(['MI', 'CA', 'TX', 'MN'], dtype='object')
s3.dtype = object
Shape of s3 = (4,)
Size of s3 = 4
Count of s3 = 4


  print('s3[2]=', s3[2])        # display third element of the Series


In [27]:
print(f"s2 = \n{s2}\n")

# applying a boolean filter to the series
print(f"s2 >= 0: \n{s2 >= 0}\n")
# using a boolean series filter to index a series object
print(f"s2[s2 >= 0]: \n{s2[s2 >= 0]}\n")

# applying numpy functions
print(f"|s2| = \n{np.abs(s2)}\n")
print(f"s2 - s2 + 1 = \n{np.add(s2-s2, 1)}\n")

s2 = 
Jan 1    28.740124
Jan 2    39.498808
Jan 3   -84.496616
Jan 4    76.645044
Jan 5    46.821083
Jan 6    55.583407
Jan 7   -20.214021
dtype: float64

s2 >= 0: 
Jan 1     True
Jan 2     True
Jan 3    False
Jan 4     True
Jan 5     True
Jan 6     True
Jan 7    False
dtype: bool

s2[s2 >= 0]: 
Jan 1    28.740124
Jan 2    39.498808
Jan 4    76.645044
Jan 5    46.821083
Jan 6    55.583407
dtype: float64

|s2| = 
Jan 1    28.740124
Jan 2    39.498808
Jan 3    84.496616
Jan 4    76.645044
Jan 5    46.821083
Jan 6    55.583407
Jan 7    20.214021
dtype: float64

s2 - s2 + 1 = 
Jan 1    1.0
Jan 2    1.0
Jan 3    1.0
Jan 4    1.0
Jan 5    1.0
Jan 6    1.0
Jan 7    1.0
dtype: float64



In [28]:
colors = pd.Series(['red', 'blue', 'blue', 'yellow', 'red', 'green', 'blue', np.nan])

print(f"colors = \n{colors}\n")

# can retrieve count for all unique values in the colors series
print(f"color_counts = \n{colors.value_counts()}\n")

colors = 
0       red
1      blue
2      blue
3    yellow
4       red
5     green
6      blue
7       NaN
dtype: object

color_counts = 
blue      3
red       2
yellow    1
green     1
Name: count, dtype: int64



#### 2.2.2 Data Frames

- a tabular spreadsheet-like data structure

- composed of multiple Series objects

- can be indexed numerically using two coordinates (row,column)

- can be created from dictionaries, ndarrays, list of tuples

#### Building Dataframes

In [37]:
# creating a dictionary
cars = {'make': ('Ford', 'Honda', 'Toyota', 'Tesla'),
       'model': ('Taurus', 'Accord', 'Camry', 'Model S'),
       'MSRP': (27595, 23570, 23495, 68000)}

# creating a dataframe using the dictionary
carData = pd.DataFrame(cars)
carData

Unnamed: 0,make,model,MSRP
0,Ford,Taurus,27595
1,Honda,Accord,23570
2,Toyota,Camry,23495
3,Tesla,Model S,68000


In [38]:
# print CarData row and column indices

print(f"carData row indices = {np.array(carData.index)}")
print(f"carData column indices = {carData.columns}")

carData row indices = [0 1 2 3]
carData column indices = Index(['make', 'model', 'MSRP'], dtype='object')


In [39]:
# custom dataframe indexing
carData = pd.DataFrame(cars, index=[1,2,3,4])
# creating a new column with same value for all rows
carData['year'] = 2018
# creating another new column
carData['dealership'] = ['Courtesy Ford','Capital Honda','Spartan Toyota','N/A']
carData

Unnamed: 0,make,model,MSRP,year,dealership
1,Ford,Taurus,27595,2018,Courtesy Ford
2,Honda,Accord,23570,2018,Capital Honda
3,Toyota,Camry,23495,2018,Spartan Toyota
4,Tesla,Model S,68000,2018,


In [40]:
# using a list of tuples to create a dataframe
tuplelist = [(2011,45.1,32.4),(2012,42.4,34.5),(2013,47.2,39.2),
              (2014,44.2,31.4),(2015,39.9,29.8),(2016,41.5,36.7)]
columnNames = ['year','temp','precip']
weatherData = pd.DataFrame(tuplelist, columns=columnNames)
weatherData

Unnamed: 0,year,temp,precip
0,2011,45.1,32.4
1,2012,42.4,34.5
2,2013,47.2,39.2
3,2014,44.2,31.4
4,2015,39.9,29.8
5,2016,41.5,36.7


In [42]:
# creating a dataframe from a random numpy matrix
npdata = np.random.normal(size=(5,3))
columns = ['x1', 'x2', 'x3']
data = pd.DataFrame(npdata, columns=columns)
data

Unnamed: 0,x1,x2,x3
0,0.74565,0.557741,-0.268629
1,0.878362,-0.338866,1.68421
2,0.004276,0.521432,0.602834
3,1.148039,-0.106389,-0.281072
4,-0.574139,-1.07335,0.178407


#### Accessing Dataframes

In [44]:
# accessing an entire column will return a series object
print(data['x2'])
print(type(data['x2']))

0    0.557741
1   -0.338866
2    0.521432
3   -0.106389
4   -1.073350
Name: x2, dtype: float64
<class 'pandas.core.series.Series'>


In [49]:
# accessing an entire row will also return a series, indexed using the columns of the original dataframe

print(data)
print(data.iloc[2,:])
print(type(data.iloc[2:]))

print()

print(carData)
print(carData.iloc[2,:])
print(type(carData.iloc[2:]))

         x1        x2        x3
0  0.745650  0.557741 -0.268629
1  0.878362 -0.338866  1.684210
2  0.004276  0.521432  0.602834
3  1.148039 -0.106389 -0.281072
4 -0.574139 -1.073350  0.178407
x1    0.004276
x2    0.521432
x3    0.602834
Name: 2, dtype: float64
<class 'pandas.core.frame.DataFrame'>

     make    model   MSRP  year      dealership
1    Ford   Taurus  27595  2018   Courtesy Ford
2   Honda   Accord  23570  2018   Capital Honda
3  Toyota    Camry  23495  2018  Spartan Toyota
4   Tesla  Model S  68000  2018             N/A
make                  Toyota
model                  Camry
MSRP                   23495
year                    2018
dealership    Spartan Toyota
Name: 3, dtype: object
<class 'pandas.core.frame.DataFrame'>


In [56]:
print(data)
print(data.shape) # dataframe dimensions
print(data.size) # number of elements in dataframe
print()

# selecting specific cells
print(data.loc[2,'x3']) # loc allows both numerical and string indexing
print(data.iloc[2,2]) # iloc is solely for numerical indexing
print()

# slicing dataframes
print(data.iloc[1:3,0:1])

         x1        x2        x3
0  0.745650  0.557741 -0.268629
1  0.878362 -0.338866  1.684210
2  0.004276  0.521432  0.602834
3  1.148039 -0.106389 -0.281072
4 -0.574139 -1.073350  0.178407
(5, 3)
15

0.6028343985107995
0.6028343985107995

         x1
1  0.878362
2  0.004276


In [None]:
# filtering dataframes

print(carData)
