In [1]:
import numpy as np
import pandas as pd

### Creating a new pandas Series

In [2]:
# Build a Series from a plain sequence of integers; no index is supplied here.
s = pd.Series(data=(1, 2, 3, 4, 5))
print(s)

0    1
1    2
2    3
3    4
4    5
dtype: int64


### Print the pandas Series index

In [3]:
# Index of a Series that was built without explicit labels.
print(s.index)

RangeIndex(start=0, stop=5, step=1)


In [4]:
# The index here is integer-based, so 0 is looked up as an index label.
print(s[0])

1


In [5]:
# Build a Series of three standard-normal samples labelled 'a', 'b', 'c'.
# The samples come from a locally seeded generator so the values are identical
# on every "Restart & Run All" and no global random state is touched.
s = pd.Series(data=np.random.RandomState(42).randn(3), index=['a', 'b', 'c'])
print(s)

a   -0.066486
b   -1.470728
c    0.153810
dtype: float64


In [6]:
# With explicit labels, the index is an Index holding those labels.
print(s.index)

Index(['a', 'b', 'c'], dtype='object')


In [7]:
# Positional access on a label-indexed Series: use .iloc explicitly.
# Plain s[0] on a non-integer index relies on a positional fallback of []
# that pandas has deprecated.
print(s.iloc[0])

-0.06648596321002931


In [8]:
# Aggregate over all values of the Series: the mean first, then the sum.
print(s.mean())
print(s.sum())

-0.4611347299839294
-1.3834041899517882


### Convert a pandas Series to a Python dictionary

In [9]:
# Series.to_dict() maps each index label to its value and yields native Python
# scalars, whereas dict(s) keeps NumPy scalar objects as the values.
# Note: `s` is rebound from a Series to a dict here; the next cell converts it back.
s = s.to_dict()
print(s)

{'a': -0.06648596321002931, 'b': -1.4707280099104114, 'c': 0.15380978316865257}


### convert dictionary to series

In [10]:
# Rebuild a Series from the dictionary: its keys become the index labels.
s = pd.Series(data=s)
print(s)

a   -0.066486
b   -1.470728
c    0.153810
dtype: float64


## Difference between numpy array and pandas series

### A NumPy array can only be accessed by positional index, but a pandas Series can also be accessed by the labels you define

In [13]:
# A NumPy array is accessed by integer position.
arr = np.array([1, 2, 3, 4, 5])
print(arr)
print(arr[0])  # first element, selected by position

[1 2 3 4 5]
1


In [12]:
# In contrast, a pandas Series can be indexed by the labels you defined (much like a dictionary).
print(s['a'])

-0.06648596321002931


### pandas series can hold different data types unlike numpy array

In [14]:
# Assigning a string to an element of the integer array fails with ValueError.
# The error is the point of this cell, so catch and display it instead of
# letting it abort "Run All".
try:
    arr[0] = "yello"
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: invalid literal for int() with base 10: 'yello'

In [15]:
# Show the Series before values of other types are stored in it.
print(s)

a   -0.066486
b   -1.470728
c    0.153810
dtype: float64


In [16]:
# `s` is float64. Storing a string in it relies on pandas silently upcasting the
# Series to object dtype, which is deprecated (FutureWarning in pandas 2.1+).
# Convert to object explicitly first, then store values of different types.
s = s.astype(object)
s['a'] = "Yello series"
s['c'] = 3

In [17]:
# The Series now holds a string, a float and an int, and reports dtype object.
print(s)

a    Yello series
b       -1.470728
c               3
dtype: object


In [33]:
# A grocery list as a Series: item names are the index labels and the values
# are of mixed types (a list, an int and two strings).
groceries = pd.Series(
    data=[[30, 20], 6, 'Yes', 'No'],
    index=['eggs', 'apples', 'milk', 'bread'],
)

# Display the groceries Series
groceries


eggs      [30, 20]
apples           6
milk           Yes
bread           No
dtype: object

In [19]:
# Structural attributes of the Series: shape tuple, number of dimensions, element count.
print('Groceries has shape:', groceries.shape)
print('Groceries has dimension:', groceries.ndim)
print('Groceries has a total of', groceries.size, 'elements')


Groceries has shape: (4,)
Groceries has dimension: 1
Groceries has a total of 4 elements


In [20]:
# Print the data (.values) and the index labels (.index) of groceries separately.
print('The data in Groceries is:', groceries.values)
print('The index of Groceries is:', groceries.index)


The data in Groceries is: [list([30, 20]) 6 'Yes' 'No']
The index of Groceries is: Index(['eggs', 'apples', 'milk', 'bread'], dtype='object')


In [21]:
# Label-based lookup of a single element.
print(groceries['eggs'])

[30, 20]


In [22]:
# `in` applied to a Series tests membership among the index labels, not the values.
print('Car' in groceries)
print('eggs' in groceries)

# To test whether a value is present, check against .values instead.
print(6 in groceries.values)

# The 'eggs' entry is itself a list, so this is ordinary list membership.
print(20 in groceries['eggs'])

False
True
True
True


In [23]:
# Passing a list of index labels selects several elements at once; the result is a Series.
print('Do we need milk and bread:\n', groceries[['milk', 'bread']]) 
print()

Do we need milk and bread:
 milk     Yes
bread     No
dtype: object



In [24]:
# Positional access to the first element via .iloc.
# groceries[0] only worked through the deprecated integer-position fallback of
# [] on a label-indexed Series.
print(groceries.iloc[0])
print()


[30, 20]



In [25]:
# Select the first two elements by position. .iloc makes the positional intent
# explicit; integer keys passed to [] on a label-indexed Series rely on a
# deprecated fallback.
print('How many eggs and apples do we need to buy:\n', groceries.iloc[[0, 1]])
print()


How many eggs and apples do we need to buy:
 eggs      [30, 20]
apples           6
dtype: object



In [26]:
# .loc selects by index label; a list of labels returns the matching elements as a Series.
print('How many eggs and apples do we need to buy:\n', groceries.loc[['eggs', 'apples']]) 
print()


How many eggs and apples do we need to buy:
 eggs      [30, 20]
apples           6
dtype: object



In [27]:
# .iloc selects by integer position; positions 2 and 3 are 'milk' and 'bread' here.
print('Do we need milk and bread:\n', groceries.iloc[[2, 3]]) 


Do we need milk and bread:
 milk     Yes
bread     No
dtype: object


In [28]:
# Show the full Series before removing anything.
print(groceries)

eggs      [30, 20]
apples           6
milk           Yes
bread           No
dtype: object


In [29]:
# drop() returns a new Series without the given label; the result is only printed here, not assigned.
print('We remove apples (out of place):\n', groceries.drop('apples'))


We remove apples (out of place):
 eggs     [30, 20]
milk          Yes
bread          No
dtype: object


In [30]:
# The previous drop() result was not assigned, so groceries itself is unchanged.
print(groceries)

eggs      [30, 20]
apples           6
milk           Yes
bread           No
dtype: object


In [31]:
# drop() returns a new Series, so the result has to be assigned to be kept.
# It is bound to a new name instead of overwriting `groceries`: the cells that
# follow print `groceries` with 'apples' still present and then drop 'apples'
# in place, which would raise KeyError on "Restart & Run All" if the label had
# already been removed here.
groceries_without_apples = groceries.drop('apples')
print(groceries_without_apples)

eggs     [30, 20]
milk          Yes
bread          No
dtype: object


In [34]:
# Show groceries before the in-place drop in the next cell.
print(groceries)

eggs      [30, 20]
apples           6
milk           Yes
bread           No
dtype: object


In [35]:
# inplace=True modifies groceries itself instead of returning a new Series.
# errors='ignore' keeps the cell re-runnable: if 'apples' has already been
# removed, the missing label is skipped instead of raising KeyError.
groceries.drop('apples', inplace=True, errors='ignore')

In [36]:
# Show groceries after the in-place drop.
print(groceries)

eggs     [30, 20]
milk          Yes
bread          No
dtype: object


## Arithmetic operations with pandas

In [37]:
# A numeric Series: fruit names as index labels, integer quantities as values.
fruits = pd.Series(
    data=[10, 6, 3],
    index=['apples', 'oranges', 'bananas'],
)

# Display the fruits Series
fruits


apples     10
oranges     6
bananas     3
dtype: int64

In [38]:
# Arithmetic and NumPy functions are applied element-wise; the index labels are kept.
print(fruits+2)
print("========")
# Element-wise square root (the result is float64).
print(np.sqrt(fruits))
print("========")
# Element-wise square.
print(np.power(fruits,2))

apples     12
oranges     8
bananas     5
dtype: int64
apples     3.162278
oranges    2.449490
bananas    1.732051
dtype: float64
apples     100
oranges     36
bananas      9
dtype: int64


In [39]:
# Update a single element through its label.
# Note: each run of this cell adds another 2 to 'apples'.
fruits['apples'] += 2
print(fruits)

apples     12
oranges     6
bananas     3
dtype: int64


In [40]:
# Select a subset of the fruits by label.
print(fruits.loc[['apples', 'oranges']])

apples     12
oranges     6
dtype: int64


In [41]:
# Arithmetic on a label-selected subset; the result is only printed, so fruits itself is not changed.
print('We half the amount of apples and oranges:\n', fruits.loc[['apples', 'oranges']] / 2)


We half the amount of apples and oranges:
 apples     6.0
oranges    3.0
dtype: float64


In [42]:
# fruits before a string is stored in it.
print(fruits)

apples     12
oranges     6
bananas     3
dtype: int64


In [43]:
# `fruits` is int64. Storing a string in it relies on pandas silently upcasting
# the Series to object dtype, which is deprecated (FutureWarning in pandas 2.1+).
# Convert to object explicitly before storing the string.
fruits = fruits.astype(object)
fruits['apples'] = "Green Apple"
print(fruits)

apples     Green Apple
oranges              6
bananas              3
dtype: object


In [44]:
# 'apples' now holds a str, so * 2 repeats the string instead of doubling a number.
print(fruits['apples']*2)

Green AppleGreen Apple


In [45]:
# Dividing a str by an int is not defined and raises TypeError.
# The error is the demonstration, so catch and display it instead of letting it
# abort "Run All".
try:
    print(fruits['apples']/2)
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: unsupported operand type(s) for /: 'str' and 'int'

In [46]:
# Adding an int to a str raises TypeError as well.
# The error is the demonstration, so catch and display it instead of letting it
# abort "Run All".
try:
    print(fruits['apples']+2)
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: can only concatenate str (not "int") to str

In [47]:
# Re-display the mixed-type Series `s` for comparison.
print(s)

a    Yello series
b       -1.470728
c               3
dtype: object


## Pandas Dataframes

In [48]:
# A dictionary with one Series per person; each Series maps an item name
# (its index label) to an integer value.
items = {
    'Bob': pd.Series({'bike': 245, 'pants': 25, 'watch': 55}),
    'Alice': pd.Series({'book': 40, 'glasses': 110, 'bike': 500, 'pants': 45}),
}

items

{'Bob': bike     245
 pants     25
 watch     55
 dtype: int64,
 'Alice': book        40
 glasses    110
 bike       500
 pants       45
 dtype: int64}

In [49]:
# Build a DataFrame from the dict of Series: the dict keys become the columns,
# the labels of both Series are combined into the row index, and entries
# missing from one Series show up as NaN.
shopping_carts = pd.DataFrame(items)
shopping_carts

Unnamed: 0,Bob,Alice
bike,245.0,500.0
book,,40.0
glasses,,110.0
pants,25.0,45.0
watch,55.0,


In [50]:
# Print structural information about shopping_carts:
# shape (rows, columns), number of dimensions and total element count ...
print('shopping_carts has shape:', shopping_carts.shape)
print('shopping_carts has dimension:', shopping_carts.ndim)
print('shopping_carts has a total of:', shopping_carts.size, 'elements')
print()
# ... the underlying data ...
print('The data in shopping_carts is:\n', shopping_carts.values)
print()
# ... and the row labels and column labels.
print('The row index in shopping_carts is:', shopping_carts.index)
print()
print('The column index in shopping_carts is:', shopping_carts.columns)


shopping_carts has shape: (5, 2)
shopping_carts has dimension: 2
shopping_carts has a total of: 10 elements

The data in shopping_carts is:
 [[245. 500.]
 [ nan  40.]
 [ nan 110.]
 [ 25.  45.]
 [ 55.  nan]]

The row index in shopping_carts is: Index(['bike', 'book', 'glasses', 'pants', 'watch'], dtype='object')

The column index in shopping_carts is: Index(['Bob', 'Alice'], dtype='object')


In [51]:
# Display the DataFrame again before selecting from it.
shopping_carts

Unnamed: 0,Bob,Alice
bike,245.0,500.0
book,,40.0
glasses,,110.0
pants,25.0,45.0
watch,55.0,


In [52]:
# [] with a column label selects a column (returned as a Series).
print(shopping_carts['Bob'])
print("===========")
# .loc with a row label selects a row (returned as a Series).
print(shopping_carts.loc['bike'])
print("===========")
# A single cell: pass row and column labels to one .loc call instead of
# chaining two lookups (.loc['bike']['Bob']), which builds an intermediate
# Series first.
print(shopping_carts.loc['bike', 'Bob'])

bike       245.0
book         NaN
glasses      NaN
pants       25.0
watch       55.0
Name: Bob, dtype: float64
Bob      245.0
Alice    500.0
Name: bike, dtype: float64
245.0


In [53]:
# Series built without an explicit index, so each one gets default integer labels.
data = {
    'Bob': pd.Series(data=[245, 25, 55]),
    'Alice': pd.Series(data=[40, 110, 500, 45]),
}

# Assemble the DataFrame from the dictionary of Series
df = pd.DataFrame(data=data)

# Show the DataFrame
df


Unnamed: 0,Bob,Alice
0,245.0,40
1,25.0,110
2,55.0,500
3,,45


In [54]:
# Re-display the dictionary of labelled Series used for the next DataFrame.
items

{'Bob': bike     245
 pants     25
 watch     55
 dtype: int64,
 'Alice': book        40
 glasses    110
 bike       500
 pants       45
 dtype: int64}

In [55]:
# Create a DataFrame that only has Bob's data: `columns` restricts which
# dictionary keys are used as columns.
bob_shopping_cart = pd.DataFrame(items, columns=['Bob'])

# Display bob_shopping_cart
bob_shopping_cart


Unnamed: 0,Bob
bike,245
pants,25
watch,55


### Dealing with NaN

In [56]:
# Count the NaN cells in the whole DataFrame: isna() flags each missing cell,
# the first sum() counts them per column, the second adds the column counts up.
x = shopping_carts.isna().sum().sum()
x

3

In [59]:
# Display the DataFrame to see where the NaN values are.
shopping_carts

Unnamed: 0,Bob,Alice
bike,245.0,500.0
book,,40.0
glasses,,110.0
pants,25.0,45.0
watch,55.0,


In [57]:
# Boolean DataFrame of the same shape: True where a value is missing.
shopping_carts.isnull()


Unnamed: 0,Bob,Alice
bike,False,False
book,True,False
glasses,True,False
pants,False,False
watch,False,True


In [60]:
# Drop every row that contains at least one NaN (axis = 0 works on rows).
# The result is only printed; shopping_carts is not reassigned.
print(shopping_carts.dropna(axis = 0))


         Bob  Alice
bike   245.0  500.0
pants   25.0   45.0


In [61]:
# Drop every column that contains at least one NaN (axis = 1 works on columns).
# Both columns contain a NaN here, so no columns remain.
print(shopping_carts.dropna(axis = 1))


Empty DataFrame
Columns: []
Index: [bike, book, glasses, pants, watch]


In [62]:
# Display shopping_carts again before filling its missing values.
shopping_carts

Unnamed: 0,Bob,Alice
bike,245.0,500.0
book,,40.0
glasses,,110.0
pants,25.0,45.0
watch,55.0,


In [63]:
# Replace every NaN with 0; the result is only printed, not assigned.
print(shopping_carts.fillna(0))


           Bob  Alice
bike     245.0  500.0
book       0.0   40.0
glasses    0.0  110.0
pants     25.0   45.0
watch     55.0    0.0


In [64]:
# Forward fill along axis 0: each NaN is replaced with the previous value in
# the same column (the value from the row above).
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in pandas 2.1+.
print(shopping_carts.ffill(axis = 0))


           Bob  Alice
bike     245.0  500.0
book     245.0   40.0
glasses  245.0  110.0
pants     25.0   45.0
watch     55.0   45.0


In [65]:
# Forward fill along axis 1: each NaN is replaced with the previous value in
# the same row (the value from the column to its left). A NaN in the first
# column has nothing before it and stays NaN.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in pandas 2.1+.
print(shopping_carts.ffill(axis = 1))


           Bob  Alice
bike     245.0  500.0
book       NaN   40.0
glasses    NaN  110.0
pants     25.0   45.0
watch     55.0   55.0


In [66]:
# Replace NaN values by linear interpolation along axis 0, i.e. using the
# neighbouring values within each column.
print(shopping_carts.interpolate(method = 'linear', axis = 0))

                Bob  Alice
bike     245.000000  500.0
book     171.666667   40.0
glasses   98.333333  110.0
pants     25.000000   45.0
watch     55.000000   45.0


In [67]:
# Replace NaN values by linear interpolation along axis 1, i.e. using the
# values within each row (across the columns).
print(shopping_carts.interpolate(method = 'linear', axis = 1))


           Bob  Alice
bike     245.0  500.0
book       NaN   40.0
glasses    NaN  110.0
pants     25.0   45.0
watch     55.0   55.0


## Loading data frames from CSV File

In [68]:
# Load the Google stock data into a DataFrame.
# The path is relative, so GOOG.csv must sit in the notebook's working directory.
Google_stock = pd.read_csv('./GOOG.csv')

# Print the type and the shape (rows, columns) of Google_stock
print('Google_stock is of type:', type(Google_stock))
print('Google_stock has shape:', Google_stock.shape)

Google_stock is of type: <class 'pandas.core.frame.DataFrame'>
Google_stock has shape: (3313, 7)


In [69]:
# Show the first rows of the DataFrame (five rows when no count is given).
Google_stock.head()


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2004-08-19,49.676899,51.693783,47.669952,49.845802,49.845802,44994500
1,2004-08-20,50.178635,54.187561,49.925285,53.80505,53.80505,23005800
2,2004-08-23,55.017166,56.373344,54.172661,54.346527,54.346527,18393200
3,2004-08-24,55.260582,55.439419,51.450363,52.096165,52.096165,15361800
4,2004-08-25,52.140873,53.651051,51.604362,52.657513,52.657513,9257400


In [70]:
# Show the last rows of the DataFrame (five rows when no count is given).
Google_stock.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
3308,2017-10-09,980.0,985.424988,976.109985,977.0,977.0,891400
3309,2017-10-10,980.0,981.570007,966.080017,972.599976,972.599976,968400
3310,2017-10-11,973.719971,990.710022,972.25,989.25,989.25,1693300
3311,2017-10-12,987.450012,994.119995,985.0,987.830017,987.830017,1262400
3312,2017-10-13,992.0,997.210022,989.0,989.679993,989.679993,1157700
