# Creating Pandas Series

In [2]:
import pandas as pd

In [3]:
# Build a Series that mixes integers and strings; the dict keys become the
# index labels (insertion order is kept) and the mixed values give dtype object.
groceries = pd.Series({'eggs': 10, 'apples': 6, 'milk': 'Yes', 'bread': 'No'})
groceries

eggs       10
apples      6
milk      Yes
bread      No
dtype: object

In [4]:
groceries.shape

(4,)

In [5]:
groceries.ndim

1

In [6]:
groceries.size

4

In [8]:
groceries.index

Index(['eggs', 'apples', 'milk', 'bread'], dtype='object')

In [9]:
groceries.values

array([10, 6, 'Yes', 'No'], dtype=object)

In [10]:
'banana' in groceries

False

In [11]:
'bread' in groceries

True

# Accessing and Deleting Elements in Pandas Series

In [13]:
groceries

eggs       10
apples      6
milk      Yes
bread      No
dtype: object

In [15]:
groceries['eggs']

10

In [16]:
groceries[['milk', 'bread']]

milk     Yes
bread     No
dtype: object

In [17]:
# Fetch the first element by position. `.iloc` states the positional intent
# explicitly; an integer key in plain `[]` on a label-indexed Series relies on
# a deprecated positional fallback.
groceries.iloc[0]

10

In [18]:
# Fetch the last element by position. `.iloc[-1]` counts from the end, whereas
# `[-1]` on a label-indexed Series depends on the deprecated positional fallback.
groceries.iloc[-1]

'No'

In [20]:
# Select the first two elements by position; a list of integers in plain `[]`
# would be looked up as labels first, so use `.iloc` for positional access.
groceries.iloc[[0, 1]]

eggs      10
apples     6
dtype: object

In [21]:
groceries.loc[['eggs', 'apples']]

eggs      10
apples     6
dtype: object

In [22]:
groceries.iloc[[2, 3]]

milk     Yes
bread     No
dtype: object

In [24]:
groceries['eggs'] = 12
groceries

eggs       12
apples      6
milk      Yes
bread      No
dtype: object

In [26]:
groceries.drop('apples')

eggs      12
milk     Yes
bread     No
dtype: object

In [27]:
groceries

eggs       12
apples      6
milk      Yes
bread      No
dtype: object

In [28]:
# Remove 'apples' for good by rebinding the name to the returned Series instead
# of mutating in place. errors='ignore' keeps the cell re-runnable: once
# 'apples' is gone, running it again is a no-op rather than a KeyError.
groceries = groceries.drop('apples', errors='ignore')

In [29]:
groceries

eggs      12
milk     Yes
bread     No
dtype: object

# Arithmetic Operations on Pandas Series

In [31]:
# All-integer Series keyed by fruit name; the dict keys supply the index labels.
fruits = pd.Series({'apples': 10, 'oranges': 6, 'bananas': 3})
fruits

apples     10
oranges     6
bananas     3
dtype: int64

In [32]:
fruits + 2

apples     12
oranges     8
bananas     5
dtype: int64

In [33]:
fruits - 2

apples     8
oranges    4
bananas    1
dtype: int64

In [34]:
fruits * 2

apples     20
oranges    12
bananas     6
dtype: int64

In [35]:
fruits / 2

apples     5.0
oranges    3.0
bananas    1.5
dtype: float64

In [36]:
import numpy as np

fruits

apples     10
oranges     6
bananas     3
dtype: int64

In [37]:
np.sqrt(fruits)

apples     3.162278
oranges    2.449490
bananas    1.732051
dtype: float64

In [38]:
np.exp(fruits)

apples     22026.465795
oranges      403.428793
bananas       20.085537
dtype: float64

In [39]:
np.power(fruits, 2)

apples     100
oranges     36
bananas      9
dtype: int64

In [40]:
fruits['bananas'] + 2

5

In [41]:
fruits.iloc[0] - 2

8

In [42]:
fruits[['apples', 'oranges']] * 2

apples     20
oranges    12
dtype: int64

In [43]:
# Halve the first two entries, selected by position. `.iloc` is used because
# integer keys in plain `[]` on a label-indexed Series are deprecated.
fruits.iloc[[0, 1]] / 2

apples     5.0
oranges    3.0
dtype: float64

In [45]:
groceries

eggs      12
milk     Yes
bread     No
dtype: object

In [46]:
groceries * 2

eggs         24
milk     YesYes
bread      NoNo
dtype: object

In [49]:
# Division is attempted element by element; the string entries have no '/'
# operation with an int, so the whole expression raises TypeError.
try:
    groceries / 2
except TypeError as err:
    # Print the message carried by the exception itself rather than a
    # hand-copied string, so the cell always reports the real failure.
    print(err)
    print("Operator on pd series MUST work for ALL data types in the series")

unsupported operand type(s) for /: 'str' and 'int'
Operator on pd series MUST work for ALL data types in the series


# Manipulate a Series

In [52]:
import pandas as pd

# Exercise: distance of some planets from the Sun, and how long sunlight
# takes to reach each of them.

# Distances are given in units of 10^6 km, listed in the same order as `planets`.
distance_from_sun = [149.6, 1433.5, 227.9, 108.2, 778.6]
planets = ['Earth','Saturn', 'Mars','Venus', 'Jupiter']

# Planet names label the rows; the distances are the values.
dist_planets = pd.Series(distance_from_sun, index=planets)

# Light covers 18 x 10^6 km per minute, so with distances already expressed in
# units of 10^6 km, dividing by c = 18 yields the travel time in minutes.
time_light = dist_planets.div(18)

# Boolean mask: keep only the planets that sunlight reaches in under 40 minutes.
reached_quickly = time_light.lt(40)
close_planets = time_light.loc[reached_quickly]
print(close_planets)

Earth     8.311111
Mars     12.661111
Venus     6.011111
dtype: float64


# Creating Pandas DataFrames

In [4]:
items = {"Bob": pd.Series([246, 25, 55], index=["bike", "pants", "watch"]),
         "Alice": pd.Series([40, 110, 500, 45], index=["book", "glasses", "bike", "pants"])}
type(items)

dict

In [5]:
shopping_carts = pd.DataFrame(items)
shopping_carts

Unnamed: 0,Bob,Alice
bike,246.0,500.0
book,,40.0
glasses,,110.0
pants,25.0,45.0
watch,55.0,


In [6]:
no_labels = {'Bob': pd.Series([245, 25, 55]),
            'Alice': pd.Series([40, 110, 500, 45])}

df = pd.DataFrame(no_labels)
df

Unnamed: 0,Bob,Alice
0,245.0,40
1,25.0,110
2,55.0,500
3,,45


In [9]:
shopping_carts.columns

Index(['Bob', 'Alice'], dtype='object')

In [10]:
shopping_carts.index

Index(['bike', 'book', 'glasses', 'pants', 'watch'], dtype='object')

In [11]:
shopping_carts.values

array([[246., 500.],
       [ nan,  40.],
       [ nan, 110.],
       [ 25.,  45.],
       [ 55.,  nan]])

In [12]:
shopping_carts.size

10

In [13]:
shopping_carts.shape

(5, 2)

In [15]:
shopping_carts.ndim

2

In [16]:
bob_shopping_cart = pd.DataFrame(items, columns=['Bob'])
bob_shopping_cart

Unnamed: 0,Bob
bike,246
pants,25
watch,55


In [18]:
sel_shopping_cart = pd.DataFrame(items, index=['pants', 'book'])
sel_shopping_cart

Unnamed: 0,Bob,Alice
pants,25.0,45
book,,40


In [19]:
alice_sel_shopping_cart = pd.DataFrame(items, index=['glasses', 'bike'], columns=['Alice'])
alice_sel_shopping_cart

Unnamed: 0,Alice
glasses,110
bike,500


In [22]:
data_dict = {'Ints': [1,2,3],
            'Floats': [4.5,6.7,8.9]}

df_2 = pd.DataFrame(data_dict, index=['label 1', 'label 2', 'label 3'])
df_2

Unnamed: 0,Ints,Floats
label 1,1,4.5
label 2,2,6.7
label 3,3,8.9


In [25]:
items_2 = [{'bikes': 20, 'pants': 30, 'watches': 35}, {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants': 5}]

store_items = pd.DataFrame(items_2, index=['store 1', 'store 2'])
store_items

Unnamed: 0,bikes,pants,watches,glasses
store 1,20,30,35,
store 2,15,5,10,50.0


# Accessing Elements in Pandas DataFrames

In [26]:
items_2 = [{'bikes': 20, 'pants': 30, 'watches': 35}, {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants': 5}]

store_items = pd.DataFrame(items_2, index=['store 1', 'store 2'])
store_items

Unnamed: 0,bikes,pants,watches,glasses
store 1,20,30,35,
store 2,15,5,10,50.0


In [27]:
store_items[['bikes']]

Unnamed: 0,bikes
store 1,20
store 2,15


In [29]:
store_items[['bikes', 'pants']]

Unnamed: 0,bikes,pants
store 1,20,30
store 2,15,5


In [31]:
store_items.loc[['store 1']]

Unnamed: 0,bikes,pants,watches,glasses
store 1,20,30,35,


In [32]:
# Read a single cell with one `.loc[row_label, column_label]` lookup instead of
# chaining two `[]` calls through an intermediate Series.
store_items.loc['store 2', 'bikes']

15

In [33]:
store_items['shirts'] = [15, 2]
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts
store 1,20,30,35,,15
store 2,15,5,10,50.0,2


In [34]:
store_items['suits'] = store_items['shirts'] + store_items['pants']
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts,suits
store 1,20,30,35,,15,45
store 2,15,5,10,50.0,2,7


In [37]:
add_row = [{'bikes': 20, 'pants': 30, 'watches': 35, 'glasses': 4}]

store_items_row = pd.DataFrame(add_row, index=['store 3'])
store_items_row

Unnamed: 0,bikes,pants,watches,glasses
store 3,20,30,35,4


In [38]:
# Add the new store as an extra row. `DataFrame.append` was removed in
# pandas 2.0; `pd.concat` is the supported replacement. Columns missing from
# the new row (here 'shirts' and 'suits') are filled with NaN.
store_items = pd.concat([store_items, store_items_row])
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts,suits
store 1,20,30,35,,15.0,45.0
store 2,15,5,10,50.0,2.0,7.0
store 3,20,30,35,4.0,,


In [39]:
# Copy the watch counts of every store except the first into a new column.
# `.iloc[1:]` makes the positional slice explicit; the assignment aligns on the
# row labels, so the skipped first row ends up as NaN.
store_items['new_watches'] = store_items['watches'].iloc[1:]
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts,suits,new_watches
store 1,20,30,35,,15.0,45.0,
store 2,15,5,10,50.0,2.0,7.0,10.0
store 3,20,30,35,4.0,,,35.0


In [41]:
store_items.insert(5, 'shoes', [8, 5, 0])
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts,shoes,suits,new_watches
store 1,20,30,35,,15.0,8,45.0,
store 2,15,5,10,50.0,2.0,5,7.0,10.0
store 3,20,30,35,4.0,,0,,35.0


In [42]:
store_items.pop('new_watches')
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts,shoes,suits
store 1,20,30,35,,15.0,8,45.0
store 2,15,5,10,50.0,2.0,5,7.0
store 3,20,30,35,4.0,,0,


In [43]:
store_items = store_items.drop(['watches', 'shoes'], axis=1)
store_items

Unnamed: 0,bikes,pants,glasses,shirts,suits
store 1,20,30,,15.0,45.0
store 2,15,5,50.0,2.0,7.0
store 3,20,30,4.0,,


In [44]:
store_items = store_items.drop(['store 1'], axis=0)
store_items

Unnamed: 0,bikes,pants,glasses,shirts,suits
store 2,15,5,50.0,2.0,7.0
store 3,20,30,4.0,,


In [45]:
store_items = store_items.rename(columns={'bikes': 'hats'})
store_items

Unnamed: 0,hats,pants,glasses,shirts,suits
store 2,15,5,50.0,2.0,7.0
store 3,20,30,4.0,,


In [50]:
store_items = store_items.rename(index={'store 3': 'store 2.5'})
store_items

Unnamed: 0,hats,pants,glasses,shirts,suits
store 2,15,5,50.0,2.0,7.0
store 2.5,20,30,4.0,,


# Dealing with NaN

In [65]:
# One record per store. Keys missing from a record become NaN in the frame,
# and columns appear in order of first occurrence across the records.
items2 = [
    dict(bikes=20, pants=30, watches=35, shirts=15, shoes=8, suits=45),
    dict(watches=10, glasses=50, bikes=15, pants=5, shirts=2, shoes=5, suits=7),
    dict(bikes=20, pants=30, watches=35, glasses=4, shoes=10),
]

store_labels = ['store 1', 'store 2', 'store 3']
store_items = pd.DataFrame(items2, index=store_labels)

store_items

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20,30,35,15.0,8,45.0,
store 2,15,5,10,2.0,5,7.0,50.0
store 3,20,30,35,,10,,4.0


In [54]:
empties = store_items.isnull().sum().sum()

print("NaN Count: ", empties)

NaN Count:  3


In [55]:
print("non-NaN Count: ", store_items.count().sum())

non-NaN Count:  18


In [56]:
store_items.dropna(axis=0)

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 2,15,5,10,2.0,5,7.0,50.0


In [64]:
store_items

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20,30,35,15.0,8,45.0,
store 2,15,5,10,2.0,5,7.0,50.0
store 3,20,30,35,,10,,4.0


In [58]:
store_items.dropna(axis=1)

Unnamed: 0,bikes,pants,watches,shoes
store 1,20,30,35,8
store 2,15,5,10,5
store 3,20,30,35,10


In [63]:
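# NOTE: with inplace=True, dropna() modifies store_items itself and returns None,
# so the assignment below would leave store_items_save_drop set to None.
# store_items_save_drop = store_items.dropna(axis=1, inplace=True)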

In [67]:
store_items.fillna(0)

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20,30,35,15.0,8,45.0,0.0
store 2,15,5,10,2.0,5,7.0,50.0
store 3,20,30,35,0.0,10,0.0,4.0


In [70]:
# Forward-fill down each column: a NaN takes the value from the row above.
# `fillna(method=...)` is deprecated; `.ffill()` is the direct replacement.
store_items.ffill(axis=0)

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20,30,35,15.0,8,45.0,
store 2,15,5,10,2.0,5,7.0,50.0
store 3,20,30,35,2.0,10,7.0,4.0


In [71]:
# Forward-fill along each row: a NaN takes the value from the column to its
# left. `.ffill()` replaces the deprecated `fillna(method='ffill')`.
store_items.ffill(axis=1)

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20.0,30.0,35.0,15.0,8.0,45.0,45.0
store 2,15.0,5.0,10.0,2.0,5.0,7.0,50.0
store 3,20.0,30.0,35.0,35.0,10.0,10.0,4.0


In [74]:
# Back-fill along each row: a NaN takes the value from the column to its
# right. `.bfill()` replaces the deprecated `fillna(method='bfill')`.
store_items.bfill(axis=1)

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20.0,30.0,35.0,15.0,8.0,45.0,
store 2,15.0,5.0,10.0,2.0,5.0,7.0,50.0
store 3,20.0,30.0,35.0,10.0,10.0,4.0,4.0


In [76]:
store_items.interpolate(method='linear', axis=1)

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20.0,30.0,35.0,15.0,8.0,45.0,45.0
store 2,15.0,5.0,10.0,2.0,5.0,7.0,50.0
store 3,20.0,30.0,35.0,22.5,10.0,7.0,4.0


# Loading Data into Pandas DataFrames

In [None]:
google_stock = pd.read_csv('./GOOG.csv')

In [78]:
print("google_stock.head(n) gives you the first n rows")

google_stock.head(n) gives you the first n rows


In [79]:
print("google_stock.tail(n) gives you the last n rows")

google_stock.tail(n) gives you the last n rows


In [86]:
print("google_stock.isnull().any() tells you if any columns have NaN values\n")
print("Here's some example output: \n")
print('Date                  False')
print('Open                False')
print('High                  False')
print('Low                   False')
print('Close                 False')
print('Adj Close          False')
print('Volume             False')
print('dtype: bool')
print("\nAll 'False' means there are no NaN values")

google_stock.isnull().any() tells you if any columns have NaN values

Here's some example output: 

Date                  False
Open                False
High                  False
Low                   False
Close                 False
Adj Close          False
Volume             False
dtype: bool

All 'False' means there are no NaN values


In [87]:
print("google_stock.describe() gives us info like the count, mean, std, min, max, and more")

google_stock.describe() gives us info like the count, mean, std, min, max, and more


In [88]:
print("google_stock[column_name].describe() gives us info like the count, mean, std, min, max, and more for a column")

google_stock[column_name].describe() gives us info like the count, mean, std, min, max, and more for a column


In [91]:
print("google_stock.corr() will give the correlation between columns")
print("\nA correlation value of 1 tells us there is a high correlation and a correlation of 0 tells us that the data is not correlated.")

google_stock.corr() will give the correlation between columns

A correlation value of 1 tells us there is a high correlation and a correlation of 0 tells us that the data is not correlated.


In [None]:
# Group the rows of google_stock by the given column(s).
# NOTE(review): `column_name` is not defined anywhere in the visible notebook and
# this cell was never executed (In [None]) — presumably a placeholder for a real
# column label; confirm and substitute before running.
google_stock.groupby([column_name])