In [2]:
import numpy as np
import pandas as pd


# Importing under an alias (np, pd) keeps each library's names inside its own namespace,
# so pandas functions don't shadow built-ins or functions from other libraries.

#### Fundamental data structures

- Series: a one-dimensional object, similar to an array or a single column in a table
- DataFrame: a two-dimensional object, similar to an Excel spreadsheet

### Series

In [3]:
# Mixing int, str and float values in one Series yields the generic `object` dtype.
mixed_values = [20, 'hello', 3.1]
s1 = pd.Series(mixed_values)
s1

0       20
1    hello
2      3.1
dtype: object

Series can be created from a dictionary using its keys as the index.

In [7]:
# The dict keys become the index labels and the dict values become the data.
d = dict(a=1, b=2)
s2 = pd.Series(d)
s2

a    1
b    2
dtype: int64

You can use indices to get values. 

In [8]:
# Label-based lookup: returns the value stored under the index label 'b'.
s2['b']

2

### DataFrame

`pd.DataFrame()` (note the capitalisation) is the constructor for making a DataFrame.

In [10]:
# Reminder: plain assignment only binds a second name to the SAME DataFrame,
# whereas DataFrame.copy() returns a new, independent object.

df1 = pd.DataFrame([2, 2], [2, 2])  # positional arguments: data=[2, 2], index=[2, 2]
df2 = df1         # alias: df2 and df1 refer to the same object
df3 = df1.copy()  # independent copy; must be *called*, and on df1 (df is not defined yet)

Unnamed: 0,0
2,2
2,2


### Applying Functions Across Rows/Columns


In [17]:
# A lambda is an anonymous function: it is never bound to a name in the environment.

# Random integers from 1 to 6 (upper bound 7 is exclusive), 3 rows x 2 columns.
rolls = np.random.randint(1, 7, size=(3, 2))
df = pd.DataFrame(rolls, columns=['Amy', 'Ben'])
print(df)

# apply() hands each column to the function as a Series; this computes max - min per column.
df.apply(lambda col: col.max() - col.min())

# applymap() applies a function to every individual element
# (deprecated since pandas 2.1 in favour of DataFrame.map)

   Amy  Ben
0    1    4
1    1    5
2    1    2


Amy    0
Ben    3
dtype: int64

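Generally, apply functions let you operate on whole vectors (rows or columns) rather than on single values. Vectorizing operations is very useful for speed and performance; where pandas already offers a built-in vectorized method, prefer it over `apply`.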

#### Summarizing Data

In [19]:
# Read astronauts.csv (path relative to the working directory) into a DataFrame.
astronauts = pd.read_csv('./astronauts.csv')

In [25]:
# Boolean mask: True for every row whose 'gender' column equals 'female'.
is_female = astronauts['gender'] == 'female'
astronauts_female = astronauts.loc[is_female]
astronauts_female

Unnamed: 0,name,gender,birth,nationality,military_civilian,mission_number,occupation,year_of_mission,mission_hours,mission_title
17,"Tereshkova, Valentina",female,1937,U.S.S.R/Russia,military,1,pilot,1963,70.83,Vostok 6
214,"Savitskaya, Svetlana",female,1948,U.S.S.R/Russia,civilian,1,MSP,1982,189.00,Salyut 7
215,"Savitskaya, Svetlana",female,1948,U.S.S.R/Russia,civilian,2,flight engineer,1984,283.00,Salyut 7
240,"Ride, Sally K.",female,1951,U.S.,civilian,1,MSP,1983,147.00,STS-7
241,"Ride, Sally K.",female,1951,U.S.,civilian,2,MSP,1984,197.00,STS-41-G
...,...,...,...,...,...,...,...,...,...,...
1259,"Rubins, Kathleen",female,1978,U.S.,civilian,1,flight engineer,2016,6902.35,48
1268,"Auñón-Chancellor, Serena",female,1976,U.S.,civilian,1,flight engineer,2018,4722.00,56
1272,"McClain, Anne Charlotte",female,1979,U.S.,military,1,flight engineer,2018,4887.00,57
1273,"Koch, Christina",female,1979,U.S.,civilian,1,flight engineer,2019,7372.30,59


In [27]:
# Re-map values: encode 'gender' as a 1/0 indicator column named 'female'.
# Any gender value missing from the mapping becomes NaN.

gender_codes = {'female': 1, 'male': 0}
astronauts['female'] = astronauts['gender'].map(gender_codes)

In [29]:
# Run several summary statistics in one call.

# Age at mission = mission year minus birth year.
astronauts['age_at_mission'] = astronauts['year_of_mission'] - astronauts['birth']

# Group the two relevant columns by gender, then aggregate with each statistic.
summary_stats = ['min', 'max', 'mean']
age_by_gender = astronauts[['gender', 'age_at_mission']].groupby('gender')
age_by_gender.agg(summary_stats)


Unnamed: 0_level_0,age_at_mission,age_at_mission,age_at_mission
Unnamed: 0_level_1,min,max,mean
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
female,26,56,40.685315
male,26,77,43.194885


In [34]:
# Select several columns by passing a list of column names inside the brackets.
# (Only the last expression of a cell is displayed, so this first line shows nothing.)
astronauts[['gender', 'occupation', 'mission_hours']]
# OR: store the column names in a list first, then index with that list.
three_vars = ['gender', 'occupation', 'mission_hours']
astronauts[three_vars]

Unnamed: 0,gender,occupation,mission_hours
0,male,pilot,1.77
1,male,pilot,25.00
2,male,pilot,5.00
3,male,pilot,213.00
4,male,pilot,5.00
...,...,...,...
1272,female,flight engineer,4887.00
1273,female,flight engineer,7372.30
1274,male,flight engineer,4303.08
1275,female,flight engineer,2697.60


Don't get too caught up in iloc stuff, just pick columns and think of it as R for the most part.

`astronauts['year_of_mission']` in pandas is the equivalent of `astronauts$year_of_mission` in R.

Putting square brackets inside the iloc indexer passes a list (e.g. `astronauts.iloc[:, [0, 2]]`), so think of it as supplying a vector of positions.

#### Merging Dataframes

In [None]:
# merge() joins two DataFrames by matching rows on shared key columns or indices

**REVIEW DIFFERENT KINDS OF JOINS**

In [38]:
# Country lookup table read from country.csv (path relative to the working directory).
country = pd.read_csv('./country.csv')

# Keep only the person-level columns and drop exact duplicate rows,
# e.g. astronauts listed once per mission.
person_columns = ['name', 'gender', 'birth', 'nationality']
astronauts_nationality = astronauts[person_columns].drop_duplicates()

# Left join: every astronaut row is kept; country columns are filled in
# where 'nationality' matches 'country'.
# NOTE: the variable name is misspelled ("continet"); kept as-is in case later cells use it.
astronauts_continet = astronauts_nationality.merge(
    country,
    how='left',
    left_on='nationality',
    right_on='country',
)

astronauts_continet

Unnamed: 0,name,gender,birth,nationality,country,continent
0,"Gagarin, Yuri",male,1934,U.S.S.R/Russia,U.S.S.R/Russia,Europe
1,"Titov, Gherman",male,1935,U.S.S.R/Russia,U.S.S.R/Russia,Europe
2,"Glenn, John H., Jr.",male,1921,U.S.,U.S.,North America
3,"Carpenter, M. Scott",male,1925,U.S.,U.S.,North America
4,"Nikolayev, Andriyan",male,1929,U.S.S.R/Russia,U.S.S.R/Russia,Europe
...,...,...,...,...,...,...
560,"McClain, Anne Charlotte",female,1979,U.S.,U.S.,North America
561,"Koch, Christina",female,1979,U.S.,U.S.,North America
562,"Morgan, Andrew",male,1976,U.S.,U.S.,North America
563,"Meir, Jessica",female,1977,U.S.,U.S.,North America


Unnamed: 0,country,continent
0,Armenia,Asia
1,Algeria,Africa
2,U.S.S.R/Russia,Europe
3,U.S.,North America
4,Mongolia,Asia
5,Romania,Europe
6,France,Europe
7,Czechoslovakia,Europe
8,Poland,Europe
9,Germany,Europe
