In [3]:
####################################
####### Pandas Crash Course ########
####################################
# We'll use pandas more than NumPy in this course,
# so let's quickly go over a few main ideas with pandas!
import pandas as pd
import numpy as np

# Load the salary table from a CSV file (path is relative to the current
# working directory) into a DataFrame.
df = pd.read_csv('salaries.csv')
# A bare expression on the last line of a cell displays its value.
df

Unnamed: 0,Name,Salary,Age
0,John,50000,34
1,Sally,120000,45
2,Alyssa,80000,27


In [4]:
# You can select a single column with a bracket call.
# The result is a pandas Series, shown with its index, name, and dtype.
df['Name']

0      John
1     Sally
2    Alyssa
Name: Name, dtype: object

In [5]:
# The same bracket call works for a numeric column.
df['Salary']

0     50000
1    120000
2     80000
Name: Salary, dtype: int64

In [6]:
# Select multiple columns by passing a list of column names.
# The outer [] is the indexing call and the inner [] is the list itself,
# which is why you see two sets of brackets.
print(df[['Name','Salary']])

     Name  Salary
0    John   50000
1   Sally  120000
2  Alyssa   80000


In [11]:
# Similar to NumPy, you can call aggregations such as min(), max(), and
# mean() on a column of a pandas dataframe.
# Only the last expression in a cell is displayed automatically, so the
# first result is printed explicitly.
print(df['Salary'].mean())
df['Age'].mean()

83333.33333333333


35.333333333333336

In [13]:
# Just like NumPy, we can use conditional filtering to select rows that meet
# certain criteria, such as rows where the Age value is greater than 30.
# The comparison produces a boolean mask with one True/False entry per row.
filt = df['Age'] > 30
print(filt)

0     True
1     True
2    False
Name: Age, dtype: bool


In [16]:
# Pass the boolean mask to the dataframe to keep only the rows marked True.
print(df[filt])

    Name  Salary  Age
0   John   50000   34
1  Sally  120000   45


In [18]:
# More commonly, the mask is built and applied in a single step:
df[df['Age'] > 30]

Unnamed: 0,Name,Salary,Age
0,John,50000,34
1,Sally,120000,45


In [19]:
# Array of the unique values in the Age column
df['Age'].unique()

array([34, 45, 27], dtype=int64)

In [20]:
# Number of unique values in the Age column
df['Age'].nunique()

3

In [22]:
# General info about your dataframe: index range, column names,
# non-null counts, dtypes, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
Name      3 non-null object
Salary    3 non-null int64
Age       3 non-null int64
dtypes: int64(2), object(1)
memory usage: 152.0+ bytes


In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of your dataframe
df.describe() 

Unnamed: 0,Salary,Age
count,3.0,3.0
mean,83333.333333,35.333333
std,35118.845843,9.073772
min,50000.0,27.0
25%,65000.0,30.5
50%,80000.0,34.0
75%,100000.0,39.5
max,120000.0,45.0


In [24]:
# Grab the column labels (returned as an Index object rather than a plain list)
df.columns 

Index(['Name', 'Salary', 'Age'], dtype='object')

In [25]:
# Grab the row index of the dataframe (a RangeIndex here)
df.index

RangeIndex(start=0, stop=3, step=1)

In [27]:
# You can convert a 2-D NumPy array to a dataframe with pd.DataFrame.
# Here the values 0..49 are reshaped into 5 rows of 10 columns.
mat = np.arange(50).reshape(5,10)
pd.DataFrame(mat)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,1,2,3,4,5,6,7,8,9
1,10,11,12,13,14,15,16,17,18,19
2,20,21,22,23,24,25,26,27,28,29
3,30,31,32,33,34,35,36,37,38,39
4,40,41,42,43,44,45,46,47,48,49
