# Pandas Basics

### Creating pandas dataframes

In [1]:
import pandas as pd
import numpy as np

# 1. From a dict: every key becomes a column label and its list holds that column's values
dic = dict(
    name=['Tom Jones', 'Dick Jones', 'Harry Jones'],
    age=[23, 34, 45],
    employed=[True, False, True],
)
df = pd.DataFrame(data=dic)
df

Unnamed: 0,name,age,employed
0,Tom Jones,23,True
1,Dick Jones,34,False
2,Harry Jones,45,True


In [2]:
# Replace the default 0..n-1 row labels with custom ones (modifies `df` in place)
df.index = ['x', 'y', 'z']
df

Unnamed: 0,name,age,employed
x,Tom Jones,23,True
y,Dick Jones,34,False
z,Harry Jones,45,True


In [3]:
# 2. From a 2D ndarray (a nested python list works the same way)
# Each string is split into single characters to form one row, so every
# string must have the same length.
ndarray = np.array([list(chars) for chars in ('123456', 'abcdef', 'qurstu')])
names = ['ab', 'bc', 'cd', 'de', 'ef', 'fg']

pd.DataFrame(data=ndarray, columns=names)

Unnamed: 0,ab,bc,cd,de,ef,fg
0,1,2,3,4,5,6
1,a,b,c,d,e,f
2,q,u,r,s,t,u


In [4]:
# Row labels can also be supplied directly to the constructor via `index`
pd.DataFrame(data=ndarray, index=['x', 'y', 'z'], columns=names)

Unnamed: 0,ab,bc,cd,de,ef,fg
x,1,2,3,4,5,6
y,a,b,c,d,e,f
z,q,u,r,s,t,u


In [5]:
# 3. From a CSV file; index_col=0 uses the file's first column as the row labels
pd.read_csv(filepath_or_buffer='../data/cars.csv', index_col=0)

Unnamed: 0,cars_per_cap,country,drives_right
US,809,United States,True
AUS,731,Australia,False
JAP,588,Japan,False
IN,18,India,False
RU,200,Russia,True
MOR,70,Morocco,True
EG,45,Egypt,True


In [6]:
# Load the bachelors-degrees dataset, print its (rows, columns) shape, and show
# the first five rows transposed so the many columns read as rows.
df = pd.read_csv(filepath_or_buffer='../data/percent-bachelors-degrees-women-usa.csv')
print(df.shape)
df.head(5).transpose()

(42, 18)


Unnamed: 0,0,1,2,3,4
Year,1970.0,1971.0,1972.0,1973.0,1974.0
Agriculture,4.229798,5.452797,7.42071,9.653602,14.074623
Architecture,11.921005,12.003106,13.214594,14.791613,17.444688
Art and Performance,59.7,59.9,60.4,60.2,61.9
Biology,29.088363,29.394403,29.810221,31.147915,32.996183
Business,9.064439,9.503187,10.558962,12.804602,16.20485
Communications and Journalism,35.3,35.5,36.6,38.4,40.5
Computer Science,13.6,13.6,14.9,16.4,18.9
Education,74.535328,74.149204,73.55452,73.501814,73.336811
Engineering,0.8,1.0,1.2,1.6,2.2


**Selecting rows based on position**

Use the `iloc` indexer to select rows from a DataFrame by integer position.

In [7]:
# Negative positions count back from the end, just like slicing a list
df.iloc[-28:-23, :]

Unnamed: 0,Year,Agriculture,Architecture,Art and Performance,Biology,Business,Communications and Journalism,Computer Science,Education,Engineering,English,Foreign Languages,Health Professions,Math and Statistics,Physical Sciences,Psychology,Public Administration,Social Sciences and History
14,1984,31.092947,35.453083,62.1,47.669083,45.12403,59.1,36.8,75.869116,13.5,65.749862,72.1,85.1,46.2,28.0,68.2,75.9,44.1
15,1985,31.379659,36.133348,61.8,47.909884,45.747782,59.0,35.7,75.92344,13.5,65.798199,70.8,85.3,46.5,27.5,69.0,75.0,43.8
16,1986,31.198719,37.240223,62.1,48.300678,46.532915,60.0,34.7,76.143015,13.9,65.982561,71.2,85.7,46.7,28.4,69.0,75.7,44.0
17,1987,31.486429,38.730675,61.7,50.209878,46.690466,60.2,32.4,76.963092,14.0,66.706031,72.0,85.5,46.5,30.4,70.1,76.4,43.9
18,1988,31.085087,39.398907,61.7,50.099811,46.764828,60.4,30.8,77.627662,13.9,67.144498,72.3,85.2,46.2,29.7,70.9,75.6,44.4


In [8]:
# Take rows at positions 10..14 first, then keep only the four columns of interest
df.iloc[10:15].loc[:, ['Biology', 'Education', 'English', 'Business']]

Unnamed: 0,Biology,Education,English,Business
10,43.999257,74.981032,65.28413,36.765725
11,45.249512,75.845123,65.838322,39.26623
12,45.967338,75.843649,65.847352,41.949373
13,46.713135,75.950601,65.91838,43.54207
14,47.669083,75.869116,65.749862,45.12403


**Selecting rows based on Conditional Logic**

In [9]:
# Element-wise comparison: yields a boolean 'mask' Series (True where Education exceeds 79)
df['Education'].gt(79)

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
32    False
33    False
34    False
35     True
36    False
37    False
38     True
39     True
40     True
41     True
Name: Education, dtype: bool

In [10]:
# Indexing with the mask keeps only the rows where the mask is True
df.loc[df['Education'] > 79]

Unnamed: 0,Year,Agriculture,Architecture,Art and Performance,Biology,Business,Communications and Journalism,Computer Science,Education,Engineering,English,Foreign Languages,Health Professions,Math and Statistics,Physical Sciences,Psychology,Public Administration,Social Sciences and History
35,2005,47.672754,43.100368,61.4,61.500984,49.791851,63.4,20.6,79.067122,17.9,68.571221,69.9,86.0,45.1,41.6,77.5,81.2,50.0
38,2008,47.570834,42.71173,60.7,59.305765,48.888027,62.4,17.8,79.196327,16.5,67.594028,70.2,85.2,43.3,40.7,77.2,81.7,49.4
39,2009,48.667224,43.348921,61.0,58.489583,48.840474,62.8,18.1,79.532909,16.8,67.969792,69.3,85.1,43.3,40.7,77.1,82.0,49.4
40,2010,48.730042,42.066721,61.3,59.010255,48.757988,62.5,17.6,79.618625,17.2,67.928106,69.0,85.0,43.1,40.2,77.0,81.7,49.3
41,2011,50.037182,42.773438,61.2,58.742397,48.180418,62.2,18.2,79.432812,17.5,68.42673,69.5,84.8,43.1,40.1,76.7,81.9,49.2


In [11]:
# Combine two masks element-wise with `&` (logical AND): both conditions must hold
df['Education'].gt(73) & df['Business'].lt(35)

0      True
1      True
2      True
3      True
4      True
5     False
6     False
7     False
8      True
9      True
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
32    False
33    False
34    False
35    False
36    False
37    False
38    False
39    False
40    False
41    False
dtype: bool

In [12]:
# Select rows and columns in a single `.loc` call instead of chaining two
# indexing operations (`df[cols][mask]`): the row mask goes first, the column
# labels second. The result is the same, without the intermediate frame.
df.loc[
    (df['Education'] > 73) & (df['Business'] < 35),
    ['Business', 'Education', 'Biology', 'Agriculture'],
]

Unnamed: 0,Business,Education,Biology,Agriculture
0,9.064439,74.535328,29.088363,4.229798
1,9.503187,74.149204,29.394403,5.452797
2,10.558962,73.55452,29.810221,7.42071
3,12.804602,73.501814,31.147915,9.653602
4,16.20485,73.336811,32.996183,14.074623
8,30.527519,73.192821,40.112496,27.146192
9,33.621634,73.821142,42.065551,29.633365


In [13]:
# header=None tells pandas not to read column labels from the file;
# the labels in `names` are used instead, and the first column becomes the index.
names = ['Cars', 'Country', 'RHD']
df = pd.read_csv(
    '../data/cars2.csv',
    header=None,
    names=names,
    index_col=0,
)
df

Unnamed: 0,Cars,Country,RHD
UK,506,United Kingdom,True
FR,340,France,False
PT,200,Portugal,False
DE,320,Germany,False
BE,222,Belgium,False


In [14]:
# Membership test: True for rows whose Country is one of the listed values
df.loc[:, 'Country'].isin(['France', 'Belgium'])

UK    False
FR     True
PT    False
DE    False
BE     True
Name: Country, dtype: bool

In [15]:
# Use the membership mask to keep only the matching rows
df.loc[df['Country'].isin(['France', 'Belgium'])]

Unnamed: 0,Cars,Country,RHD
FR,340,France,False
BE,222,Belgium,False


**Reset Indices**

In [16]:
# Reload the degrees dataset, then keep rows where either condition holds (`|` is logical OR).
# The surviving rows keep their original index labels, so the index has gaps.
df = pd.read_csv(filepath_or_buffer='../data/percent-bachelors-degrees-women-usa.csv')
df.loc[df['Education'].gt(79) | df['Biology'].lt(32)]

Unnamed: 0,Year,Agriculture,Architecture,Art and Performance,Biology,Business,Communications and Journalism,Computer Science,Education,Engineering,English,Foreign Languages,Health Professions,Math and Statistics,Physical Sciences,Psychology,Public Administration,Social Sciences and History
0,1970,4.229798,11.921005,59.7,29.088363,9.064439,35.3,13.6,74.535328,0.8,65.570923,73.8,77.1,38.0,13.8,44.4,68.4,36.8
1,1971,5.452797,12.003106,59.9,29.394403,9.503187,35.5,13.6,74.149204,1.0,64.556485,73.9,75.5,39.0,14.9,46.2,65.5,36.2
2,1972,7.42071,13.214594,60.4,29.810221,10.558962,36.6,14.9,73.55452,1.2,63.664263,74.6,76.9,40.2,14.8,47.6,62.6,36.1
3,1973,9.653602,14.791613,60.2,31.147915,12.804602,38.4,16.4,73.501814,1.6,62.941502,74.9,77.4,40.9,16.5,50.4,64.3,36.4
35,2005,47.672754,43.100368,61.4,61.500984,49.791851,63.4,20.6,79.067122,17.9,68.571221,69.9,86.0,45.1,41.6,77.5,81.2,50.0
38,2008,47.570834,42.71173,60.7,59.305765,48.888027,62.4,17.8,79.196327,16.5,67.594028,70.2,85.2,43.3,40.7,77.2,81.7,49.4
39,2009,48.667224,43.348921,61.0,58.489583,48.840474,62.8,18.1,79.532909,16.8,67.969792,69.3,85.1,43.3,40.7,77.1,82.0,49.4
40,2010,48.730042,42.066721,61.3,59.010255,48.757988,62.5,17.6,79.618625,17.2,67.928106,69.0,85.0,43.1,40.2,77.0,81.7,49.3
41,2011,50.037182,42.773438,61.2,58.742397,48.180418,62.2,18.2,79.432812,17.5,68.42673,69.5,84.8,43.1,40.1,76.7,81.9,49.2


In [17]:
# Same filter; reset_index(drop=True) renumbers the rows from 0 and discards the old labels
df.loc[df['Education'].gt(79) | df['Biology'].lt(32)].reset_index(drop=True)

Unnamed: 0,Year,Agriculture,Architecture,Art and Performance,Biology,Business,Communications and Journalism,Computer Science,Education,Engineering,English,Foreign Languages,Health Professions,Math and Statistics,Physical Sciences,Psychology,Public Administration,Social Sciences and History
0,1970,4.229798,11.921005,59.7,29.088363,9.064439,35.3,13.6,74.535328,0.8,65.570923,73.8,77.1,38.0,13.8,44.4,68.4,36.8
1,1971,5.452797,12.003106,59.9,29.394403,9.503187,35.5,13.6,74.149204,1.0,64.556485,73.9,75.5,39.0,14.9,46.2,65.5,36.2
2,1972,7.42071,13.214594,60.4,29.810221,10.558962,36.6,14.9,73.55452,1.2,63.664263,74.6,76.9,40.2,14.8,47.6,62.6,36.1
3,1973,9.653602,14.791613,60.2,31.147915,12.804602,38.4,16.4,73.501814,1.6,62.941502,74.9,77.4,40.9,16.5,50.4,64.3,36.4
4,2005,47.672754,43.100368,61.4,61.500984,49.791851,63.4,20.6,79.067122,17.9,68.571221,69.9,86.0,45.1,41.6,77.5,81.2,50.0
5,2008,47.570834,42.71173,60.7,59.305765,48.888027,62.4,17.8,79.196327,16.5,67.594028,70.2,85.2,43.3,40.7,77.2,81.7,49.4
6,2009,48.667224,43.348921,61.0,58.489583,48.840474,62.8,18.1,79.532909,16.8,67.969792,69.3,85.1,43.3,40.7,77.1,82.0,49.4
7,2010,48.730042,42.066721,61.3,59.010255,48.757988,62.5,17.6,79.618625,17.2,67.928106,69.0,85.0,43.1,40.2,77.0,81.7,49.3
8,2011,50.037182,42.773438,61.2,58.742397,48.180418,62.2,18.2,79.432812,17.5,68.42673,69.5,84.8,43.1,40.1,76.7,81.9,49.2


In [27]:
# Small product catalogue, built column by column
df_tools = pd.DataFrame({
    'Product ID': [1, 2, 3, 4],
    'Description': ['3 inch screw', '2 inch nail', 'hammer', 'screwdriver'],
    'Cost to Manufacture': [0.5, 0.10, 3.00, 2.50],
    'Price': [0.75, 0.25, 5.50, 3.00],
})
df_tools

Unnamed: 0,Product ID,Description,Cost to Manufacture,Price
0,1,3 inch screw,0.5,0.75
1,2,2 inch nail,0.1,0.25
2,3,hammer,3.0,5.5
3,4,screwdriver,2.5,3.0


In [28]:
# Add two columns in place: a per-row taxed flag (one value per product)
# and a single tax rate that pandas broadcasts to every row.
df_tools['isTaxed'] = np.array([True, False, False, True])
df_tools['Rate'] = 0.2

df_tools

Unnamed: 0,Product ID,Description,Cost to Manufacture,Price,isTaxed,Rate
0,1,3 inch screw,0.5,0.75,True,0.2
1,2,2 inch nail,0.1,0.25,False,0.2
2,3,hammer,3.0,5.5,False,0.2
3,4,screwdriver,2.5,3.0,True,0.2


In [29]:
# Tax is Price * Rate for taxed products and 0 for all others.
# Computed column-wise rather than with a row-by-row `apply`: the product is
# evaluated for the whole column at once, and `where` keeps it only on rows
# whose `isTaxed` flag is True, substituting 0 elsewhere.
df_tools['Tax'] = (df_tools['Price'] * df_tools['Rate']).where(df_tools['isTaxed'], 0)
# Total is the price plus whatever tax applies to that row
df_tools['Total'] = df_tools['Price'] + df_tools['Tax']
df_tools

Unnamed: 0,Product ID,Description,Cost to Manufacture,Price,isTaxed,Rate,Tax,Total
0,1,3 inch screw,0.5,0.75,True,0.2,0.15,0.9
1,2,2 inch nail,0.1,0.25,False,0.2,0.0,0.25
2,3,hammer,3.0,5.5,False,0.2,0.0,5.5
3,4,screwdriver,2.5,3.0,True,0.2,0.6,3.6
