# What is Pandas?

Pandas is a high-level data manipulation tool. Its data structures behave like NumPy arrays in which the rows and columns are identified with labels.

In [6]:
import numpy as np
import pandas as pd

## The Pandas Series Object

A pandas Series is a one-dimensional array of indexed data.

In [7]:
# Build a Series from a plain list; with no index given, pandas labels the
# entries 0, 1, 2, 3.
df = pd.Series(data=[0, 1, 2, 3])

In [8]:
# Display the Series: index labels on the left, values on the right.
df

0    0
1    1
2    2
3    3
dtype: int64

In [9]:
# Look up the value stored under index label 0.
df[0]

0

In [10]:
# Look up the value stored under index label 3 (the last entry).
df[3]

3

In [11]:
# Underlying NumPy array of the Series values. to_numpy() is the accessor
# pandas recommends over the older .values attribute.
df.to_numpy()

array([0, 1, 2, 3])

In [12]:
# Swap the default integer index for string labels, one per value.
df.index = list('abcd')

In [13]:
# Display the Series again to confirm the new string labels.
df

a    0
b    1
c    2
d    3
dtype: int64

In [14]:
# Values can now be looked up by their string label.
df['a']

0

### Example: Say we want to load the following data into a pandas Series:

| Course  | Number of students |
|---------|--------------------|
| precalc | 20                 |
| calc1   | 30                 |
| calc2   | 27                 |
| calc3   | 15                 |

In [15]:
# Map each course name (the index label) to its number of students.
# Note: this rebinds `df`, replacing the Series from the previous section.
df = pd.Series({'precalc': 20, 'calc1': 30, 'calc2': 27, 'calc3': 15})

In [16]:
# Display the Series: course names are the index, student counts the values.
df

precalc    20
calc1      30
calc2      27
calc3      15
dtype: int64

In [17]:
# Number of students in precalc, looked up by its course label.
df['precalc']

20

In [18]:
# Number of students in calc1, looked up by its course label.
df['calc1']

30

## The Pandas DataFrame Object

### Example: Say we want to load the following data into a pandas DataFrame:

| Course  | Number of students | Number of girls in class |
|---------|--------------------|--------------------------|
| precalc | 20                 | 10                       |
| calc1   | 30                 | 20                       |
| calc2   | 27                 | 10                       |
| calc3   | 15                 | 12                       |

In [19]:
# Column data for the enrollment table, one list per column.
course = ['precalc', 'calc1', 'calc2', 'calc3']
n_students = [20, 30, 27, 15]
n_girls = [10, 20, 10, 12]

# Each dict key becomes a column name; the lists supply the column values.
df = pd.DataFrame(
    data={
        'course': course,
        'number of students': n_students,
        'Number of girls': n_girls,
    }
)

In [20]:
# Display the DataFrame: one row per course, one column per dict key.
df

Unnamed: 0,course,number of students,Number of girls
0,precalc,20,10
1,calc1,30,20
2,calc2,27,10
3,calc3,15,12


In [21]:
# Column labels of the DataFrame.
df.columns

Index(['course', 'number of students', 'Number of girls'], dtype='object')

In [22]:
# Row labels; no index was passed at construction, so this is a RangeIndex.
df.index

RangeIndex(start=0, stop=4, step=1)

In [23]:
# Selecting a single column by name returns it as a Series.
df['course']

0    precalc
1      calc1
2      calc2
3      calc3
Name: course, dtype: object

In [24]:
# Column names containing spaces are selected with the same bracket syntax.
df['number of students']

0    20
1    30
2    27
3    15
Name: number of students, dtype: int64

In [25]:
# iloc indexes by integer position: row 2, column 2 is the number of girls
# in calc2.
df.iloc[2,2]

10

In [26]:
# A bare slice on both axes selects every row and every column.
df.iloc[:, :]

Unnamed: 0,course,number of students,Number of girls
0,precalc,20,10
1,calc1,30,20
2,calc2,27,10
3,calc3,15,12


In [27]:
# First row (position 0) across all columns, returned as a Series whose
# index is the column names.
df.iloc[0, :]

course                precalc
number of students         20
Number of girls            10
Name: 0, dtype: object

In [28]:
# Display the full table again; the selections above did not modify it.
df

Unnamed: 0,course,number of students,Number of girls
0,precalc,20,10
1,calc1,30,20
2,calc2,27,10
3,calc3,15,12


### We now want to add another column to the table that indicates the number of guys. How?

In [29]:
# Check the existing column names before deriving a new column from them.
df.columns

Index(['course', 'number of students', 'Number of girls'], dtype='object')

In [30]:
# Guys = total students minus girls, computed element-wise for each course.
n_guys = df['number of students'].sub(df['Number of girls'])

In [31]:
# Inspect the derived Series before attaching it to the table.
n_guys

0    10
1    10
2    17
3     3
dtype: int64

In [32]:
# Assigning to a column name that does not exist yet adds it as a new column.
df['number of guys'] = n_guys

In [33]:
# Display the table with the new 'number of guys' column.
df

Unnamed: 0,course,number of students,Number of girls,number of guys
0,precalc,20,10,10
1,calc1,30,20,10
2,calc2,27,10,17
3,calc3,15,12,3


In [34]:
# Whole table as a 2-D NumPy array. Because the columns mix strings and
# integers, the array has dtype=object. to_numpy() is the accessor pandas
# recommends over the older .values attribute.
df.to_numpy()

array([['precalc', 20, 10, 10],
       ['calc1', 30, 20, 10],
       ['calc2', 27, 10, 17],
       ['calc3', 15, 12, 3]], dtype=object)

In [35]:
# First row of the table as a NumPy array (to_numpy() is preferred over the
# older .values attribute).
df.to_numpy()[0]

array(['precalc', 20, 10, 10], dtype=object)

### We realize that the number of students in precalc is 21, not 20. How can we change that value in the table?

In [40]:
# Correct the precalc enrollment. Addressing the cell by row label and column
# name (instead of the positional pair 0, 1) keeps the edit correct even if
# the column order changes.
df.loc[0, 'number of students'] = 21

In [41]:
# Display the table: precalc now has 21 students, but 'number of guys' still
# holds the value computed from the old count.
df

Unnamed: 0,course,number of students,Number of girls,number of guys
0,precalc,21,10,10
1,calc1,30,20,10
2,calc2,27,10,17
3,calc3,15,12,3


In [42]:
# 'number of guys' is derived from the other two columns, so recompute it
# from them rather than hard-coding the new precalc value (21 - 10 = 11).
df['number of guys'] = df['number of students'] - df['Number of girls']

In [43]:
# Display the table: 'number of guys' for precalc is now 11.
df

Unnamed: 0,course,number of students,Number of girls,number of guys
0,precalc,21,10,11
1,calc1,30,20,10
2,calc2,27,10,17
3,calc3,15,12,3


In [48]:
# Blank out the precalc entry of 'number of guys' (row 0, column 3) to create
# a missing value. It is stored as NaN, so the integer column becomes float.
df.iloc[0, 3] = np.nan

In [49]:
# Display the table: the precalc entry is now missing and the remaining
# 'number of guys' values are shown as floats.
df

Unnamed: 0,course,number of students,Number of girls,number of guys
0,precalc,21,10,
1,calc1,30,20,10.0
2,calc2,27,10,17.0
3,calc3,15,12,3.0


In [50]:
# Display the table again (same state as the previous cell).
df

Unnamed: 0,course,number of students,Number of girls,number of guys
0,precalc,21,10,
1,calc1,30,20,10.0
2,calc2,27,10,17.0
3,calc3,15,12,3.0


In [51]:
# Boolean mask of the same shape as df: True wherever a cell is missing.
df.isnull()

Unnamed: 0,course,number of students,Number of girls,number of guys
0,False,False,False,True
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False


In [68]:
# Count missing values per column (each True counts as 1 when summed).
df.isnull().sum()

course                0
number of students    0
Number of girls       0
number of guys        1
dtype: int64

In [69]:
# Total number of missing cells in the whole table: the first .sum() counts
# per column, the second adds those counts together. Chaining the pandas
# reduction avoids iterating the Series with Python's builtin sum().
df.isnull().sum().sum()

1

In [70]:
# dropna() returns a new DataFrame without the rows that contain a missing
# value. The result is only displayed here, not assigned, so df is unchanged.
df.dropna()

Unnamed: 0,course,number of students,Number of girls,number of guys
1,calc1,30,20,10.0
2,calc2,27,10,17.0
3,calc3,15,12,3.0


In [72]:
# The original table still contains the precalc row with its missing value.
df

Unnamed: 0,course,number of students,Number of girls,number of guys
0,precalc,21,10,
1,calc1,30,20,10.0
2,calc2,27,10,17.0
3,calc3,15,12,3.0
