# Pandas Review

Create pandas from:

**1. python dictionary**

In [1]:
import numpy as np
import pandas as pd

my_dict = {
    'id': [1,2,3,4],
    'name': ['ringo', 'paul', 'john', 'george'],
    'age': [23,24,25,26],
    'role': ['drummer', 'singer', 'singer', 'guitar']
}
df_beatles = pd.DataFrame(my_dict)
df_beatles

Unnamed: 0,id,name,age,role
0,1,ringo,23,drummer
1,2,paul,24,singer
2,3,john,25,singer
3,4,george,26,guitar


**2. python list**

In [2]:
lst = [[1,2,3], ['tom', 'dick', 'harry'], [23,24,25]]
cols = ['id', 'name', 'age']
df_lst = pd.DataFrame(lst, cols)
df_lst

Unnamed: 0,0,1,2
id,1,2,3
name,tom,dick,harry
age,23,24,25


**3. csv file**

General format:

```py
df = pd.read_csv('file_name.csv', nrows=5, header=None, sep=',', comment='#', na_values=[''])
```
The 1st argument, filename(path), is req'd, the others are optional:

- `nrows` number of rows of file to read.
- `header=None` use row number as the column names.
- `sep` delimiter to use, default `sep` is a comma, `,`.
- `comment` character to split comments.
- `na_values` string to regognise as NaN.


In [3]:
df = pd.read_csv('data/test.csv')

## Exploring a Pandas DataFrame

In [4]:
df.index

RangeIndex(start=0, stop=418, step=1)

In [5]:
# return the column headings
df.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
# dataframe info - num cols, num rows, num non-num values for each row, col data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [7]:
# return the first x rows, default is 5
df.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [8]:
# return the last 'x' rows, default is 5
df.tail(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
417,1309,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


In [9]:
# return a numpy array of all the values
# dataframe of mixed datatypes results in a numpy array of objects
df_array = df.values
df_array

array([[892, 3, 'Kelly, Mr. James', ..., 7.8292, nan, 'Q'],
       [893, 3, 'Wilkes, Mrs. James (Ellen Needs)', ..., 7.0, nan, 'S'],
       [894, 2, 'Myles, Mr. Thomas Francis', ..., 9.6875, nan, 'Q'],
       ...,
       [1307, 3, 'Saether, Mr. Simon Sivertsen', ..., 7.25, nan, 'S'],
       [1308, 3, 'Ware, Mr. Frederick', ..., 8.05, nan, 'S'],
       [1309, 3, 'Peter, Master. Michael J', ..., 22.3583, nan, 'C']],
      dtype=object)

### Select a Subsection/Row(s) of a Pandas DataFrame or Series

A Series is a 1-D array, use `[start:end]` notation as you do with Python lists.

You can also use the same notation to select a subsection of a dataframe, `start` and `end` referring to rows.

When selecting a single row, use `.iloc`

In [21]:
# select a Single row
df.iloc[205]

PassengerId                         1097
Pclass                                 1
Name           Omont, Mr. Alfred Fernand
Sex                                 male
Age                                  NaN
SibSp                                  0
Parch                                  0
Ticket                        F.C. 12998
Fare                             25.7417
Cabin                                NaN
Embarked                               C
Name: 205, dtype: object

In [24]:
# Select Multiple rows, starting at row 205, upto but not including row 210
df[205: 210] # or df.iloc[205:210]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
205,1097,1,"Omont, Mr. Alfred Fernand",male,,0,0,F.C. 12998,25.7417,,C
206,1098,3,"McGowan, Miss. Katherine",female,35.0,0,0,9232,7.75,,Q
207,1099,2,"Collett, Mr. Sidney C Stuart",male,24.0,0,0,28034,10.5,,S
208,1100,1,"Rosenbaum, Miss. Edith Louise",female,33.0,0,0,PC 17613,27.7208,A11,C
209,1101,3,"Delalic, Mr. Redjo",male,25.0,0,0,349250,7.8958,,S


In [26]:
# starting from 3rd from end, going from back to front(right to left),
# upto but not including row 410 (-8 from back)
df[-3:-8:-1] # or df.iloc[-3:-8:-1]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
412,1304,3,"Henriksson, Miss. Jenny Lovisa",female,28.0,0,0,347086,7.775,,S
411,1303,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",female,37.0,1,0,19928,90.0,C78,Q


### Select a Single Column

In [10]:
# select a column of data - limit return to 1st 5 rows
# returns a pandas 'Series' when selecting a 'Single' column
# select elements same as a python list
df.Name[:5] 

0                                Kelly, Mr. James
1                Wilkes, Mrs. James (Ellen Needs)
2                       Myles, Mr. Thomas Francis
3                                Wirz, Mr. Albert
4    Hirvonen, Mrs. Alexander (Helga E Lindqvist)
Name: Name, dtype: object

In [11]:
# use [''] when key name contains spaces, special chars or starts with a number
df['Name'][5:9]

5                   Svensson, Mr. Johan Cervin
6                         Connolly, Miss. Kate
7                 Caldwell, Mr. Albert Francis
8    Abrahim, Mrs. Joseph (Sophie Halaut Easu)
Name: Name, dtype: object

### Select Multiple Columns

In [15]:
# Use a comma separated list of column names.
df[['Name', 'Age', 'Pclass']][16:21]

Unnamed: 0,Name,Age,Pclass
16,"Keane, Mr. Daniel",35.0,2
17,"Assaf, Mr. Gerios",21.0,3
18,"Ilmakangas, Miss. Ida Livija",27.0,3
19,"Assaf Khalil, Mrs. Mariana (Miriam"")""",45.0,3
20,"Rothschild, Mr. Martin",55.0,1


### Select an individual field/value

In [32]:
# NOTE: rows and columns are ZERO indexed
# Select the field by position, row(0) & column(2), uses 'iloc'
df.iloc[[0], [2]]

Unnamed: 0,Name
0,"Kelly, Mr. James"


In [33]:
# Select the field by Label, uses 'loc'
df.loc[[0], ['Name']]

Unnamed: 0,Name
0,"Kelly, Mr. James"


### Dropping Columns

In [41]:
# NOTE: use 'axis=1' to drop a column, original dataframe is unchanged
# use a comma separated list of one or more column names
# drop a single column from a dataframe
df.drop(['PassengerId', 'SibSp', 'Parch'], axis=1)[200:210]

Unnamed: 0,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked
200,3,"Murphy, Miss. Nora",female,,36568,15.5,,Q
201,3,"Danbom, Master. Gilbert Sigvard Emanuel",male,0.33,347080,14.4,,S
202,1,"Astor, Col. John Jacob",male,47.0,PC 17757,227.525,C62 C64,C
203,2,"Quick, Miss. Winifred Vera",female,8.0,26360,26.0,,S
204,2,"Andrew, Mr. Frank Thomas",male,25.0,C.A. 34050,10.5,,S
205,1,"Omont, Mr. Alfred Fernand",male,,F.C. 12998,25.7417,,C
206,3,"McGowan, Miss. Katherine",female,35.0,9232,7.75,,Q
207,2,"Collett, Mr. Sidney C Stuart",male,24.0,28034,10.5,,S
208,1,"Rosenbaum, Miss. Edith Louise",female,33.0,PC 17613,27.7208,A11,C
209,3,"Delalic, Mr. Redjo",male,25.0,349250,7.8958,,S


### Dropping Rows

In [43]:
# NOTE: use 'axis=0' to drop a row, original dataframe remains unchanged
# drop rows based on row number
df.drop([204, 205, 206], axis=0)[200:210]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
200,1092,3,"Murphy, Miss. Nora",female,,0,0,36568,15.5,,Q
201,1093,3,"Danbom, Master. Gilbert Sigvard Emanuel",male,0.33,0,2,347080,14.4,,S
202,1094,1,"Astor, Col. John Jacob",male,47.0,1,0,PC 17757,227.525,C62 C64,C
203,1095,2,"Quick, Miss. Winifred Vera",female,8.0,1,1,26360,26.0,,S
207,1099,2,"Collett, Mr. Sidney C Stuart",male,24.0,0,0,28034,10.5,,S
208,1100,1,"Rosenbaum, Miss. Edith Louise",female,33.0,0,0,PC 17613,27.7208,A11,C
209,1101,3,"Delalic, Mr. Redjo",male,25.0,0,0,349250,7.8958,,S
210,1102,3,"Andersen, Mr. Albert Karvin",male,32.0,0,0,C 4001,22.525,,S
211,1103,3,"Finoli, Mr. Luigi",male,,0,0,SOTON/O.Q. 3101308,7.05,,S
212,1104,2,"Deacon, Mr. Percy William",male,17.0,0,0,S.O.C. 14879,73.5,,S


### Selecting a subset based on logic

In [59]:
df[(df.Age > 60) & (df.Sex == 'female')]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
96,988,1,"Cavendish, Mrs. Tyrell William (Julia Florence...",female,76.0,1,0,19877,78.85,C46,S
114,1006,1,"Straus, Mrs. Isidor (Rosalie Ida Blun)",female,63.0,1,0,PC 17483,221.7792,C55 C57,S
179,1071,1,"Compton, Mrs. Alexander Taylor (Mary Eliza Ing...",female,64.0,0,2,PC 17756,83.1583,E45,C
305,1197,1,"Crosby, Mrs. Edward Gifford (Catherine Elizabe...",female,64.0,1,1,112901,26.55,B26,S


In [60]:
df[(df.Age > 55) & (df.Age < 60) & ((df.Pclass == 1) | (df.Pclass == 2)) ]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
217,1109,1,"Wick, Mr. George Dennick",male,57.0,1,1,36928,164.8667,,S
316,1208,1,"Spencer, Mr. William Augustus",male,57.0,1,0,PC 17569,146.5208,B78,C
343,1235,1,"Cardeza, Mrs. James Warburton Martinez (Charlo...",female,58.0,0,1,PC 17755,512.3292,B51 B53 B55,C
356,1248,1,"Brown, Mrs. John Murray (Caroline Lane Lamson)",female,59.0,2,0,11769,51.4792,C101,S
387,1279,2,"Ashby, Mr. John",male,57.0,0,0,244346,13.0,,S


When we select a subset of a DataFrame using logic, we end up with non-consecutive indices. We can fix this using the method `.reset_index()`.

By default, a new `index` column is created with the old indicies and and the indicies reset. You can avoid the `index` column being created by using the `drop=True` option.

In [69]:
df[(df.Age > 40) & (df.Age < 45) & (df.Pclass == 1)].reset_index(drop=True)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,920,1,"Brady, Mr. John Bertram",male,41.0,0,0,113054,30.5,A21,S
1,992,1,"Stengel, Mrs. Charles Emil Henry (Annie May Mo...",female,43.0,1,0,11778,55.4417,C116,C
2,1036,1,"Lindeberg-Lind, Mr. Erik Gustaf (Mr Edward Lin...",male,42.0,0,0,17475,26.55,,S
3,1050,1,"Borebank, Mr. John James",male,42.0,0,0,110489,26.55,D22,S
4,1107,1,"Head, Mr. Christopher",male,42.0,0,0,113038,42.5,B11,S
5,1137,1,"Kenyon, Mr. Frederick R",male,41.0,1,0,17464,51.8625,D21,S
6,1296,1,"Frauenthal, Mr. Isaac Gerald",male,43.0,1,0,17765,27.7208,D40,C
