# Wrangling Data with Pandas
https://www.youtube.com/watch?v=XDAnFZqJDvI

In [1]:
import pandas as pd

In [6]:
# Read the CSV into a DataFrame; the relative path is resolved against the
# kernel's current working directory.
csv_data = pd.read_csv("iris_train.csv")

In [7]:
# head() with no argument shows the first five rows.
csv_data.head()

Unnamed: 0,sepal_len,sepal_wid,petal_len,petal_wid,species
0,6.0,2.2,4.0,1.0,Iris-versicolor
1,5.2,3.4,1.4,0.2,Iris-setosa
2,6.9,3.1,5.4,2.1,Iris-virginica
3,7.3,2.9,6.3,1.8,Iris-virginica
4,7.6,3.0,6.6,2.1,Iris-virginica


In [8]:
# include='all' summarizes every column, not only the numeric ones. That is
# why the text column `species` gets count/unique/top/freq while the numeric
# statistics (mean, std, quantiles) are blank for it.
csv_data.describe(include='all')

Unnamed: 0,sepal_len,sepal_wid,petal_len,petal_wid,species
count,100.0,100.0,100.0,100.0,100
unique,,,,,3
top,,,,,Iris-virginica
freq,,,,,36
mean,5.873,3.025,3.89,1.256,
std,0.797832,0.418119,1.725419,0.749588,
min,4.4,2.0,1.0,0.1,
25%,5.175,2.8,1.6,0.3,
50%,5.9,3.0,4.5,1.4,
75%,6.4,3.325,5.1,1.8,


Shuffling the DF:

In [9]:
# Shuffle the rows: frac=1 draws 100% of the rows (without replacement by
# default), i.e. a random permutation of the frame.
# random_state pins that permutation so a fresh Restart & Run All always
# produces the same row order instead of a new one on every run.
# reset_index(drop=True) then renumbers the rows 0..n-1 and discards the old
# index rather than keeping it as an extra column.
csv_data = csv_data.sample(frac=1, random_state=42).reset_index(drop=True)

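Use this when your data is already sorted and you want to mix it up: `sample(frac=1)` returns all of the rows in random order, and `reset_index(drop=True)` renumbers them from 0 without keeping the old index as a column.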

#### DF access:
Columns

In [10]:
# .columns holds the column labels as a pandas Index.
csv_data.columns

Index(['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'species'], dtype='object')

In [11]:
# Indexing with a single label returns that column as a Series.
csv_data['sepal_len']

0     7.2
1     4.9
2     6.0
3     5.5
4     4.9
     ... 
95    6.9
96    5.1
97    5.5
98    6.3
99    6.0
Name: sepal_len, Length: 100, dtype: float64

Rows:  
by index location:

In [12]:
# .iloc is position-based and 0-indexed, so 5 is the sixth row. A single row
# comes back as a Series keyed by column name.
csv_data.iloc[5]

sepal_len            4.5
sepal_wid            2.3
petal_len            1.3
petal_wid            0.3
species      Iris-setosa
Name: 5, dtype: object

You may want a particular row or column.  
If you want the value at position 5 (the sixth row, since positions start at 0) of a particular column:

In [13]:
# Select the column first, then take position 5 from the resulting Series;
# the result is a single scalar value.
csv_data['sepal_len'].iloc[5]

4.5

What if you want several columns, or a range of them?  
You can pass in a list of column names, or slice `csv_data.columns` to pick a contiguous range of them:

In [15]:
# A list of labels (note the double brackets) returns a DataFrame containing
# just those columns, in the order listed.
csv_data[['sepal_len', 'petal_len']]

Unnamed: 0,sepal_len,petal_len
0,7.2,5.8
1,4.9,3.3
2,6.0,5.0
3,5.5,3.7
4,4.9,1.5
...,...,...
95,6.9,5.4
96,5.1,1.5
97,5.5,4.0
98,6.3,6.0


In [16]:
# Slice the column Index by position: [2:4] is end-exclusive, so this keeps
# the labels at positions 2 and 3 (petal_len and petal_wid).
cols_2_4 = csv_data.columns[2:4]

In [17]:
# Index the frame with those labels to get the matching data columns
# as a two-column DataFrame.
csv_data[cols_2_4]

Unnamed: 0,petal_len,petal_wid
0,5.8,1.6
1,3.3,1.0
2,5.0,1.5
3,3.7,1.0
4,1.5,0.1
...,...,...
95,5.4,2.1
96,1.5,0.3
97,4.0,1.3
98,6.0,2.5


What if you want a range of rows?

In [18]:
# Positional slice: the end is exclusive, so this returns the rows at
# positions 5 through 9.
csv_data.iloc[5:10]

Unnamed: 0,sepal_len,sepal_wid,petal_len,petal_wid,species
5,4.5,2.3,1.3,0.3,Iris-setosa
6,5.8,2.7,5.1,1.9,Iris-virginica
7,6.1,3.0,4.6,1.4,Iris-versicolor
8,5.6,2.8,4.9,2.0,Iris-virginica
9,7.7,3.8,6.7,2.2,Iris-virginica


What if you want to select *both* column and rows?

In [25]:
# First, choose the column names (positions 2 and 3; the slice end is exclusive):
cols_2_4 = csv_data.columns[2:4]

In [26]:
# Then select those columns, giving a DataFrame that holds only them:
cols_2_4DF = csv_data[cols_2_4]

In [27]:
# Now take the rows at positions 5 through 9 from that DF (the slice end is exclusive):
cols_2_4DF.iloc[5:10]

Unnamed: 0,petal_len,petal_wid
5,1.3,0.3
6,5.1,1.9
7,4.6,1.4
8,4.9,2.0
9,6.7,2.2


Now you can collapse those three lines down into one expression:

In [29]:
# One positional indexer does both selections at once: rows 5-9 and the
# columns at positions 2-3 (both slice ends are exclusive). This returns the
# same frame as csv_data[csv_data.columns[2:4]].iloc[5:10], but without
# building the intermediate two-column DataFrame first.
csv_data.iloc[5:10, 2:4]

Unnamed: 0,petal_len,petal_wid
5,1.3,0.3
6,5.1,1.9
7,4.6,1.4
8,4.9,2.0
9,6.7,2.2
