# What is pandas?

In [2]:
import pandas as pd
# The package-level docstring gives a one-line summary of pandas and lists its main objects.
print(pd.__doc__)


pandas - a powerful data analysis and manipulation library for Python

See http://pandas.pydata.org/ for full documentation. Otherwise, see the
docstrings of the various objects in the pandas namespace:

Series
DataFrame
Panel
Index
DatetimeIndex
HDFStore
bdate_range
date_range
read_csv
read_fwf
read_table
ols



# The most important thing: DataFrames

In [3]:
# One [name, age] pair per child -- a plain Python list of lists.
namesAges = [["Sally", 6], ["Joe", 10], ["John", 7], ["Kelly", 8]]

# Each inner list becomes one row; `columns` supplies the header labels.
df = pd.DataFrame(namesAges, columns=["Names", "Ages"])

In [4]:
df

Unnamed: 0,Names,Ages
0,Sally,6
1,Joe,10
2,John,7
3,Kelly,8


# Looking inside the DataFrame

## Rows and columns

In [5]:
df.columns

Index(['Names', 'Ages'], dtype='object')

In [6]:
df.index

RangeIndex(start=0, stop=4, step=1)

## Iterating

In [7]:
# Iterating over a DataFrame directly yields its column labels, not its rows.
for column in df:
    print(column)

Names
Ages


## Indexing

In [8]:
df["Names"]

0    Sally
1      Joe
2     John
3    Kelly
Name: Names, dtype: object

In [9]:
df["Ages"]

0     6
1    10
2     7
3     8
Name: Ages, dtype: int64

## df.loc[row, col]

Accesses rows by their labels in ```df.index``` and columns by their labels in ```df.columns```.

In [10]:
df.loc[1]

Names    Joe
Ages      10
Name: 1, dtype: object

In [11]:
df.loc[2]

Names    John
Ages        7
Name: 2, dtype: object

In [12]:
df.loc[2, "Names"]

'John'

In [13]:
df.loc[2, "Ages"]

7

## df.iloc[rowInt, columnInt]

Same as ```loc```, but rows and columns are selected by integer position instead of by label.

In [14]:
df.iloc[2, 0]

'John'

In [15]:
df.iloc[2, 1]

7

## df.shape and size

In [16]:
df.shape

(4, 2)

In [17]:
df.size

8

# Changing values in a pd.DataFrame

In [18]:
df.loc[2, "Ages"]

7

In [19]:
# Read-modify-write of a single cell: adds 10 to the "Ages" value in the row labelled 2.
# Note: every execution of this cell adds another 10 to the stored value.
df.loc[2, "Ages"] += 10

In [20]:
df

Unnamed: 0,Names,Ages
0,Sally,6
1,Joe,10
2,John,17
3,Kelly,8


In [21]:
# Add a new column at position 2 (right after "Names" and "Ages"), one value per row.
# DataFrame.insert modifies df in place and raises ValueError if the column already
# exists, so this cell cannot be run twice against the same df.
favorite_classes = ["Math", "Music", "Science", "Geography"]
df.insert(loc=2, column="Favorite Class", value=favorite_classes)

In [22]:
df

Unnamed: 0,Names,Ages,Favorite Class
0,Sally,6,Math
1,Joe,10,Music
2,John,17,Science
3,Kelly,8,Geography


# Converting into np.arrays

In [39]:
df.values

array([['Sally', 6, 'Math'],
       ['Joe', 10, 'Music'],
       ['John', 17, 'Science'],
       ['Kelly', 8, 'Geography']], dtype=object)

In [40]:
df["Names"].values

array(['Sally', 'Joe', 'John', 'Kelly'], dtype=object)

## Saving as csv

In [23]:
# Write df to a CSV file in the current working directory.
# By default to_csv also writes the row index as the first column; pass index=False to leave it out.
df.to_csv("NamesAndAges.csv")

In [24]:
import os
# List the entries of the current working directory; the CSV file written above should be among them.
os.listdir('.') # Can you find it in your current directory?

['.ipynb_checkpoints',
 'boston.csv',
 'Introduction to Numpy.ipynb',
 'Keras.ipynb',
 'Matplotlib.ipynb',
 'NamesAndAges.csv',
 'pandas.ipynb',
 'Sklearn.ipynb',
 'Slides.pptx']

# Opening csvs!

In [25]:
# Parse boston.csv into a new DataFrame; the path is relative to the current working directory.
df2 = pd.read_csv("boston.csv") # This csv is in the current folder, so it's just that easy to open!
# Peek at the first few rows instead of displaying the whole table.
df2.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PT,B,LSTAT,MV
0,0.00632,18.0,2.31,0,0.538,6.575,65.199997,4.09,1,296,15.3,396.899994,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.900002,4.9671,2,242,17.799999,396.899994,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.099998,4.9671,2,242,17.799999,392.829987,4.03,34.700001
3,0.03237,0.0,2.18,0,0.458,6.998,45.799999,6.0622,3,222,18.700001,394.630005,2.94,33.400002
4,0.06905,0.0,2.18,0,0.458,7.147,54.200001,6.0622,3,222,18.700001,396.899994,5.33,36.200001


The ```df.head()``` method returns the first 5 rows of the dataframe by default, so we can have a look at the dataset without having to scroll very much.

# Some tougher stuff

```df.sort_values()``` (older pandas versions called this ```df.sort()```, which has since been removed)

In [26]:
# Return a copy of df2 ordered by the "CRIM" column, smallest values first; df2 itself is unchanged.
# DataFrame.sort() was deprecated and later removed from pandas; sort_values() is its replacement.
df2.sort_values("CRIM", ascending = True)

  """Entry point for launching an IPython kernel.


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PT,B,LSTAT,MV
0,0.006320,18.0,2.31,0,0.5380,6.575,65.199997,4.0900,1,296,15.300000,396.899994,4.980000,24.000000
284,0.009060,90.0,2.97,0,0.4000,7.088,20.799999,7.3073,1,285,15.300000,394.720001,7.850000,32.200001
285,0.010960,55.0,2.25,0,0.3890,6.453,31.900000,7.3073,1,300,15.300000,394.720001,8.230000,22.000000
341,0.013010,35.0,1.52,0,0.4420,7.241,49.299999,7.0379,1,284,15.500000,394.739990,5.490000,32.700001
55,0.013110,90.0,1.22,0,0.4030,7.249,21.900000,8.6966,5,226,17.900000,395.929993,4.810000,35.400002
54,0.013600,75.0,4.00,0,0.4100,5.888,47.599998,7.3197,3,469,21.100000,396.899994,14.800000,18.900000
195,0.013810,80.0,0.46,0,0.4220,7.875,32.000000,5.6484,4,255,14.400000,394.230011,2.970000,50.000000
57,0.014320,100.0,1.32,0,0.4110,6.816,40.500000,8.3248,5,256,15.100000,392.899994,3.950000,31.600000
194,0.014390,60.0,2.93,0,0.4010,6.604,18.799999,6.2196,1,265,15.600000,376.700012,4.380000,29.100000
348,0.015010,80.0,2.01,0,0.4350,6.635,29.700001,8.3440,4,280,17.000000,390.940002,5.990000,24.500000


In [27]:
# Scale the whole "DIS" column by 1000 in a single vectorized operation.
# Note: this overwrites the column in df2, so every execution of this cell multiplies by 1000 again.
df2["DIS"] *= 1000

In [33]:
# Label-based slice: keep every row, and every column from "DIS" through the last one; show the first rows.
df2.loc[:, "DIS":].head()

Unnamed: 0,DIS,RAD,TAX,PT,B,LSTAT,MV
0,4090.000153,1,296,15.3,396.899994,4.98,24.0
1,4967.100143,2,242,17.799999,396.899994,9.14,21.6
2,4967.100143,2,242,17.799999,392.829987,4.03,34.700001
3,6062.200069,3,222,18.700001,394.630005,2.94,33.400002
4,6062.200069,3,222,18.700001,396.899994,5.33,36.200001


Notice the change in the ```"DIS"``` column

We can use ```dtype``` to see the data type of each column.

In [34]:
# Walk the per-column dtypes and print each column label next to its data type.
for column, dtype in df2.dtypes.items():
    print(column, dtype)

CRIM float64
ZN float64
INDUS float64
CHAS int64
NOX float64
RM float64
AGE float64
DIS float64
RAD int64
TAX int64
PT float64
B float64
LSTAT float64
MV float64


You can use the ```df.describe()``` method to see summary statistics (count, mean, standard deviation, min, quartiles, max) for each numeric column of the dataframe.

In [36]:
df2.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PT,B,LSTAT,MV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3795.042696,9.549407,408.237154,18.455534,356.67403,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148862,2105.710142,8.707259,168.537116,2.164946,91.294863,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1129.600048,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2100.174904,4.0,279.0,17.4,375.377487,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3207.449913,5.0,330.0,19.05,391.440002,11.36,21.200001
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.074999,5188.425183,24.0,666.0,20.200001,396.225006,16.954999,25.0
max,88.976196,100.0,27.74,1.0,0.871,8.78,100.0,12126.50013,24.0,711.0,22.0,396.899994,37.970001,50.0


In [37]:
df2.shape

(506, 14)

# More about this dataset if you're curious:

https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html


Learn more about using pandas at the link below! It's a 10-minute tutorial!

https://pandas.pydata.org/pandas-docs/stable/10min.html
