# Load csv

In [63]:
import csv
import pandas as pd

# Relative path: the CSV is resolved against the kernel's working directory.
file_name = 'us_marriages_divorces.csv'
# latin-1 is passed explicitly instead of relying on the default encoding —
# presumably the file contains non-UTF-8 bytes; TODO confirm.
df = pd.read_csv(file_name, encoding='latin-1')

# How many rows and columns in total

In [64]:
# shape is a (number_of_rows, number_of_columns) tuple.
df.shape

(145, 6)

# Print head of DataFrame

In [65]:
# With no argument, head() shows the first five rows.
df.head()

Unnamed: 0,Year,Marriages,Divorces,Population,Marriages_per_1000,Divorces_per_1000
0,1867,357000.0,10000.0,36970000,9.7,0.3
1,1868,345000.0,10000.0,37885000,9.1,0.3
2,1869,348000.0,11000.0,38870000,9.0,0.3
3,1870,352000.0,11000.0,39905000,8.8,0.3
4,1871,359000.0,12000.0,41010000,8.8,0.3


# Print columns in DF

In [66]:
# Column labels, returned as a pandas Index.
df.columns

Index(['Year', 'Marriages', 'Divorces', 'Population', 'Marriages_per_1000',
       'Divorces_per_1000'],
      dtype='object')

# Slice 'Year' column

In [67]:
# Attribute-style column access; the integer slice 0:3 is positional,
# so this yields the first three rows (end point excluded).
df.Year[0:3]

0    1867
1    1868
2    1869
Name: Year, dtype: int64

In [68]:
# Bracket-style column access; gives the same Series as df.Year[0:3].
df['Year'][0:3]

0    1867
1    1868
2    1869
Name: Year, dtype: int64

# Data Type

In [69]:
# A single DataFrame column is a pandas Series.
type(df.Year)

pandas.core.series.Series

# View certain columns

In [70]:
# Keep three columns and show only the first four rows.
df[['Year', 'Marriages', 'Divorces']].head(4)

Unnamed: 0,Year,Marriages,Divorces
0,1867,357000.0,10000.0
1,1868,345000.0,10000.0
2,1869,348000.0,11000.0
3,1870,352000.0,11000.0


# Max, Min, Mean

In [71]:
# mean() skips missing values by default; describe() in this notebook reports
# only 141 non-null 'Divorces' values out of 145 rows, so 4 rows are ignored here.
df['Divorces'].mean()

414576.6241134752

In [72]:
# Latest year present in the data set.
df['Year'].max()

2011

In [73]:
# Smallest yearly marriage count; missing values are skipped by default.
df['Marriages'].min()

345000.0

# Print Data Set Statistics

In [74]:
# Summary statistics for every numeric column. The 'count' row only counts
# non-missing values, so a count below the total row count (145) means that
# column contains NaNs.
df.describe()

Unnamed: 0,Year,Marriages,Divorces,Population,Marriages_per_1000,Divorces_per_1000
count,145.0,141.0,141.0,145.0,141.0,141.0
mean,1939.0,1397450.0,414576.6,150091700.0,9.635461,2.11844
std,42.001984,695750.5,413093.5,80287760.0,1.319152,1.519615
min,1867.0,345000.0,10000.0,36970000.0,6.8,0.3
25%,1903.0,776000.0,61000.0,80632000.0,8.8,0.8
50%,1939.0,1369000.0,244000.0,131028000.0,9.6,1.9
75%,1975.0,2157000.0,840000.0,215981000.0,10.4,2.9
max,2011.0,2482137.0,1233226.0,312034000.0,16.2,5.3


In [75]:
# The same statistics for a single column; the result is a Series.
df['Marriages'].describe()

count    1.410000e+02
mean     1.397450e+06
std      6.957505e+05
min      3.450000e+05
25%      7.760000e+05
50%      1.369000e+06
75%      2.157000e+06
max      2.482137e+06
Name: Marriages, dtype: float64

# Conditionally select data

In [76]:
# Boolean-mask row selection: keep every row from 2007 onwards.
df.loc[df['Year'] >= 2007]

Unnamed: 0,Year,Marriages,Divorces,Population,Marriages_per_1000,Divorces_per_1000
140,2007,2197000.0,856000.0,301696000,7.3,2.8
141,2008,2157000.0,844000.0,304543000,7.1,2.8
142,2009,2080000.0,840000.0,307240000,6.8,2.7
143,2010,2096000.0,872000.0,309776000,6.8,2.8
144,2011,2118000.0,877000.0,312034000,6.8,2.8


In [77]:
# Select rows and the column in one .loc call instead of chaining two
# indexing operations (column first, then boolean mask).
df.loc[df['Year'] > 2008, 'Marriages']

142    2080000.0
143    2096000.0
144    2118000.0
Name: Marriages, dtype: float64

In [78]:
# Row(s) belonging to the most recent year in the data.
df.loc[df['Year'] == df['Year'].max()]

Unnamed: 0,Year,Marriages,Divorces,Population,Marriages_per_1000,Divorces_per_1000
144,2011,2118000.0,877000.0,312034000,6.8,2.8


In [79]:
# Row(s) where the yearly marriage count reaches its maximum.
df.loc[df['Marriages'].eq(df['Marriages'].max())]

Unnamed: 0,Year,Marriages,Divorces,Population,Marriages_per_1000,Divorces_per_1000
117,1984,2482137.0,1181970.0,236394000,10.5,5.0


In [80]:
# Year(s) with the highest divorce rate. A single .loc call takes the row mask
# and the column together instead of chaining two indexing operations.
# More than one year can match when the maximum is tied.
df.loc[df['Divorces_per_1000'] == df['Divorces_per_1000'].max(), 'Year']

112    1979
114    1981
Name: Year, dtype: int64

In [81]:
# Year and rate for the highest marriage rate. One .loc call selects rows and
# columns together, avoiding the intermediate column-subset frame that the
# chained form df[cols][mask] builds.
df.loc[df['Marriages_per_1000'] == df['Marriages_per_1000'].max(),
       ['Year', 'Marriages_per_1000']]

Unnamed: 0,Year,Marriages_per_1000
79,1946,16.2


# Plot Marriages vs Year and Divorces vs Year

In [82]:
from bokeh.plotting import show, figure, output_notebook
from bokeh.models import ColumnDataSource

# Send Bokeh output to the notebook.
output_notebook()
# Wrap the DataFrame so glyphs can refer to its columns by name.
# `source` is reused by the rate plot in the following cell.
source = ColumnDataSource(df)

# Absolute yearly counts: marriages (default colour) and divorces (red).
p = figure(title='Marriages vs Year and Divorces vs Year',
           x_axis_label='Year', y_axis_label='Count per year')
# `legend_label` replaces the `legend` keyword, which was deprecated in
# Bokeh 1.4 and removed in Bokeh 3.0.
p.circle(x='Year', y='Marriages', source=source,
         legend_label='Marriage')
p.circle(x='Year', y='Divorces', source=source, color='red',
         legend_label='Divorce')

show(p)

# Plot Marriages per 1000 vs Year and Divorces per 1000 vs Year

In [83]:
# Rates per 1000 population: marriages (default colour) and divorces (red).
# Relies on `source` (the ColumnDataSource built from df) from the previous plot cell.
p = figure(title='Marriages per 1000 vs Year and Divorces per 1000 vs Year',
           x_axis_label='Year', y_axis_label='Rate per 1000 population')
# `legend_label` replaces the `legend` keyword, which was deprecated in
# Bokeh 1.4 and removed in Bokeh 3.0.
p.circle(x='Year', y='Marriages_per_1000', source=source,
         legend_label='Marriage_per_1000')
p.circle(x='Year', y='Divorces_per_1000', source=source, color='red',
         legend_label='Divorce_per_1000')

show(p)

# Index

In [84]:
# No index column was chosen when loading, so the index is the default
# RangeIndex of row positions.
df.index

RangeIndex(start=0, stop=145, step=1)

In [85]:
# The leftmost labels are the default 0..n-1 row positions at this point.
df.head()

Unnamed: 0,Year,Marriages,Divorces,Population,Marriages_per_1000,Divorces_per_1000
0,1867,357000.0,10000.0,36970000,9.7,0.3
1,1868,345000.0,10000.0,37885000,9.1,0.3
2,1869,348000.0,11000.0,38870000,9.0,0.3
3,1870,352000.0,11000.0,39905000,8.8,0.3
4,1871,359000.0,12000.0,41010000,8.8,0.3


# Set 'Year' as the index

In [87]:
# Promote 'Year' from a regular column to the row index so rows can be looked up by year.
# NOTE: inplace=True mutates df, so this cell is not re-runnable on its own —
# a second run raises KeyError because 'Year' is no longer a column.
df.set_index('Year', inplace=True)

In [88]:
# 'Year' is now shown as the index and is no longer among the columns.
df.head()

Unnamed: 0_level_0,Marriages,Divorces,Population,Marriages_per_1000,Divorces_per_1000
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1867,357000.0,10000.0,36970000,9.7,0.3
1868,345000.0,10000.0,37885000,9.1,0.3
1869,348000.0,11000.0,38870000,9.0,0.3
1870,352000.0,11000.0,39905000,8.8,0.3
1871,359000.0,12000.0,41010000,8.8,0.3


In [90]:
# .loc selects by index label (the year 1867), not by row position;
# a single matching label returns that row as a Series.
df.loc[1867]

Marriages               357000.0
Divorces                 10000.0
Population            36970000.0
Marriages_per_1000           9.7
Divorces_per_1000            0.3
Name: 1867, dtype: float64

In [92]:
# Move the current index back into a regular column and restore the default RangeIndex.
# NOTE(review): the recorded output of the next cell shows an extra 'index' column.
# reset_index only adds a column named 'index' when the index being reset is unnamed,
# so this cell looks like it was executed more than once (execution counts also skip).
# On a fresh Restart & Run All the 'index' column should not appear — confirm.
df.reset_index(inplace=True)

In [94]:
# Check the columns after reset_index; 'Year' is a regular column again.
df.head()

Unnamed: 0,index,Year,Marriages,Divorces,Population,Marriages_per_1000,Divorces_per_1000
0,0,1867,357000.0,10000.0,36970000,9.7,0.3
1,1,1868,345000.0,10000.0,37885000,9.1,0.3
2,2,1869,348000.0,11000.0,38870000,9.0,0.3
3,3,1870,352000.0,11000.0,39905000,8.8,0.3
4,4,1871,359000.0,12000.0,41010000,8.8,0.3


In [100]:
# Use the 'Marriages' counts as row labels. The column is float64 and, as far as the
# visible cells show, still contains NaNs (describe() reported 141 non-null of 145),
# so the resulting index has float labels and some NaN labels.
# NOTE: in-place and not re-runnable (KeyError once 'Marriages' is no longer a column).
df.set_index('Marriages', inplace=True)

In [111]:
# Year(s) with the highest divorce rate, now labelled by the 'Marriages' index.
# A single .loc call takes the row mask and the column together instead of
# chaining two indexing operations.
df.loc[df['Divorces_per_1000'] == df['Divorces_per_1000'].max(), 'Year']

Marriages
2341102.0    1979
2438085.0    1981
Name: Year, dtype: int64

In [114]:
# Label lookup on the float-valued 'Marriages' index: the int 357000 matches the
# label 357000.0 (see the Name of the returned Series). The label is unique here,
# so the row comes back as a Series.
df.loc[357000]

index                        0.0
Year                      1867.0
Population            36970000.0
Marriages_per_1000           9.7
Divorces_per_1000            0.3
Name: 357000.0, dtype: float64