In [15]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import datetime

pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)

%matplotlib inline
import matplotlib.pyplot as plt
# pd.options.display.mpl_style = 'default'

'1.2.4'

# A Tour of Pandas

## The pandas Series Object

In [2]:
s = Series([1,2,3,4])
print("Create a Series Object:\n", s)

Create a Series Object:
 0    1
1    2
2    3
3    4
dtype: int64


In [3]:
print("return a Series with the rows with labels 1 and 3\n", s[[1,3]])


return a Series with the rows with labels 1 and 3
 1    2
3    4
dtype: int64


In [4]:
s = Series([1,2,3,4], index = ['a', 'b','c','d'])
print("Create a Series Object with explicit indexes:\n", s)

Create a Series Object with explicit indexes:
 a    1
b    2
c    3
d    4
dtype: int64


In [5]:
print("lookup items the series having index 'a' and 'd'\n", s[['a', 'd']]) 

lookup items the series having index 'a' and 'd'
 a    1
d    4
dtype: int64


In [6]:
print("get only the index of the series:", s.index)

get only the index of the series: Index(['a', 'b', 'c', 'd'], dtype='object')


In [7]:
dates = pd.date_range('2014-07-01', '2014-07-06')
print('create a Series who\'s index is a series of dates between two specified dates:\n', dates)

create a Series who's index is a series of dates between two specified dates:
 DatetimeIndex(['2014-07-01', '2014-07-02', '2014-07-03', '2014-07-04',
               '2014-07-05', '2014-07-06'],
              dtype='datetime64[ns]', freq='D')


In [8]:
temps1 = Series([80, 82, 85, 90, 83, 87], index = dates)
print('create a Series with values for each date in the index:\n', temps1)

create a Series with values for each date in the index:
 2014-07-01    80
2014-07-02    82
2014-07-03    85
2014-07-04    90
2014-07-05    83
2014-07-06    87
Freq: D, dtype: int64


## DataFrame

In [9]:
temps2 = Series([70, 75, 69, 83, 79, 77], index=dates)

temps_df = DataFrame({'Missoula': temps1, 'Philadelphia': temps2})
print('DataFrame from two series:\n', temps_df)

DataFrame from two series:
             Missoula  Philadelphia
2014-07-01        80            70
2014-07-02        82            75
2014-07-03        85            69
2014-07-04        90            83
2014-07-05        83            79
2014-07-06        87            77


In [10]:
temps_df['Difference'] = temps1 - temps2
print('Add a column to temps_df that contains the difference in temps:\n', temps_df)

Add a column to temps_df that contains the difference in temps:
             Missoula  Philadelphia  Difference
2014-07-01        80            70          10
2014-07-02        82            75           7
2014-07-03        85            69          16
2014-07-04        90            83           7
2014-07-05        83            79           4
2014-07-06        87            77          10


In [11]:
print('get the row at array position 1(iloc):\n', temps_df.iloc[0])

get the row at array position 1(iloc):
 Missoula        80
Philadelphia    70
Difference      10
Name: 2014-07-01 00:00:00, dtype: int64


In [12]:
print('retrieve row by index label using .loc:\n', temps_df.loc['2014-07-05'])

retrieve row by index label using .loc:
 Missoula        83
Philadelphia    79
Difference       4
Name: 2014-07-05 00:00:00, dtype: int64


## Read from csv

In [13]:
df = pd.read_csv('test.csv', parse_dates=['date'], index_col='date')
print(df)

FileNotFoundError: [Errno 2] No such file or directory: 'test.csv'

## Using numpy in Pandas

In [None]:
def squares(values):
    result = []
    for v in values:
        result.append(v * v)
    return result

to_square = range(100000)
print("time consuming using for loop")
%timeit squares(squares(to_square))
array_to_square = np.arange(0, 100000)
print("using numpy vectorized operation")
%timeit array_to_square ** 2

In [None]:
print("shorthand to repeat a sequence 10 times:\n", np.array([0] * 10))

In [None]:
print("counting down", np.arange(10, 0, -1))

In [None]:
print("evenly spaced #'s between two intervals:\n'", np.linspace(0, 10, 11))

In [None]:
def exp(x):
    return x < 3 or x > 3
print("create a function that is applied to all array elements:\n", np.vectorize(exp)(np.arange(0, 10)))

In [None]:
s10 = pd.Series([1,2,3,4,5], index=(['a', 'a', 'b', 'c', 'd']))
s11 = pd.Series([1,2,3,4,5], index=(['a', 'a', 'c', 'd', 'e']))
print("Two series added by index,notice there will be four a\n ",s10+s11)

## tidying up the data

In [None]:
# prepare data
df = DataFrame(np.arange(0, 15).reshape(5, 3), index = ['a', 'b', 'c', 'd', 'e'], columns=['c1', 'c2', 'c3'])
print(df)

In [None]:
df['c4'] = np.nan
df.loc['f'] = np.arange(15, 19)
df.loc['g'] = np.nan
df['c5'] = np.nan
df['c4']['a'] = 20
print(df)

In [None]:
print("which items are null:\n", df.isnull())

In [None]:
print("count the number of NaN values in each columns:\n", df.isnull().sum())

In [None]:
print("total count of NaN values:\n", df.isnull().sum().sum())

In [None]:
print("select the non-NaN items in column c4: \n", df.c4[df.c4.notnull()])

In [None]:
print("easy way to dropna, it doesn't affect the origin df:\n", df.c4.dropna())

### some simple method for dropping nan values.

In [None]:
df2 = df.copy()
df2.dropna(how='all')
df2.dropna(how='all', axis=1)
df2.dropna(how='any', axis=1)
df2.dropna(thresh=5, axis=1)

In [None]:
s = Series([1,2,np.nan, 3])
print("nan is ignored in pandas:\n", s.mean())

### filling in missing data

In [None]:
print("filling data with limited:\n", df.fillna(0, limit=2))

In [None]:
print("forward fill:\n", df.fillna(method='ffill'))

In [None]:
print("backward fill:\n", df.fillna(method='bfill'))

In [None]:
print("mean fill:\n", df.fillna(df.mean()))

### Interpolation of missing values

In [None]:
s = pd.Series([1,np.nan,np.nan,np.nan,np.nan,np.nan,2])
print("interpolation s:\n", s.interpolate())

In [None]:
ts = pd.Series([1, np.nan, 2], index=[datetime.datetime(2014, 1, 1), datetime.datetime(2014, 2, 1), datetime.datetime(2014, 4, 1)])
print(ts)

In [None]:
print("interpolate by datetime\n", ts.interpolate(method='time'))

In [None]:
s = pd.Series([0, np.nan, 100], index=[0, 1, 10])
print("interpolate by values\n", s.interpolate(method='values'))