In [1]:
# Bind the greeting text to a name, then echo it as the cell's output.
greetings: str = "Assalam-o-Alaikum!"
greetings

'Assalam-o-Alaikum!'

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Build the row labels used by the next cell: a DatetimeIndex of six
# consecutive calendar days beginning on 2013-01-01.
dates = pd.date_range(start="20130101", periods=6, freq="D")
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [6]:
# Creating a DataFrame: a 6x4 block of standard-normal draws, labelled by the
# `dates` index on the rows and by the letters A-D on the columns.
df1 = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df1

Unnamed: 0,A,B,C,D
2013-01-01,1.66195,-0.372455,1.200514,-0.486748
2013-01-02,-0.635395,1.496526,-0.354694,0.101508
2013-01-03,-0.924444,-1.78966,0.732251,-0.533106
2013-01-04,1.223275,-0.085664,-0.002108,-0.772367
2013-01-05,0.242475,-1.733191,-0.867178,-0.759221
2013-01-06,0.200475,1.653866,-0.661205,0.544795


# Grouping

In [7]:
# Example frame for the grouping cells: two string label columns (A, B) and
# two columns of standard-normal draws (C, D), eight rows in total.
df = pd.DataFrame(
    {
        "A": "foo bar foo bar foo bar foo foo".split(),
        "B": "one one two three two two one three".split(),
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.483674,2.533798
1,bar,one,-0.624083,-0.989165
2,foo,two,-1.155533,2.525192
3,bar,three,-0.113824,1.940872
4,foo,two,0.668372,0.398532
5,bar,two,0.31228,-1.543667
6,foo,one,-0.075601,1.276889
7,foo,three,-1.129649,-1.854476


In [8]:
# Grouping: total the numeric columns (C and D) for each distinct value of A.
#
# numeric_only=True is passed explicitly so the string column B stays out of
# the aggregation. Without it, pandas >= 2.0 "sums" B by concatenating its
# strings per group instead of silently dropping the column, and the result
# would no longer be the C/D-only table this cell is meant to show.
df.groupby("A").sum(numeric_only=True)

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.425627,-0.591961
foo,-2.176084,4.879936


In [9]:
# Group on the (A, B) pair. The result is indexed by a two-level (A, B)
# hierarchical index and holds the per-pair sums of the remaining columns.
df.groupby(by=["A", "B"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.624083,-0.989165
bar,three,-0.113824,1.940872
bar,two,0.31228,-1.543667
foo,one,-0.559274,3.810687
foo,three,-1.129649,-1.854476
foo,two,-0.48716,2.923725


# Reshaping

In [14]:
# Stack
# Label pairs for a two-level index: each outer label is paired with "one"
# and then "two", outer label varying slowest.
tuples = [
    (outer, inner)
    for outer in ("bar", "baz", "foo", "qux")
    for inner in ("one", "two")
]
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [18]:
# Turn the label pairs into a two-level MultiIndex whose levels are named
# "first" and "second".
index = pd.MultiIndex.from_tuples(
    tuples=tuples,
    names=["first", "second"],
)
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [19]:
# Eight rows of standard-normal draws in columns A and B, with the two-level
# `index` built above as the row labels.
df = pd.DataFrame(
    data=np.random.randn(8, 2),
    index=index,
    columns=list("AB"),
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.506794,3.646829
bar,two,-1.582439,-0.438642
baz,one,-1.259106,0.358215
baz,two,-0.100952,1.052557
foo,one,0.390409,-0.312327
foo,two,0.74943,-0.233268
qux,one,-1.16249,-1.04377
qux,two,-1.271155,1.149906


In [21]:
# stack method
# Pivot the column labels (A, B) into a new innermost index level, yielding
# a Series indexed by (first, second, column label).
stacked = df.stack(level=-1)
stacked

first  second   
bar    one     A   -0.506794
               B    3.646829
       two     A   -1.582439
               B   -0.438642
baz    one     A   -1.259106
               B    0.358215
       two     A   -0.100952
               B    1.052557
foo    one     A    0.390409
               B   -0.312327
       two     A    0.749430
               B   -0.233268
qux    one     A   -1.162490
               B   -1.043770
       two     A   -1.271155
               B    1.149906
dtype: float64

In [22]:
# unstack
# Inverse of stack: move the innermost index level (the A/B labels) back out
# to the columns.
stacked.unstack(level=-1)

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.506794,3.646829
bar,two,-1.582439,-0.438642
baz,one,-1.259106,0.358215
baz,two,-0.100952,1.052557
foo,one,0.390409,-0.312327
foo,two,0.74943,-0.233268
qux,one,-1.16249,-1.04377
qux,two,-1.271155,1.149906


In [28]:
# Unstack index level 1 ("second"): its values one/two become the columns.
stacked.unstack(level=1)

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,-0.506794,-1.582439
bar,B,3.646829,-0.438642
baz,A,-1.259106,-0.100952
baz,B,0.358215,1.052557
foo,A,0.390409,0.74943
foo,B,-0.312327,-0.233268
qux,A,-1.16249,-1.271155
qux,B,-1.04377,1.149906


In [29]:
# Unstack index level 0 ("first"): bar/baz/foo/qux become the columns.
stacked.unstack(level=0)

Unnamed: 0_level_0,first,bar,baz,foo,qux
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,A,-0.506794,-1.259106,0.390409,-1.16249
one,B,3.646829,0.358215,-0.312327,-1.04377
two,A,-1.582439,-0.100952,0.74943,-1.271155
two,B,-0.438642,1.052557,-0.233268,1.149906


# Pivot Tables

In [30]:
# Twelve-row example for pivoting: three categorical label columns (A, B, C)
# built by repeating short patterns, plus two columns (D, E) of uniform
# random values.
df = pd.DataFrame(
    {
        "A": ["one", "one", "two", "three"] * 3,
        "B": ["A", "B", "C"] * 4,
        "C": (["foo"] * 3 + ["bar"] * 3) * 2,
        "D": np.random.random(12),
        "E": np.random.random(12),
    }
)
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,0.178604,0.226478
1,one,B,foo,0.0339,0.093279
2,two,C,foo,0.758861,0.246628
3,three,A,bar,0.050237,0.46405
4,one,B,bar,0.523569,0.169505
5,one,C,bar,0.814321,0.733765
6,two,A,foo,0.133133,0.458055
7,three,B,foo,0.214781,0.590684
8,one,C,foo,0.541307,0.420229
9,one,A,bar,0.083406,0.14267


In [31]:
# Pivot D: rows are the (A, B) pairs, columns are the values of C, and
# combinations with no data show up as missing values.
df.pivot_table(values="D", index=["A", "B"], columns=["C"])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,0.083406,0.178604
one,B,0.523569,0.0339
one,C,0.814321,0.541307
three,A,0.050237,
three,B,,0.214781
three,C,0.60799,
two,A,,0.133133
two,B,0.538064,
two,C,,0.758861


# Time Series

In [32]:
# 100 timestamps at one-second spacing, starting 2012-01-01 00:00:00.
#
# The lowercase "s" alias is used on purpose: the uppercase "S" spelling is
# deprecated in pandas >= 2.2, while "s" is accepted by older and newer
# releases alike.
rng = pd.date_range("1/1/2012", periods = 100, freq = "s")
rng

DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
               '2012-01-01 00:00:02', '2012-01-01 00:00:03',
               '2012-01-01 00:00:04', '2012-01-01 00:00:05',
               '2012-01-01 00:00:06', '2012-01-01 00:00:07',
               '2012-01-01 00:00:08', '2012-01-01 00:00:09',
               '2012-01-01 00:00:10', '2012-01-01 00:00:11',
               '2012-01-01 00:00:12', '2012-01-01 00:00:13',
               '2012-01-01 00:00:14', '2012-01-01 00:00:15',
               '2012-01-01 00:00:16', '2012-01-01 00:00:17',
               '2012-01-01 00:00:18', '2012-01-01 00:00:19',
               '2012-01-01 00:00:20', '2012-01-01 00:00:21',
               '2012-01-01 00:00:22', '2012-01-01 00:00:23',
               '2012-01-01 00:00:24', '2012-01-01 00:00:25',
               '2012-01-01 00:00:26', '2012-01-01 00:00:27',
               '2012-01-01 00:00:28', '2012-01-01 00:00:29',
               '2012-01-01 00:00:30', '2012-01-01 00:00:31',
               '2012-01-

In [33]:
# One random integer in [0, 500) for every timestamp in `rng`.
ts = pd.Series(
    data=np.random.randint(low=0, high=500, size=len(rng)),
    index=rng,
)
ts

2012-01-01 00:00:00    182
2012-01-01 00:00:01    317
2012-01-01 00:00:02     41
2012-01-01 00:00:03    428
2012-01-01 00:00:04    148
                      ... 
2012-01-01 00:01:35    485
2012-01-01 00:01:36    492
2012-01-01 00:01:37    156
2012-01-01 00:01:38    237
2012-01-01 00:01:39    419
Freq: S, Length: 100, dtype: int32

In [38]:
# Time zone representation
# Start from five timezone-naive daily timestamps beginning 2012-03-06.
rng = pd.date_range(start="3/6/2012 00:00", periods=5, freq="D")
rng

DatetimeIndex(['2012-03-06', '2012-03-07', '2012-03-08', '2012-03-09',
               '2012-03-10'],
              dtype='datetime64[ns]', freq='D')

In [39]:
# One standard-normal draw per timestamp in `rng`.
ts = pd.Series(data=np.random.randn(len(rng)), index=rng)
ts

2012-03-06    0.321752
2012-03-07    0.253475
2012-03-08    1.134411
2012-03-09   -0.899937
2012-03-10   -1.255310
Freq: D, dtype: float64

In [40]:
# Attach the UTC zone to the naive index; the clock readings are unchanged.
ts_utc = ts.tz_localize(tz="UTC")
ts_utc

2012-03-06 00:00:00+00:00    0.321752
2012-03-07 00:00:00+00:00    0.253475
2012-03-08 00:00:00+00:00    1.134411
2012-03-09 00:00:00+00:00   -0.899937
2012-03-10 00:00:00+00:00   -1.255310
Freq: D, dtype: float64

In [41]:
# Re-express the same instants in US Eastern time (the values are untouched,
# only the index labels shift).
ts_utc.tz_convert(tz="US/Eastern")

2012-03-05 19:00:00-05:00    0.321752
2012-03-06 19:00:00-05:00    0.253475
2012-03-07 19:00:00-05:00    1.134411
2012-03-08 19:00:00-05:00   -0.899937
2012-03-09 19:00:00-05:00   -1.255310
Freq: D, dtype: float64

In [42]:
# Five consecutive month-end dates, starting with January 2012.
#
# The frequency is given as an offset object rather than the "M" string:
# pandas >= 2.2 deprecates "M" for month-end dates in favour of "ME", and
# "ME" is not understood by earlier releases. MonthEnd() means the same
# thing on every version.
rng = pd.date_range("1/1/2012", periods = 5, freq = pd.offsets.MonthEnd())
rng

DatetimeIndex(['2012-01-31', '2012-02-29', '2012-03-31', '2012-04-30',
               '2012-05-31'],
              dtype='datetime64[ns]', freq='M')

In [43]:
# One standard-normal draw per month-end date in `rng`.
ts = pd.Series(
    data=np.random.randn(len(rng)),
    index=rng,
)
ts

2012-01-31   -1.422230
2012-02-29   -0.536121
2012-03-31   -2.060088
2012-04-30   -0.976581
2012-05-31    1.796125
Freq: M, dtype: float64

In [44]:
# Swap the timestamp index for the periods (here: months) it falls in; with
# freq left as None the period frequency is taken from the index.
ps = ts.to_period(freq=None)
ps

2012-01   -1.422230
2012-02   -0.536121
2012-03   -2.060088
2012-04   -0.976581
2012-05    1.796125
Freq: M, dtype: float64

In [45]:
# Back from periods to timestamps, anchored at the start of each period.
ps.to_timestamp(how="start")

2012-01-01   -1.422230
2012-02-01   -0.536121
2012-03-01   -2.060088
2012-04-01   -0.976581
2012-05-01    1.796125
Freq: MS, dtype: float64

In [48]:
# Quarterly periods from 1990Q1 through 2000Q4, using quarters of a fiscal
# year that ends in November ("Q-NOV").
prng = pd.period_range(start="1990Q1", end="2000Q4", freq="Q-NOV")
prng

PeriodIndex(['1990Q1', '1990Q2', '1990Q3', '1990Q4', '1991Q1', '1991Q2',
             '1991Q3', '1991Q4', '1992Q1', '1992Q2', '1992Q3', '1992Q4',
             '1993Q1', '1993Q2', '1993Q3', '1993Q4', '1994Q1', '1994Q2',
             '1994Q3', '1994Q4', '1995Q1', '1995Q2', '1995Q3', '1995Q4',
             '1996Q1', '1996Q2', '1996Q3', '1996Q4', '1997Q1', '1997Q2',
             '1997Q3', '1997Q4', '1998Q1', '1998Q2', '1998Q3', '1998Q4',
             '1999Q1', '1999Q2', '1999Q3', '1999Q4', '2000Q1', '2000Q2',
             '2000Q3', '2000Q4'],
            dtype='period[Q-NOV]', freq='Q-NOV')

In [49]:
# One standard-normal draw per quarter in `prng`.
ts = pd.Series(data=np.random.randn(len(prng)), index=prng)
ts

1990Q1   -1.038857
1990Q2    0.496230
1990Q3    1.464781
1990Q4   -0.751079
1991Q1   -0.903076
1991Q2    1.165859
1991Q3    1.414115
1991Q4   -2.543337
1992Q1   -0.324895
1992Q2    0.629263
1992Q3   -0.095007
1992Q4    0.662428
1993Q1    0.371616
1993Q2   -1.108551
1993Q3    0.374851
1993Q4   -0.656103
1994Q1    0.086791
1994Q2   -0.348730
1994Q3    0.382221
1994Q4    1.170276
1995Q1    0.967427
1995Q2    0.215320
1995Q3   -0.465727
1995Q4   -0.285159
1996Q1    1.087093
1996Q2    1.732674
1996Q3    0.403091
1996Q4    1.772613
1997Q1   -0.997330
1997Q2   -2.865858
1997Q3   -1.361326
1997Q4    1.708475
1998Q1    0.414551
1998Q2    0.519737
1998Q3    0.658314
1998Q4   -0.464369
1999Q1    0.476004
1999Q2   -0.046060
1999Q3   -2.175658
1999Q4    0.279900
2000Q1    0.552042
2000Q2   -0.297179
2000Q3   -1.326414
2000Q4    2.341628
Freq: Q-NOV, dtype: float64

In [50]:
# Relabel each quarter with 09:00 on the first day of the month that follows
# the quarter's end:
#   1. asfreq("M", "e")  -> the last month of the quarter
#   2. + 1               -> the month after that
#   3. asfreq("h", "s")  -> the first hour of that month
#   4. + 9               -> nine hours later, i.e. 09:00
#
# The hourly alias is spelled in lowercase because the uppercase "H" form is
# deprecated in pandas >= 2.2; "h" is accepted by older releases as well.
ts.index = (prng.asfreq("M", "e") + 1).asfreq("h", "s") + 9
ts.head()

1990-03-01 09:00   -1.038857
1990-06-01 09:00    0.496230
1990-09-01 09:00    1.464781
1990-12-01 09:00   -0.751079
1991-03-01 09:00   -0.903076
Freq: H, dtype: float64

# Categoricals

In [54]:
# Six records, each an id (1-6) with a raw letter grade.
df = pd.DataFrame(
    {
        "id": list(range(1, 7)),
        "raw_grade": list("abbaae"),
    }
)
df

Unnamed: 0,id,raw_grade
0,1,a
1,2,b
2,3,b
3,4,a
4,5,a
5,6,e


In [59]:
# categorical data
# Add a `grade` column holding the raw grades converted to the pandas
# "category" dtype; `raw_grade` itself is left as it was.
df["grade"] = df["raw_grade"].astype(dtype="category")
df

Unnamed: 0,id,raw_grade,grade
0,1,a,a
1,2,b,b
2,3,b,b
3,4,a,a
4,5,a,a
5,6,e,e


In [60]:
# Rename Categories
# Map the existing categories to descriptive names, positionally:
# a -> "very good", b -> "good", e -> "very bad".
#
# rename_categories returns a new Series, so the result is assigned back to
# the column. Assigning to `.cat.categories` directly is not used because
# that setter was removed in pandas 2.0.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
df

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,a,very good
5,6,e,very bad


In [61]:
# Sorting
# Rows are ordered by the position of each grade in the category list
# (very good, good, very bad), not alphabetically by label.
df.sort_values(by="grade", ascending=True)

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
3,4,a,very good
4,5,a,very good
1,2,b,good
2,3,b,good
5,6,e,very bad


In [62]:
# Grouping
# Count the rows in each grade category.
#
# observed=False is spelled out so every declared category gets a row (with
# a count of 0 if it has no members). Leaving it implicit triggers a
# FutureWarning on pandas >= 2.1, where the default is scheduled to change.
df.groupby("grade", observed=False).size()

grade
very good    3
good         2
very bad     1
dtype: int64