# Pandas Learnings
10 mins tutorial content

In [1]:
import pandas as pd
import numpy as np

Object creation in Pandas

Creating a Series by passing a list of values, letting pandas create a default integer index:

In [4]:
df=pd.Series([1,2,3,4,5,6,7,8])
df

0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
dtype: int64

Creating a DataFrame by passing a NumPy array, with a datetime index and labeled columns:

In [9]:
dates=pd.date_range("20200201",periods=6)
dates

DatetimeIndex(['2020-02-01', '2020-02-02', '2020-02-03', '2020-02-04',
               '2020-02-05', '2020-02-06'],
              dtype='datetime64[ns]', freq='D')

In [10]:
df=pd.DataFrame(np.random.randn(6,4),index=dates,columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2020-02-01,0.42331,-0.288351,1.028279,1.348678
2020-02-02,0.998878,0.52987,0.413705,0.857294
2020-02-03,-3.284321,1.461194,-0.877865,-1.049186
2020-02-04,0.810973,-2.043113,1.846365,0.706133
2020-02-05,-2.694451,0.809791,0.08053,-0.580295
2020-02-06,0.848766,0.223243,-0.467514,-1.099651


In [11]:
np.random.randn(6,4)

array([[ 0.10742472, -0.48976563, -0.30115047, -0.03843415],
       [-0.70506519, -0.64818457, -0.44649299, -2.04172419],
       [-0.11511407, -1.7451738 , -0.76810902, -0.715306  ],
       [-0.41464584,  0.30196483,  0.24450917,  2.24782252],
       [-0.88469162, -0.09074655,  0.25100061,  1.36169719],
       [ 0.23394469, -1.56932078, -1.35175771,  0.16844538]])

In [12]:
print(np.random.randn(6,4))

[[-0.07434855 -1.65418126 -0.06339442  1.56485277]
 [-0.69291203  0.28520556 -0.38565272 -0.10142109]
 [ 1.27843448 -0.71280135  0.47129415 -1.35953395]
 [-1.16347874 -1.07821787  1.19431253  0.36557265]
 [ 0.32569915  1.05712443 -1.43330618 -0.17684916]
 [-1.11692609  0.0952217  -0.79233388  1.41194543]]


In [14]:
series_index=pd.Series(["Person A","Person B","Person C","Person D","Person E"])
series_grade=pd.Series(["A","B","A","A","C"])
df=pd.DataFrame(np.random.randn(5,4),index=series_index,columns=["Persons","Subject 1","Subject 2","Subject 3"])
df

Unnamed: 0,Persons,Subject 1,Subject 2,Subject 3
Person A,-0.616611,0.21828,0.760031,0.780423
Person B,-1.078677,-0.839531,-0.291024,-0.967472
Person C,-0.864177,-0.878428,1.180083,-1.072881
Person D,-1.849804,-0.119028,-0.530153,-0.369607
Person E,-1.595798,0.181604,0.106823,0.647071


Creating a DataFrame by passing a dict of objects that can be converted to series-like.


In [16]:
df2 = pd.DataFrame({
   ...:         "A": 1.0,
   ...:         "B": pd.Timestamp("20130102"),
   ...:         "C": pd.Series(1, index=list(range(4)), dtype="float32"),
   ...:         "D": np.array([3] * 4, dtype="int32"),
   ...:         "E": pd.Categorical(["test", "train", "test", "train"]),
   ...:         "F": "foo",
   ...:     }
   ...: )
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [19]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [20]:
df.head()

Unnamed: 0,Persons,Subject 1,Subject 2,Subject 3
Person A,-0.616611,0.21828,0.760031,0.780423
Person B,-1.078677,-0.839531,-0.291024,-0.967472
Person C,-0.864177,-0.878428,1.180083,-1.072881
Person D,-1.849804,-0.119028,-0.530153,-0.369607
Person E,-1.595798,0.181604,0.106823,0.647071


In [21]:
df.tail(2)

Unnamed: 0,Persons,Subject 1,Subject 2,Subject 3
Person D,-1.849804,-0.119028,-0.530153,-0.369607
Person E,-1.595798,0.181604,0.106823,0.647071


In [22]:
df.index

Index(['Person A', 'Person B', 'Person C', 'Person D', 'Person E'], dtype='object')

In [23]:
df.columns

Index(['Persons', 'Subject 1', 'Subject 2', 'Subject 3'], dtype='object')

DataFrame.to_numpy() gives a NumPy representation of the underlying data.
NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column.
When you call DataFrame.to_numpy(), pandas will find the NumPy dtype that can hold all of the dtypes in the DataFrame. This may end up being object, which requires casting every value to a Python object.

In [24]:
df.to_numpy()

array([[-0.61661067,  0.21827992,  0.76003056,  0.78042268],
       [-1.07867693, -0.83953136, -0.29102357, -0.96747193],
       [-0.86417719, -0.87842816,  1.18008292, -1.07288121],
       [-1.84980443, -0.11902821, -0.53015291, -0.36960661],
       [-1.59579835,  0.18160367,  0.10682344,  0.64707067]])

In [26]:
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [27]:
df.transpose()

Unnamed: 0,Person A,Person B,Person C,Person D,Person E
Persons,-0.616611,-1.078677,-0.864177,-1.849804,-1.595798
Subject 1,0.21828,-0.839531,-0.878428,-0.119028,0.181604
Subject 2,0.760031,-0.291024,1.180083,-0.530153,0.106823
Subject 3,0.780423,-0.967472,-1.072881,-0.369607,0.647071


In [28]:
df.T

Unnamed: 0,Person A,Person B,Person C,Person D,Person E
Persons,-0.616611,-1.078677,-0.864177,-1.849804,-1.595798
Subject 1,0.21828,-0.839531,-0.878428,-0.119028,0.181604
Subject 2,0.760031,-0.291024,1.180083,-0.530153,0.106823
Subject 3,0.780423,-0.967472,-1.072881,-0.369607,0.647071


In [30]:
df

Unnamed: 0,Persons,Subject 1,Subject 2,Subject 3
Person A,-0.616611,0.21828,0.760031,0.780423
Person B,-1.078677,-0.839531,-0.291024,-0.967472
Person C,-0.864177,-0.878428,1.180083,-1.072881
Person D,-1.849804,-0.119028,-0.530153,-0.369607
Person E,-1.595798,0.181604,0.106823,0.647071


In [31]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,Subject 3,Subject 2,Subject 1,Persons
Person A,0.780423,0.760031,0.21828,-0.616611
Person B,-0.967472,-0.291024,-0.839531,-1.078677
Person C,-1.072881,1.180083,-0.878428,-0.864177
Person D,-0.369607,-0.530153,-0.119028,-1.849804
Person E,0.647071,0.106823,0.181604,-1.595798


In [33]:
df.sort_values(by="Subject 3")

Unnamed: 0,Persons,Subject 1,Subject 2,Subject 3
Person C,-0.864177,-0.878428,1.180083,-1.072881
Person B,-1.078677,-0.839531,-0.291024,-0.967472
Person D,-1.849804,-0.119028,-0.530153,-0.369607
Person E,-1.595798,0.181604,0.106823,0.647071
Person A,-0.616611,0.21828,0.760031,0.780423


Selecting in pandas

In [36]:
df["Persons"]

Person A   -0.616611
Person B   -1.078677
Person C   -0.864177
Person D   -1.849804
Person E   -1.595798
Name: Persons, dtype: float64

In [38]:
df[0:2]

Unnamed: 0,Persons,Subject 1,Subject 2,Subject 3
Person A,-0.616611,0.21828,0.760031,0.780423
Person B,-1.078677,-0.839531,-0.291024,-0.967472


In [40]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [42]:
df2.loc[:, ["A", "B"]]

Unnamed: 0,A,B
0,1.0,2013-01-02
1,1.0,2013-01-02
2,1.0,2013-01-02
3,1.0,2013-01-02


In [43]:
df2.loc["20130102":"20130104", ["A", "B"]]

Unnamed: 0,A,B


In [45]:
df.iloc[1]

Persons     -1.078677
Subject 1   -0.839531
Subject 2   -0.291024
Subject 3   -0.967472
Name: Person B, dtype: float64

In [46]:
df.iloc[3:5, 0:2]

Unnamed: 0,Persons,Subject 1
Person D,-1.849804,-0.119028
Person E,-1.595798,0.181604


In [47]:
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,Persons,Subject 2
Person B,-1.078677,-0.291024
Person C,-0.864177,1.180083
Person E,-1.595798,0.106823


In [50]:
df[df["Subject 1"] > 0]

Unnamed: 0,Persons,Subject 1,Subject 2,Subject 3
Person A,-0.616611,0.21828,0.760031,0.780423
Person E,-1.595798,0.181604,0.106823,0.647071


In [51]:
df[df > 0]

Unnamed: 0,Persons,Subject 1,Subject 2,Subject 3
Person A,,0.21828,0.760031,0.780423
Person B,,,,
Person C,,,1.180083,
Person D,,,,
Person E,,0.181604,0.106823,0.647071


In [52]:
df2[df2["E"].isin(["two", "four"])]


Unnamed: 0,A,B,C,D,E,F


In [53]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [54]:
df

Unnamed: 0,Persons,Subject 1,Subject 2,Subject 3
Person A,-0.616611,0.21828,0.760031,0.780423
Person B,-1.078677,-0.839531,-0.291024,-0.967472
Person C,-0.864177,-0.878428,1.180083,-1.072881
Person D,-1.849804,-0.119028,-0.530153,-0.369607
Person E,-1.595798,0.181604,0.106823,0.647071


In [55]:
df["Subject 3"]

Person A    0.780423
Person B   -0.967472
Person C   -1.072881
Person D   -0.369607
Person E    0.647071
Name: Subject 3, dtype: float64

In [58]:
df["Grade"]=["A","B","A","C","A"]
df

Unnamed: 0,Persons,Subject 1,Subject 2,Subject 3,Grade
Person A,-0.616611,0.21828,0.760031,0.780423,A
Person B,-1.078677,-0.839531,-0.291024,-0.967472,B
Person C,-0.864177,-0.878428,1.180083,-1.072881,A
Person D,-1.849804,-0.119028,-0.530153,-0.369607,C
Person E,-1.595798,0.181604,0.106823,0.647071,A


isin() method used for filtering

In [62]:
df[df["Grade"].isin(["A", "B"])]

Unnamed: 0,Persons,Subject 1,Subject 2,Subject 3,Grade
Person A,-0.616611,0.21828,0.760031,0.780423,A
Person B,-1.078677,-0.839531,-0.291024,-0.967472,B
Person C,-0.864177,-0.878428,1.180083,-1.072881,A
Person E,-1.595798,0.181604,0.106823,0.647071,A


Setting values by label

In [63]:
 df.at[dates[0], "A"] = 0

In [64]:
df

Unnamed: 0,Persons,Subject 1,Subject 2,Subject 3,Grade,A
Person A,-0.616611,0.21828,0.760031,0.780423,A,
Person B,-1.078677,-0.839531,-0.291024,-0.967472,B,
Person C,-0.864177,-0.878428,1.180083,-1.072881,A,
Person D,-1.849804,-0.119028,-0.530153,-0.369607,C,
Person E,-1.595798,0.181604,0.106823,0.647071,A,
2020-02-01 00:00:00,,,,,,0.0
