In [5]:
# Importing the NumPy library
import numpy as np
from IPython.display import display

In [6]:
# Using functions
display(np.sqrt(9))

3.0

In [7]:
display(np.square(3))

9

In [8]:
display(np.maximum(2,3))

3

In [95]:
display(np.abs(-10))

10

In [99]:
display(np.sign(-222))

-1

In [11]:
# Creating arrays
a1 = np.array([1,2,3,4])
display(isinstance(a1,np.ndarray))

True

In [12]:
display(a1.shape)

(4,)

In [13]:
a2 = np.array([[1,2,3],[4,5,6]])
display(a2.shape)

(2, 3)

In [14]:
display(a2.dtype)

dtype('int32')

In [15]:
a3 = np.arange(27).reshape(3,3,3)
display(a3.shape)

(3, 3, 3)

In [16]:
a4 = np.zeros((5,2),dtype=bool)
display(a4)

array([[False, False],
       [False, False],
       [False, False],
       [False, False],
       [False, False]])

In [17]:
a5 = np.empty(10)
display(a5)

array([6.23042070e-307, 3.56043053e-307, 1.60219306e-306, 2.44763557e-307,
       1.69119330e-306, 7.56599807e-307, 8.90104239e-307, 1.24610383e-306,
       1.69118108e-306, 2.13620807e-306])

In [18]:
# The datatypes; int, float, bool, object, may be inferred or declared
a6 = np.array([[1,2],[3,4],[5]],dtype=object)
display(a6.shape)

(3,)

In [101]:
display(a6.dtype)

dtype('O')

In [20]:
a7 = np.array([1,2,3.14])
display(a7.shape)
display(a7.dtype)

(3,)

dtype('float64')

In [21]:
# Accessing arrays
a = np.array([1,2,3,4])
display(a[0])

1

In [22]:
display(a[1:3])

array([2, 3])

In [23]:
display(a[:3])

array([1, 2, 3])

In [24]:
display(a[2:])

array([3, 4])

In [25]:
display(a[[1,1]])

array([2, 2])

In [26]:
display(a[[True,False,False,True]])

array([1, 4])

In [27]:
b = np.arange(12).reshape(4,3)
display(b[3,2])

11

In [28]:
display(b[1:3,:2])

array([[3, 4],
       [6, 7]])

In [29]:
display(b[[0,3],[0,2]])

array([ 0, 11])

In [30]:
# Assigning values to array elements

a[0] += 1
display(a)

array([2, 2, 3, 4])

In [31]:
a[2:] = np.array([5,6])
display(a)

array([2, 2, 5, 6])

In [32]:
a[:2] = [3,4]
display(a)

array([3, 4, 5, 6])

In [33]:
b = a[1:3]
display(b)

array([4, 5])

In [34]:
b[0] = 3
display(b)
display(a)

array([3, 5])

array([3, 3, 5, 6])

In [35]:
c = a.copy()
c[1] = 7
display(c)
display(a)

array([3, 7, 5, 6])

array([3, 3, 5, 6])

In [36]:
# Operations involving arrays
a = np.array([1,2,3])
display(np.square(a))

array([1, 4, 9], dtype=int32)

In [37]:
display(np.sign(a))

array([1, 1, 1])

In [38]:
# Operations involving multiple arrays of same size
a = np.array([1,2,3])
b = np.array([4,5,6])
c = a+b
display(c)

array([5, 7, 9])

In [103]:
# Operations involving operands of different size (broadcasting);
# works when differing dimension size equals 1 for one operand
a = np.array([1,2])
b = np.array([4,5,6])
# c = a+b

In [40]:
d = 5*b
display(d)

array([20, 25, 30])

In [41]:
e = b > 5
display(e)

array([False, False,  True])

In [42]:
f = np.array([[1,2],[3,4]])
g = a+f
display(g)

array([[2, 4],
       [4, 6]])

In [43]:
h = np.array([[1],[2]])
i = np.array([1,2])
j = h-i
display(j)

array([[ 0, -1],
       [ 1,  0]])

In [44]:
# Operating on multiple array elements with apply_along_axis
a = np.arange(15).reshape(5,3)
display(a)

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [45]:
display(np.apply_along_axis(lambda x: np.sum(x),0,a))

array([30, 35, 40])

In [46]:
def f(x):
    return np.sum(x)
display(np.apply_along_axis(f,1,a))

array([ 3, 12, 21, 30, 39])

In [47]:
display(np.apply_along_axis(np.sum,1,a))

array([ 3, 12, 21, 30, 39])

In [48]:
# Finding the position of the maximum value with argmax
a = (np.arange(15)*2).reshape(5,3)
display(np.argmax(a))

14

In [49]:
display(np.argmax(a,0))

array([4, 4, 4], dtype=int64)

In [50]:
display(np.argmax(a,1))

array([2, 2, 2, 2, 2], dtype=int64)

In [51]:
# Representing missing values by (the float) np.nan
a = np.array([1,2,3,4],dtype=float)
a[2] = np.nan
display(a)

array([ 1.,  2., nan,  4.])

In [52]:
# The following results in error:
# b = np.array([1,2,3,4])
# b[2] = np.nan

In [53]:
c = np.array([True,np.nan,False,False])
display(c)
display(c.dtype)

array([ 1., nan,  0.,  0.])

dtype('float64')

In [54]:
display(np.nan == np.nan)

False

In [55]:
display(np.isnan(n7p.nan))

True

In [56]:
display(np.nan is np.nan)

True

In [57]:
# Counting with missing values
display(np.sum(np.array([1,2,np.nan,4])))

nan

In [58]:
display(np.nansum(np.array([1,2,np.nan,4])))

7.0

In [59]:
display(np.nanprod(np.array([1,2,np.nan,4])))

8.0

In [60]:
display(np.nanmin(np.array([1,2,np.nan,4])))

1.0

In [61]:
display(np.nanmax(np.array([1,2,np.nan,4])))

4.0

In [107]:
display(np.nanmean(np.array([1,2,np.nan,4])))

2.3333333333333335

In [63]:
# Importing the pandas library
import pandas as pd

In [113]:
# Creating DataFrames
values = np.arange(15).reshape(5,3)
df1 = pd.DataFrame(values,columns=["A","B","C"])
display(df1)

Unnamed: 0,A,B,C
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11
4,12,13,14


In [65]:
df2 = pd.DataFrame(values,index=list("abcde"),
columns=list("ABC"))
display(df2)

Unnamed: 0,A,B,C
a,0,1,2
b,3,4,5
c,6,7,8
d,9,10,11
e,12,13,14


In [66]:
# Accessing DataFrames
display(df1["B"])

0     1
1     4
2     7
3    10
4    13
Name: B, dtype: int32

In [67]:
display(df1.B)

0     1
1     4
2     7
3    10
4    13
Name: B, dtype: int32

In [68]:
display(isinstance(df1["B"],pd.Series))

True

In [69]:
display(df1["B"].values)

array([ 1,  4,  7, 10, 13])

In [70]:
display(df1["B"].dtype)

dtype('int32')

In [71]:
display(df1["B"][1])

4

In [118]:
display(df1.loc[2:4,["B","C"]])

Unnamed: 0,B,C
2,7,8
3,10,11
4,13,14


In [117]:
display(df1.iloc[1:3,1:])

Unnamed: 0,B,C
1,4,5
2,7,8


In [74]:
display(df1.iloc[1:3])

Unnamed: 0,A,B,C
1,3,4,5
2,6,7,8


In [75]:
display(df1.iloc[:,1:])

Unnamed: 0,B,C
0,1,2
1,4,5
2,7,8
3,10,11
4,13,14


In [121]:
display(df2)
display(df2.loc[:,["A","B"]].iloc[0:3])

Unnamed: 0,A,B,C
a,0,1,2
b,3,4,5
c,6,7,8
d,9,10,11
e,12,13,14


Unnamed: 0,A,B
a,0,1
b,3,4
c,6,7


In [122]:
display(df2.loc["a":"c",["A","B"]])

Unnamed: 0,A,B
a,0,1
b,3,4
c,6,7


In [78]:
display(df2.loc[[True,True,False,False,False],[True,False,True]])

Unnamed: 0,A,C
a,0,2
b,3,5


In [79]:
display(df2.iloc[[True,True,False,False,False],[True,False,True]])

Unnamed: 0,A,C
a,0,2
b,3,5


In [80]:
display(df2[[True,True,False,False,False]])

Unnamed: 0,A,B,C
a,0,1,2
b,3,4,5


In [81]:
display(df2["B"]["b"])

4

In [126]:
display(df2.loc[["a","c"],["B","C"]])

Unnamed: 0,B,C
a,1,2
c,7,8


In [83]:
display(df2.loc["a":"d"])

Unnamed: 0,A,B,C
a,0,1,2
b,3,4,5
c,6,7,8
d,9,10,11


In [84]:
display(df2.iloc[0:4])

Unnamed: 0,A,B,C
a,0,1,2
b,3,4,5
c,6,7,8
d,9,10,11


In [85]:
display(df2.at["a","A"])

0

In [86]:
display(df2.iat[0,0])

0

In [87]:
# Accessing DataFrames through boolean indexing
display(df1[[True,False,True,False,True]])

Unnamed: 0,A,B,C
0,0,1,2
2,6,7,8
4,12,13,14


In [88]:
display(df1[df1["B"] % 2 == 0])

Unnamed: 0,A,B,C
1,3,4,5
3,9,10,11


In [89]:
display(df1[df1["C"].isin([5,8,11])])

Unnamed: 0,A,B,C
1,3,4,5
2,6,7,8
3,9,10,11


In [90]:
display(df1[(df1["A"] > 3) & (df1["C"].isin([5,8,11]))])

Unnamed: 0,A,B,C
2,6,7,8
3,9,10,11


In [127]:
# Assigning values to DataFrames
df1["B"] = [True,True,False,False,False]
display(df1["B"].dtype)
display(df1)

dtype('bool')

Unnamed: 0,A,B,C
0,0,True,2
1,3,True,5
2,6,False,8
3,9,False,11
4,12,False,14


In [92]:
df1["D"] = np.arange(5,dtype=float)
df1["E"] = 1
df1.loc[1,"A"] = np.nan
df1.loc[2,"B"] = np.nan
display(df1)

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,2,0.0,1
1,,1.0,5,1.0,1
2,6.0,,8,2.0,1
3,9.0,0.0,11,3.0,1
4,12.0,0.0,14,4.0,1


In [93]:
display(df1.isnull().values.any())

True

In [129]:
!pip install --upgrade pandas
# Dropping rows and columns in DataFrames
df1.drop(columns=["A","B"],inplace=True)
display(df1)

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Downloading pandas-0.25.3-cp36-cp36m-win_amd64.whl (9.0 MB)
Collecting pytz>=2017.2
  Downloading pytz-2020.1-py2.py3-none-any.whl (510 kB)
Collecting python-dateutil>=2.6.1
  Downloading python_dateutil-2.8.1-py2.py3-none-any.whl (227 kB)
Installing collected packages: pytz, python-dateutil, pandas
Successfully installed pandas-0.25.3 python-dateutil-2.8.1 pytz-2020.1


ERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.

jupyterthemes 0.20.0 requires ipython>=5.4.1, but you'll have ipython 5.1.0 which is incompatible.
jupyterthemes 0.20.0 requires notebook>=5.6.0, but you'll have notebook 5.0.0 which is incompatible.


TypeError: drop() got an unexpected keyword argument 'columns'

In [146]:
# The following results in error:
# df2 = df1.drop(columns="A")
# but not this:
df2 = df1.drop(columns="A",errors="ignore")
display(df2)

TypeError: drop() got an unexpected keyword argument 'columns'

In [None]:
df3 = df1.drop(index=[2,3])
display(df3)

In [131]:
# Copying DataFrames
df = df1.copy()

In [132]:
df1.at[0,"D"] = 2.0
display(df1)

Unnamed: 0,A,B,C,D
0,0,True,2,2.0
1,3,True,5,
2,6,False,8,
3,9,False,11,
4,12,False,14,


In [133]:
df.at[0,"D"] = 3.0
display(df)

Unnamed: 0,A,B,C,D
0,0,True,2,3.0
1,3,True,5,
2,6,False,8,
3,9,False,11,
4,12,False,14,


In [134]:
display(df.at[0,"D"] == df1.at[0,"D"])

False

In [135]:
# Concatenating DataFrames
df = pd.DataFrame({"A": list("ababab"), "B":[0,0,0,1,1,1],"C": [10,20,30,40,50,60]})
display(df)

Unnamed: 0,A,B,C
0,a,0,10
1,b,0,20
2,a,0,30
3,b,1,40
4,a,1,50
5,b,1,60


In [136]:
display(pd.concat([df[3:],df[:3]]))

Unnamed: 0,A,B,C
3,b,1,40
4,a,1,50
5,b,1,60
0,a,0,10
1,b,0,20
2,a,0,30


In [137]:
# Merging DataFrames (SQL style)
df1 = pd.DataFrame({"LKey": list("abcdef"),"A":[0,0,0,1,1,1]})
display(df1)

Unnamed: 0,A,LKey
0,0,a
1,0,b
2,0,c
3,1,d
4,1,e
5,1,f


In [138]:
df2 = pd.DataFrame({"RKey": list("fedcba"),"B":[0,0,0,1,1,1]})
display(df2)

Unnamed: 0,B,RKey
0,0,f
1,0,e
2,0,d
3,1,c
4,1,b
5,1,a


In [139]:
display(df1.merge(df2,how="outer",left_on="LKey",right_on="RKey"))

Unnamed: 0,A,LKey,B,RKey
0,0,a,1,a
1,0,b,1,b
2,0,c,1,c
3,1,d,0,d
4,1,e,0,e
5,1,f,0,f


In [140]:
# Creating groupings
df = pd.DataFrame({"A": list("ababab"), "B":[0,0,0,1,1,1],"C": [10,20,30,40,50,60]})
display(df)

Unnamed: 0,A,B,C
0,a,0,10
1,b,0,20
2,a,0,30
3,b,1,40
4,a,1,50
5,b,1,60


In [143]:
g = df.groupby("A")
# display(g)
display(g.get_group("a"))

Unnamed: 0,A,B,C
0,a,0,10
2,a,0,30
4,a,1,50


In [144]:
display(g.sum())

Unnamed: 0_level_0,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1,90
b,2,120


In [149]:
g = df.groupby(["A","B"])
display(g.get_group(("a",0)))

Unnamed: 0,A,B,C
0,a,0,10
2,a,0,30


In [None]:
display(g.size())

In [150]:
# Defining and using categorical values
df = pd.DataFrame({"id":[1,2,3,4,5],"award":["silver", "gold", "silver","silver", "gold"]})
display(df)

Unnamed: 0,award,id
0,silver,1
1,gold,2
2,silver,3
3,silver,4
4,gold,5


In [153]:
df["award"] = df["award"].astype("category")
display(df["award"].cat.categories)
display(df)

Index(['gold', 'silver', 'bronze'], dtype='object')

Unnamed: 0,award,id
0,silver,1
1,gold,2
2,silver,3
3,silver,4
4,gold,5


In [154]:
df["award"] = df["award"].cat.set_categories(["gold","silver","bronze"])
display(df)
g = df.groupby("award").size()
display(g)

Unnamed: 0,award,id
0,silver,1
1,gold,2
2,silver,3
3,silver,4
4,gold,5


award
gold      2
silver    3
bronze    0
dtype: int64

In [155]:
display(g.get("iron",0))

0

In [156]:
# Reading and writing to comma-separated text (csv) files
df = pd.DataFrame({"id":[np.nan,2,3,4,5],"grade":[np.nan,"b",np.nan,"c","a"]})
display(df)

Unnamed: 0,grade,id
0,,
1,b,2.0
2,,3.0
3,c,4.0
4,a,5.0


In [157]:
df.to_csv("myfile.csv",index=False)
df2 = pd.read_csv("myfile.csv")
display(df2)

Unnamed: 0,grade,id
0,,
1,b,2.0
2,,3.0
3,c,4.0
4,a,5.0
