**Pandas Tutorial**

Pandas provides numerous tools to work with tabular data like you'd find in spreadsheets or databases. It is widely used for data preparation, cleaning, and analysis. It can work with a wide variety of data and provides many visualization options. It is built on top of NumPy.

Imports

In [1]:
import numpy as np
import pandas as pd
from numpy import random

# **Series**

In [2]:
# Pandas uses something called a dataframe. It is a 
# 2D data structure that can hold multiple data types.
# Columns have labels.

# A Series is a one-dimensional labelled array built on top of a
# NumPy array. Start with the values in a plain list ...
list_1 = list('abcd')
# ... and the labels that should become the index, one per value
labels = list(range(1, 5))
ser_1 = pd.Series(list_1, labels)
ser_1

1    a
2    b
3    c
4    d
dtype: object

In [3]:
# You can also add a NumPy array
arr_1 = np.array([1, 2, 3, 4])
arr_1

array([1, 2, 3, 4])

In [4]:
# Transform into series, it creates an index automatically
ser_2 = pd.Series(arr_1)
ser_2

0    1
1    2
2    3
3    4
dtype: int64

*Dictionary*

In [5]:
# You can quickly add labels and values with a dictionary
dict_1 = {"f_name": "Derek", "l_name": "Banas", "age": 44}
dict_1

{'f_name': 'Derek', 'l_name': 'Banas', 'age': 44}

In [6]:
ser_3 = pd.Series(dict_1)
ser_3

f_name    Derek
l_name    Banas
age          44
dtype: object

In [7]:
# Get data by label
ser_3['f_name']

'Derek'

In [8]:
# You can get the datatype
ser_1.dtype, ser_2.dtype, ser_3.dtype

(dtype('O'), dtype('int64'), dtype('O'))

## *Simple maths in Series*

In [9]:
# You can perform math operations on series
ser_2 + ser_2

0    2
1    4
2    6
3    8
dtype: int64

In [10]:
ser_2 - ser_2

0    0
1    0
2    0
3    0
dtype: int64

In [11]:
ser_2 * ser_2

0     1
1     4
2     9
3    16
dtype: int64

In [12]:
ser_2 / ser_2

0    1.0
1    1.0
2    1.0
3    1.0
dtype: float64

In [13]:
np.exp(ser_2)

0     2.718282
1     7.389056
2    20.085537
3    54.598150
dtype: float64

In [14]:
# The difference between Series and ndarray is that operations
# align by labels

In [15]:
# Create a series from a dictionary
ser_4 = pd.Series({4: 5, 5: 6, 6: 7, 7: 8})
# If labels don't align you will get NaN
ser_2 + ser_4

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
6   NaN
7   NaN
dtype: float64

In [16]:
# You can assign names to series
ser_5 = pd.Series({8: 9, 9: 10}, name='rand_nums')
ser_5

8     9
9    10
Name: rand_nums, dtype: int64

In [17]:
ser_5.name

'rand_nums'

# DataFrames

DataFrames are the most commonly used data structure with Pandas. They are made up of multiple series that share the same index / label. They can contain multiple data types. They can be created from dicts, series, lists or other dataframes.

https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf

## Creating DataFrames

In [18]:
# Create random matrix 2x3 with values between 10 and 50
arr_2 = np.random.randint(10, 50, size=(2, 3))
arr_2

array([[37, 42, 48],
       [37, 32, 25]])

In [19]:
# Create DF with data, row labels & column labels
df_1 = pd.DataFrame(arr_2, index = ['A', 'B'], columns=['C', 'D', 'E'] )
df_1

Unnamed: 0,C,D,E
A,37,42,48
B,37,32,25


In [20]:
df_1A = pd.DataFrame(arr_2, index = ['A', 'B'], 
                    columns=['C', 'D', 'E'],dtype = 'float')
df_1A

Unnamed: 0,C,D,E
A,37.0,42.0,48.0
B,37.0,32.0,25.0


In [21]:
# Create a DF from multiple series in a dict
# If the series have different lengths, the missing cells are NaN
dict_3 = {'one': pd.Series([1., 2., 3.], index = ['a', 'b', 'c']),
         'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
dict_3

{'one': a    1.0
 b    2.0
 c    3.0
 dtype: float64,
 'two': a    1.0
 b    2.0
 c    3.0
 d    4.0
 dtype: float64}

In [22]:
df_2 = pd.DataFrame(dict_3)
df_2

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [23]:
# from_dict accepts a dict that maps column labels to lists of values.
# A dict literal is the idiomatic way to write it (no need to build a
# list of pairs and pass it to dict()).
dict_exm = {'A': [1, 2, 3], 'B': [4, 5, 6]}
dict_exm

{'A': [1, 2, 3], 'B': [4, 5, 6]}

In [24]:
pd.DataFrame.from_dict(dict_exm)

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [25]:
# With orient='index' the dict keys become the row labels, and the
# column labels are passed separately
pd.DataFrame.from_dict(dict([('A',[1,2,3]), ('B', [4,5,5])]), orient = 'index',
                      columns = ['one', 'two', 'three'])

Unnamed: 0,one,two,three
A,1,2,3
B,4,5,5


In [26]:
# Get number of rows and columns as tuple
df_1.shape, df_2.shape

((2, 3), (4, 2))

In [27]:
# Multi Index DataFrames: start from a frame with a regular,
# single-level index (row labels 1, 2, 3) for comparison
df_mult_index = pd.DataFrame(
    dict(a=[4, 5, 6], b=[7, 8, 9], c=[10, 11, 12]),
    index=[1, 2, 3],
)
df_mult_index

Unnamed: 0,a,b,c
1,4,7,10
2,5,8,11
3,6,9,12


In [28]:
# The same data with a two-level (hierarchical) row index:
# the outer level is named 'n', the inner level 'v'
df_mult_index_2 = pd.DataFrame(
    dict(a=[4, 5, 6], b=[7, 8, 9], c=[10, 11, 12]),
    index=pd.MultiIndex.from_arrays([['d', 'd', 'e'], [1, 2, 4]],
                                    names=['n', 'v']))
df_mult_index_2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11
e,4,6,9,12


# Editing and Retrieving Data

In [29]:
print(df_1)

    C   D   E
A  37  42  48
B  37  32  25


In [30]:
# Grab a **column**
print(df_1['C']) # Use 1 ['x']
print('--------')
print(df_1[['C', 'E']]) # Use 2 [['x', 'y']]

A    37
B    37
Name: C, dtype: int64
--------
    C   E
A  37  48
B  37  25


In [31]:
# Grab a row as a series
print(df_1.loc['A'])
print('--------')
# Grab row by index position
df_1.iloc[1]

C    37
D    42
E    48
Name: A, dtype: int64
--------


C    37
D    32
E    25
Name: B, dtype: int64

In [32]:
# Grab cell with Row & Column
df_1.loc['A', 'C']

37

In [33]:
# Grab multiple cells by defining rows wanted & the
# columns from those rows
df_1.loc[['A', 'B'], ['C', 'E']]

Unnamed: 0,C,E
A,37,48
B,37,25


In [34]:
# Make new column
df_1['Total'] = df_1['C'] + df_1['D'] + df_1['E']
df_1

Unnamed: 0,C,D,E,Total
A,37,42,48,127
B,37,32,25,94


In [35]:
df_2

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [36]:
# You can perform multiple calculations
df_2['mult'] = df_2['one'] * df_2['two']
df_2

Unnamed: 0,one,two,mult
a,1.0,1.0,1.0
b,2.0,2.0,4.0
c,3.0,3.0,9.0
d,,4.0,


In [37]:
# Make a new row by appending.
# DataFrame.append was removed in pandas 2.0, so build a one-row frame
# from the series and concatenate it instead. The series name ('F')
# becomes the row label. Reindexing the new row to df_1's columns first
# leaves the missing 'Total' cell as NaN and makes the row float, so the
# result matches what append used to produce (all columns become float).
dict_2 = {"C": 44, "D": 45, "E": 46}
new_row = pd.Series(dict_2, name='F')
df_1 = pd.concat([df_1, new_row.reindex(df_1.columns).to_frame().T])
df_1

Unnamed: 0,C,D,E,Total
A,37.0,42.0,48.0,127.0
B,37.0,32.0,25.0,94.0
F,44.0,45.0,46.0,


In [38]:
# Delete column (axis = 1) and set inplace to True which is required
# because Pandas tries to help you not delete data by accident
df_1.drop('Total', axis=1, inplace=True)
df_1

Unnamed: 0,C,D,E
A,37.0,42.0,48.0
B,37.0,32.0,25.0
F,44.0,45.0,46.0


In [39]:
# Delete a row (axis=0); axis=0 is the default, so it can be omitted
df_1.drop('B', axis = 0, inplace=True)
df_1

Unnamed: 0,C,D,E
A,37.0,42.0,48.0
F,44.0,45.0,46.0


In [40]:
# Create a new column and make it the index
df_1['Sex'] = ['Men', 'Women']
df_1.set_index('Sex', inplace=True)
df_1

Unnamed: 0_level_0,C,D,E
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Men,37.0,42.0,48.0
Women,44.0,45.0,46.0


In [41]:
# You can reset index values to numbers
df_1.reset_index(inplace=True)
df_1

Unnamed: 0,Sex,C,D,E
0,Men,37.0,42.0,48.0
1,Women,44.0,45.0,46.0


In [42]:
df_2

Unnamed: 0,one,two,mult
a,1.0,1.0,1.0
b,2.0,2.0,4.0
c,3.0,3.0,9.0
d,,4.0,


In [43]:
# Assign can be used to create a column while leaving the
# original DF untouched
df_2.assign(div=df_2['one'] / df_2['two'])
df_2

Unnamed: 0,one,two,mult
a,1.0,1.0,1.0
b,2.0,2.0,4.0
c,3.0,3.0,9.0
d,,4.0,


In [44]:
# You can pass in a function as well
df_2.assign(div=lambda x: (x['one'] / x['two']))
df_2

Unnamed: 0,one,two,mult
a,1.0,1.0,1.0
b,2.0,2.0,4.0
c,3.0,3.0,9.0
d,,4.0,


In [45]:
# Combine DataFrames while keeping df_3 data unless
# there is a NaN value

In [46]:
df_3 = pd.DataFrame({'A': [1., np.nan, 3., np.nan]})
df_3

Unnamed: 0,A
0,1.0
1,
2,3.0
3,


In [47]:
df_4 = pd.DataFrame({'A': [8., 9., 2., 4.]})
df_4

Unnamed: 0,A
0,8.0
1,9.0
2,2.0
3,4.0


In [48]:
df_3new = df_3.combine_first(df_4)
df_3new

Unnamed: 0,A
0,1.0
1,9.0
2,3.0
3,4.0


In [49]:
df_3 = pd.DataFrame({'A': [1., np.nan, 3., np.nan]})
df_4 = pd.DataFrame({'A': [8., 9., 2., 4.]})
df_3.combine_first(df_4)

Unnamed: 0,A
0,1.0
1,9.0
2,3.0
3,4.0


In [50]:
# Compare columns
df_3.compare(df_4)

Unnamed: 0_level_0,A,A
Unnamed: 0_level_1,self,other
0,1.0,8.0
1,,9.0
2,3.0,2.0
3,,4.0


In [51]:
# Combine two frames column by column with a function of your choice;
# np.minimum keeps the smaller of the two values in every cell
df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]})
df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
# Show both inputs, each followed by a separator line
print(df1, '--------', df2, '--------', sep='\n')
df1.combine(df2, np.minimum)

   A  B
0  5  2
1  0  4
--------
   A  B
0  1  3
1  1  3
--------


Unnamed: 0,A,B
0,1,2
1,0,3


# Vector Functions

pandas provides a large set of vector functions that operate on all
columns of a DataFrame or a single selected column (a pandas
Series). These functions produce vectors of values for each of the
columns, or a single Series for the individual Series. Examples:

In [52]:
df_test = pd.DataFrame({'Length': pd.Series([1., 4., 3., 2.]),
                        'Height': pd.Series([1., 4., 3., 2.]),
                        'Depth': pd.Series([1., 4., 3., 2.])})
df_test

Unnamed: 0,Length,Height,Depth
0,1.0,1.0,1.0
1,4.0,4.0,4.0
2,3.0,3.0,3.0
3,2.0,2.0,2.0


In [53]:
# Compute and append one or more new columns.
df_test.assign(Area=lambda df_test: df_test.Length*df_test.Height)

Unnamed: 0,Length,Height,Depth,Area
0,1.0,1.0,1.0,1.0
1,4.0,4.0,4.0,16.0
2,3.0,3.0,3.0,9.0
3,2.0,2.0,2.0,4.0


In [54]:
# Add single column.
df_test['Volume'] = df_test.Length*df_test.Height*df_test.Depth
df_test

Unnamed: 0,Length,Height,Depth,Volume
0,1.0,1.0,1.0,1.0
1,4.0,4.0,4.0,64.0
2,3.0,3.0,3.0,27.0
3,2.0,2.0,2.0,8.0


In [55]:
# Bin column into n buckets.
# pd.qcut(df.col, n, labels=False)
pd.qcut(df_test['Volume'], 2, labels=['low', 'high'])

0     low
1    high
2    high
3     low
Name: Volume, dtype: category
Categories (2, object): ['low' < 'high']

In [56]:
df_test['Volume_bucket'] = pd.qcut(df_test['Volume'], 2, labels=['low', 'high'])
df_test

Unnamed: 0,Length,Height,Depth,Volume,Volume_bucket
0,1.0,1.0,1.0,1.0,low
1,4.0,4.0,4.0,64.0,high
2,3.0,3.0,3.0,27.0,high
3,2.0,2.0,2.0,8.0,low


In [57]:
# Row-wise max: axis=1 scans across the columns of each row and
# returns one value per row.
# numeric_only=True skips the categorical 'Volume_bucket' column, whose
# labels cannot be compared with the float columns.
df_test.max(axis=1, numeric_only=True)

  df_test.max(axis=1)


0     1.0
1    64.0
2    27.0
3     8.0
dtype: float64

In [58]:
# Minimum of each column: axis=0 scans down the rows
df_test.min(axis = 0)

Length           1.0
Height           1.0
Depth            1.0
Volume           1.0
Volume_bucket    low
dtype: object

In [59]:
# Trim values at input thresholds
df_test['Volume_thersholds'] = df_test['Volume'].clip(lower=-10,upper=10)
df_test

Unnamed: 0,Length,Height,Depth,Volume,Volume_bucket,Volume_thersholds
0,1.0,1.0,1.0,1.0,low,1.0
1,4.0,4.0,4.0,64.0,high,10.0
2,3.0,3.0,3.0,27.0,high,10.0
3,2.0,2.0,2.0,8.0,low,8.0


# Conditional Selection

In [60]:
# Dataset start: Matrix 2x3 using random numbers from 10 to 50
arr_2 = np.random.randint(10, 50, size=(2, 3))
arr_2

array([[21, 39, 42],
       [22, 47, 43]])

In [61]:
df_1 = pd.DataFrame(arr_2, ['A', 'B'], ['C', 'D', 'E'])
print(df_1)

    C   D   E
A  21  39  42
B  22  47  43


In [62]:
# You can use conditional operators to retrieve a table
# based on the condition
print("Greater than 40\n", df_1 > 40.0)

Greater than 40
        C      D     E
A  False  False  True
B  False   True  True


In [63]:
# You can use comparison operator functions as well, like
# gt (greater than), ge (Greater than or equal),
# lt (lower than), le (Lower than or equal) , 
# eq (Equal to), ne (not equal to)
print("Greater than 45\n", df_1.gt(45.0))

Greater than 45
        C      D      E
A  False  False  False
B  False   True  False


In [64]:
print("Lower than 45\n", df_1.lt(45.0))

Lower than 45
       C      D     E
A  True   True  True
B  True  False  True


In [65]:
print("Greater than 49\n", df_1.ge(49.0))

Greater than 49
        C      D      E
A  False  False  False
B  False  False  False


In [66]:
print("Equal to 38\n", df_1.eq(38.0))

Equal to 38
        C      D      E
A  False  False  False
B  False  False  False


In [67]:
print("Not equal to 38\n", df_1.ne(38.0))

Not equal to 38
       C     D     E
A  True  True  True
B  True  True  True


In [68]:
# You can place conditions in brackets as well
bool_1 = df_1 >= 45.0
df_1[bool_1]

Unnamed: 0,C,D,E
A,,,
B,,47.0,


In [69]:
# Get bools for a column
df_1['E'] > 40

A    True
B    True
Name: E, dtype: bool

In [70]:
# Return a row if cell value in column matches a condition
df_1[df_1['E']>30]

Unnamed: 0,C,D,E
A,21,39,42
B,22,47,43


In [71]:
# You can focus on a column based on resulting dataframe
df_2 = df_1[df_1['E']>30]
df_2['C']

A    21
B    22
Name: C, dtype: int64

In [72]:
# You can stack these commands
print(df_1[df_1['E']>20]['C'])
print()

A    21
B    22
Name: C, dtype: int64



In [73]:
# You can also grab multiple columns
print(df_1[df_1['E']>20][['C', 'D']])

    C   D
A  21  39
B  22  47


In [74]:
# You can use multiple conditions; build a 3x3 frame holding the
# values 1..9 to try them on
arr_3 = np.arange(1, 10).reshape(3, 3)
df_2 = pd.DataFrame(arr_3, index=list('ABC'), columns=list('XYZ'))
print(df_2, "\n")

   X  Y  Z
A  1  2  3
B  4  5  6
C  7  8  9 



In [75]:
# Combine conditions with & (and); | (or) is used the same way
df_2[(df_2['X']>3) & (df_2['X']<7)]

Unnamed: 0,X,Y,Z
B,4,5,6


# File Input / Output

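Pandas has built-in readers for many kinds of data, including CSV, plain text, JSON, XML, SQL, HTML tables, XLSX (Excel), Hierarchical Data Format (HDF5), and ZIP-compressed files. Formats such as PDF, DOCX, images, MP3, and MP4 are not read by pandas directly; they need another library to extract the data first.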

In [80]:
# Read a CSV file
# Type pd.read_ [TAB] to see the file types you can read
pb_df = pd.read_csv('../raw_data/phone_book.csv')
pb_df

Unnamed: 0,first_name,last_name,phone_number
0,John,Lennon,123
1,George,Harrisson,456
2,Ringo,Starr,789


In [84]:
spotify_df = pd.read_csv('../raw_data/spotify_2017.csv')

## Subset Observations - rows

In [91]:
# Get info of dataset
spotify_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                100 non-null    object 
 1   name              100 non-null    object 
 2   artists           100 non-null    object 
 3   danceability      100 non-null    float64
 4   energy            100 non-null    float64
 5   key               100 non-null    float64
 6   loudness          100 non-null    float64
 7   mode              100 non-null    float64
 8   speechiness       100 non-null    float64
 9   acousticness      100 non-null    float64
 10  instrumentalness  100 non-null    float64
 11  liveness          100 non-null    float64
 12  valence           100 non-null    float64
 13  tempo             100 non-null    float64
 14  duration_ms       100 non-null    float64
 15  time_signature    100 non-null    float64
dtypes: float64(13), object(3)
memory usage: 12.6+

In [93]:
spotify_df.shape

(100, 16)

In [86]:
# Display 1st 5 rows
spotify_df.head()

Unnamed: 0,id,name,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,7qiZfU4dY1lWllzX7mPBI,Shape of You,Ed Sheeran,0.825,0.652,1.0,-3.183,0.0,0.0802,0.581,0.0,0.0931,0.931,95.977,233713.0,4.0
1,5CtI0qwDJkDQGwXD1H1cL,Despacito - Remix,Luis Fonsi,0.694,0.815,2.0,-4.328,1.0,0.12,0.229,0.0,0.0924,0.813,88.931,228827.0,4.0
2,4aWmUDTfIPGksMNLV2rQP,Despacito (Featuring Daddy Yankee),Luis Fonsi,0.66,0.786,2.0,-4.757,1.0,0.17,0.209,0.0,0.112,0.846,177.833,228200.0,4.0
3,6RUKPb4LETWmmr3iAEQkt,Something Just Like This,The Chainsmokers,0.617,0.635,11.0,-6.769,0.0,0.0317,0.0498,1.4e-05,0.164,0.446,103.019,247160.0,4.0
4,3DXncPQOG4VBw3QHh3S81,I'm the One,DJ Khaled,0.609,0.668,7.0,-4.284,1.0,0.0367,0.0552,0.0,0.167,0.811,80.924,288600.0,4.0


In [87]:
# Display last 5 rows
spotify_df.tail()

Unnamed: 0,id,name,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
95,1PSBzsahR2AKwLJgx8ehB,Bad Things (with Camila Cabello),Machine Gun Kelly,0.675,0.69,2.0,-4.761,1.0,0.132,0.21,0.0,0.287,0.272,137.817,239293.0,4.0
96,0QsvXIfqM0zZoerQfsI9l,Don't Let Me Down,The Chainsmokers,0.542,0.859,11.0,-5.651,1.0,0.197,0.16,0.00466,0.137,0.403,159.797,208053.0,4.0
97,7mldq42yDuxiUNn08nvzH,Body Like A Back Road,Sam Hunt,0.731,0.469,5.0,-7.226,1.0,0.0326,0.463,1e-06,0.103,0.631,98.963,165387.0,4.0
98,7i2DJ88J7jQ8K7zqFX2fW,Now Or Never,Halsey,0.658,0.588,6.0,-4.902,0.0,0.0367,0.105,1e-06,0.125,0.434,110.075,214802.0,4.0
99,1j4kHkkpqZRBwE0A4CN4Y,Dusk Till Dawn - Radio Edit,ZAYN,0.258,0.437,11.0,-6.593,0.0,0.039,0.101,1e-06,0.106,0.0967,180.043,239000.0,4.0


In [88]:
# Get 1st 2 rows
spotify_df[:2]

Unnamed: 0,id,name,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,7qiZfU4dY1lWllzX7mPBI,Shape of You,Ed Sheeran,0.825,0.652,1.0,-3.183,0.0,0.0802,0.581,0.0,0.0931,0.931,95.977,233713.0,4.0
1,5CtI0qwDJkDQGwXD1H1cL,Despacito - Remix,Luis Fonsi,0.694,0.815,2.0,-4.328,1.0,0.12,0.229,0.0,0.0924,0.813,88.931,228827.0,4.0


In [89]:
# Get every 2nd row of the first 5 (a slice with a step of 2)
spotify_df[:5:2]

Unnamed: 0,id,name,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,7qiZfU4dY1lWllzX7mPBI,Shape of You,Ed Sheeran,0.825,0.652,1.0,-3.183,0.0,0.0802,0.581,0.0,0.0931,0.931,95.977,233713.0,4.0
2,4aWmUDTfIPGksMNLV2rQP,Despacito (Featuring Daddy Yankee),Luis Fonsi,0.66,0.786,2.0,-4.757,1.0,0.17,0.209,0.0,0.112,0.846,177.833,228200.0,4.0
4,3DXncPQOG4VBw3QHh3S81,I'm the One,DJ Khaled,0.609,0.668,7.0,-4.284,1.0,0.0367,0.0552,0.0,0.167,0.811,80.924,288600.0,4.0


In [90]:
# Get indexes
spotify_df.index.array

<PandasArray>
[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94,
 95, 96, 97, 98, 99]
Length: 100, dtype: int64

In [95]:
# Get NumPy array
spotify_df['name'].to_numpy()

array(['Shape of You', 'Despacito - Remix',
       'Despacito (Featuring Daddy Yankee)', 'Something Just Like This',
       "I'm the One", 'HUMBLE.', "It Ain't Me (with Selena Gomez)",
       'Unforgettable', "That's What I Like",
       'I Don’t Wanna Live Forever (Fifty Shades Darker) - From "Fifty Shades Darker (Original Motion Picture Soundtrack)"',
       'XO TOUR Llif3', 'Paris', 'Stay (with Alessia Cara)', 'Attention',
       'Mask Off', 'Congratulations',
       'Swalla (feat. Nicki Minaj & Ty Dolla $ign)', 'Castle on the Hill',
       'Rockabye (feat. Sean Paul & Anne-Marie)', 'Believer', 'Mi Gente',
       'Thunder', "Say You Won't Let Go",
       "There's Nothing Holdin' Me Back", 'Me Rehúso', 'Issues',
       'Galway Girl', 'Scared to Be Lonely', 'Closer',
       'Symphony (feat. Zara Larsson)', 'I Feel It Coming', 'Starboy',
       'Wild Thoughts', 'Slide', 'New Rules', '1-800-273-8255',
       'Passionfruit', 'rockstar', 'Strip That Down',
       '2U (feat. Justin Bieber)

In [109]:
# Randomly select fraction of rows. 
spotify_df.sample(frac=0.1)

Unnamed: 0,id,name,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
4,3DXncPQOG4VBw3QHh3S81,I'm the One,DJ Khaled,0.609,0.668,7.0,-4.284,1.0,0.0367,0.0552,0.0,0.167,0.811,80.924,288600.0,4.0
60,6HZILIRieu8S0iqY8kIKh,DNA.,Kendrick Lamar,0.637,0.514,1.0,-6.763,1.0,0.365,0.0047,0.0,0.094,0.402,139.931,185947.0,4.0
38,6EpRaXYhGOB3fj4V2uDkM,Strip That Down,Liam Payne,0.869,0.485,6.0,-5.595,1.0,0.0545,0.246,0.0,0.0765,0.527,106.028,204502.0,4.0
33,7tr2za8SQg2CI8EDgrdtN,Slide,Calvin Harris,0.736,0.795,1.0,-3.299,0.0,0.0545,0.498,1e-06,0.254,0.511,104.066,230813.0,4.0
29,1x5sYLZiu9r5E43kMlt9f,Symphony (feat. Zara Larsson),Clean Bandit,0.707,0.629,0.0,-4.581,0.0,0.0563,0.259,1.6e-05,0.138,0.457,122.863,212459.0,4.0
53,3PEgB3fkiojxms35ntsTg,More Than You Know,Axwell /\ Ingrosso,0.644,0.743,5.0,-5.002,0.0,0.0355,0.034,0.0,0.257,0.544,123.074,203000.0,4.0
36,7hDc8b7IXETo14hHIHdnh,Passionfruit,Drake,0.809,0.463,11.0,-11.377,1.0,0.0396,0.256,0.085,0.109,0.364,111.98,298941.0,4.0
94,2fQrGHiQOvpL9UgPvtYy6,Bank Account,21 Savage,0.884,0.346,8.0,-8.228,0.0,0.351,0.0151,7e-06,0.0871,0.376,75.016,220307.0,4.0
39,3A7qX2QjDlPnazUsRk5y0,2U (feat. Justin Bieber),David Guetta,0.548,0.65,8.0,-5.827,0.0,0.0591,0.219,0.0,0.225,0.557,144.937,194897.0,4.0
78,6uFsE1JgZ20EXyU0JQZbU,Look What You Made Me Do,Taylor Swift,0.773,0.68,9.0,-6.378,0.0,0.141,0.213,1.6e-05,0.122,0.497,128.062,211859.0,4.0


In [110]:
# Randomly select n rows.
spotify_df.sample(n=8)

Unnamed: 0,id,name,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
15,3a1lNhkSLSkpJE4MSHpDu,Congratulations,Post Malone,0.627,0.812,6.0,-4.215,1.0,0.0358,0.198,0.0,0.212,0.504,123.071,220293.0,4.0
19,0CcQNd8CINkwQfe1RDtGV,Believer,Imagine Dragons,0.779,0.787,10.0,-4.305,0.0,0.108,0.0524,0.0,0.14,0.708,124.982,204347.0,4.0
25,6D0b04NJIKfEMg040WioJ,Issues,Julia Michaels,0.706,0.427,8.0,-6.864,1.0,0.0879,0.413,0.0,0.0609,0.42,113.804,176320.0,4.0
28,7BKLCZ1jbUBVqRi2FVlTV,Closer,The Chainsmokers,0.748,0.524,8.0,-5.599,1.0,0.0338,0.414,0.0,0.111,0.661,95.01,244960.0,4.0
83,4pLwZjInHj3SimIyN9SnO,Side To Side,Ariana Grande,0.648,0.738,6.0,-5.883,0.0,0.247,0.0408,0.0,0.292,0.603,159.145,226160.0,4.0
85,1wjzFQodRWrPcQ0AnYnvQ,I Like Me Better,Lauv,0.752,0.505,9.0,-7.621,1.0,0.253,0.535,3e-06,0.104,0.419,91.97,197437.0,4.0
38,6EpRaXYhGOB3fj4V2uDkM,Strip That Down,Liam Payne,0.869,0.485,6.0,-5.595,1.0,0.0545,0.246,0.0,0.0765,0.527,106.028,204502.0,4.0
32,1OAh8uOEOvTDqkKFsKksC,Wild Thoughts,DJ Khaled,0.671,0.672,0.0,-3.094,0.0,0.0688,0.0329,0.0,0.118,0.632,97.98,204173.0,4.0


In [112]:
# Select and order top n entries.
spotify_df.nlargest(2, 'energy')

Unnamed: 0,id,name,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
77,3EmmCZoqpWOTY1g2GBwJo,Just Hold On,Steve Aoki,0.647,0.932,11.0,-3.515,1.0,0.0824,0.00383,2e-06,0.0574,0.374,114.991,198774.0,4.0
26,0afhq8XCExXpqazXczTSv,Galway Girl,Ed Sheeran,0.624,0.876,9.0,-3.374,1.0,0.1,0.0735,0.0,0.327,0.781,99.943,170827.0,4.0


In [114]:
# Select and order bottom n entries.
spotify_df.nsmallest(2, 'energy')

Unnamed: 0,id,name,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
94,2fQrGHiQOvpL9UgPvtYy6,Bank Account,21 Savage,0.884,0.346,8.0,-8.228,0.0,0.351,0.0151,7e-06,0.0871,0.376,75.016,220307.0,4.0
58,3kxfsdsCpFgN412fpnW85,Redbone,Childish Gambino,0.743,0.359,1.0,-10.401,1.0,0.0794,0.199,0.00611,0.137,0.587,160.083,326933.0,4.0


## Subset Variables - columns

In [116]:
# Select single column with specific name.
spotify_df['name']

0                           Shape of You
1                      Despacito - Remix
2     Despacito (Featuring Daddy Yankee)
3               Something Just Like This
4                            I'm the One
                     ...                
95      Bad Things (with Camila Cabello)
96                     Don't Let Me Down
97                 Body Like A Back Road
98                          Now Or Never
99           Dusk Till Dawn - Radio Edit
Name: name, Length: 100, dtype: object

In [115]:
# Select multiple columns with specific names.
spotify_df[['name','artists']]

Unnamed: 0,name,artists
0,Shape of You,Ed Sheeran
1,Despacito - Remix,Luis Fonsi
2,Despacito (Featuring Daddy Yankee),Luis Fonsi
3,Something Just Like This,The Chainsmokers
4,I'm the One,DJ Khaled
...,...,...
95,Bad Things (with Camila Cabello),Machine Gun Kelly
96,Don't Let Me Down,The Chainsmokers
97,Body Like A Back Road,Sam Hunt
98,Now Or Never,Halsey


In [132]:
# Select columns whose name matches regular expression regex.
spotify_df.filter(regex = 'e$', axis = 1)

Unnamed: 0,name,mode,valence,time_signature
0,Shape of You,0.0,0.9310,4.0
1,Despacito - Remix,1.0,0.8130,4.0
2,Despacito (Featuring Daddy Yankee),1.0,0.8460,4.0
3,Something Just Like This,0.0,0.4460,4.0
4,I'm the One,1.0,0.8110,4.0
...,...,...,...,...
95,Bad Things (with Camila Cabello),1.0,0.2720,4.0
96,Don't Let Me Down,1.0,0.4030,4.0
97,Body Like A Back Road,1.0,0.6310,4.0
98,Now Or Never,0.0,0.4340,4.0


In [133]:
# Filter a string name
spotify_df[spotify_df['artists'].str.contains('Luis Fonsi')]

Unnamed: 0,id,name,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
1,5CtI0qwDJkDQGwXD1H1cL,Despacito - Remix,Luis Fonsi,0.694,0.815,2.0,-4.328,1.0,0.12,0.229,0.0,0.0924,0.813,88.931,228827.0,4.0
2,4aWmUDTfIPGksMNLV2rQP,Despacito (Featuring Daddy Yankee),Luis Fonsi,0.66,0.786,2.0,-4.757,1.0,0.17,0.209,0.0,0.112,0.846,177.833,228200.0,4.0


# Basics & Math

In [102]:
# Two series of different length: 'one' has no entry for label 'd',
# so that cell is NaN once both are joined in a DataFrame
dict_3 = {
    'one': pd.Series([1.0, 2.0, 3.0], index=list('abc')),
    'two': pd.Series([1.0, 2.0, 3.0, 4.0], index=list('abcd')),
}
df_2 = pd.DataFrame(dict_3)
df_2

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [103]:
# You can replace NaN values with 0 or anything else
print(df_2.fillna(0))

   one  two
a  1.0  1.0
b  2.0  2.0
c  3.0  3.0
d  0.0  4.0


In [135]:
# Get values in row 2
row = df_2.iloc[1]
row

one    2.0
two    2.0
Name: b, dtype: float64

In [136]:
# You can do the same with sub, mul, and div
df_2.add(row, axis='columns')

Unnamed: 0,one,two
a,3.0,3.0
b,4.0,4.0
c,5.0,5.0
d,,6.0


In [137]:
# Get column 2
col = df_2['two']
# Subtract from other columns
df_2.sub(col, axis=0)

Unnamed: 0,one,two
a,0.0,0.0
b,0.0,0.0
c,0.0,0.0
d,,0.0


In [138]:
# Check if empty
df_2.empty

False

In [139]:
# Transform executes a function on a dataframe
df_5 = pd.DataFrame({'A': range(3), 'B': range(1, 4)})
df_5

Unnamed: 0,A,B
0,0,1
1,1,2
2,2,3


In [140]:
df_5.transform(lambda x: x+1)

Unnamed: 0,A,B
0,1,2
1,2,3
2,3,4


In [141]:
df_5.transform(lambda x: x**2)

Unnamed: 0,A,B
0,0,1
1,1,4
2,4,9


In [142]:
df_5.transform(lambda x: np.sqrt(x))

Unnamed: 0,A,B
0,0.0,1.0
1,1.0,1.414214
2,1.414214,1.732051


In [149]:
# Passing a dictionary allows you to perform different calculations
# on different columns
df_5.transform({'A': lambda x: x+10, 'B': lambda x: x**3})

Unnamed: 0,A,B
0,10,1
1,11,8
2,12,27


In [150]:
# map performs a function on a series
df_5['A'].map(lambda x: x**2)

0    0
1    1
2    4
Name: A, dtype: int64

In [151]:
# applymap does the same element-wise on a whole dataframe
# (pandas 2.1+ renames it to DataFrame.map)
df_5.applymap(lambda x: x**2)

Unnamed: 0,A,B
0,0,1
1,1,4
2,4,9


In [152]:
# Get unique values in column 2 of DF
df_2['two'].unique()

array([1., 2., 3., 4.])

In [153]:
# Get number of uniques
df_2['two'].nunique()

4

In [154]:
# Get the number of times each value showed in column 2
df_2['two'].value_counts()

1.0    1
2.0    1
3.0    1
4.0    1
Name: two, dtype: int64

In [155]:
# Get column names
df_2.columns

Index(['one', 'two'], dtype='object')

In [156]:
# Get index info
df_2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [157]:
# Return a DF that lists null values as True
df_2.isnull()

Unnamed: 0,one,two
a,False,False
b,False,False
c,False,False
d,True,False
