In [None]:
# sometimes we can write more than one instruction in one cell, as below

# this instruction is used to import the library pandas so we can use it later
# "as pd" means that we can use the term pd to use the library instead of typing pandas
import pandas as pd

# pd.read_csv loads a CSV file (our dataset) into a DataFrame.
# The path is relative, so the notebook and "Salaries.csv" must live in the
# same folder; otherwise pass the full path to the file instead.
salaries = pd.read_csv("Salaries.csv", low_memory=False)

# Build a sub-dataset that keeps only some of the instances (rows).
# The boolean condition below is True for the rows whose Year is 2014,
# and .loc keeps exactly those rows.
Salaries_2014 = salaries.loc[salaries["Year"] == 2014]

# Preview the first five rows of the 2014 subset.
Salaries_2014.head(5)

In [None]:
# Element-wise missing-value check (.isna() is the same method as .isnull()).
# The result has one True/False per cell of the dataset, which is usually
# too detailed to be useful on its own.
Salaries_2014.isna()

In [None]:
# Collapse the mask down each column: one True/False per feature, telling us
# whether that feature contains at least one null value.
# (.isna() is the same method as .isnull(); axis=0 is the default for .any().)
Salaries_2014.isna().any(axis=0)

In [None]:
# Applying .any() twice reduces first per feature, then across features,
# giving a single True/False answer for the whole dataset.
# (.isna() is the same method as .isnull().)
Salaries_2014.isna().any(axis=0).any()

In [None]:
# Knowing *whether* nulls exist is often not enough; we also want to know
# *how many* there are. Summing the boolean mask (True counts as 1) gives
# the number of null values in each feature.
# (.isna() is the same method as .isnull(); axis=0 is the default for .sum().)
Salaries_2014.isna().sum(axis=0)

In [None]:
# If a feature is not important for any reason (not useful, contains only
# null values, etc.) we can simply drop the feature (column) itself.
# columns= names the axis explicitly instead of the less readable axis=1.
# errors="ignore" keeps this cell safe to re-run: the cell rebinds
# Salaries_2014, so on a second run "Notes" is already gone and a plain
# .drop("Notes", axis=1) would raise KeyError.
Salaries_2014 = Salaries_2014.drop(columns="Notes", errors="ignore")
Salaries_2014

In [None]:
# We can also drop the instances (rows) that have null values with .dropna().
# axis=0 means "drop rows" and how="any" means "drop a row as soon as it has
# at least one null"; both are the defaults, spelled out here for clarity.
Salaries_2014 = Salaries_2014.dropna(axis=0, how="any")
Salaries_2014