In [6]:
import pandas as pd
import numpy as np

people = {
    "first": ["Corey", "Jane", "John", "Chris", np.nan, None, "NA"],
    "last": ["Schafer", "Doe", "Doe", "Schafer", np.nan, np.nan, "Missing"],
    "email": [
        "CoreyMSchafer@gmail.com",
        "JaneDoe@gmail.com",
        "JohnDoe@gmail.com",
        None,
        np.nan,
        "Anon@email.com",
        "NA",
    ],
    "age": ["33", "55", "63", "36", None, None, "Missing"],
}
df = pd.DataFrame(people)
df


Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@gmail.com,55
2,John,Doe,JohnDoe@gmail.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anon@email.com,
6,,Missing,,Missing


In [7]:
df.dropna()  # drop rows with any missing values
# default is axis='index', how='any', 'any' means drop rows with any missing value

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@gmail.com,55
2,John,Doe,JohnDoe@gmail.com,63
6,,Missing,,Missing


In [8]:
df.dropna(how="all")  # drop rows with all missing values

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@gmail.com,55
2,John,Doe,JohnDoe@gmail.com,63
3,Chris,Schafer,,36
5,,,Anon@email.com,
6,,Missing,,Missing


In [10]:
df.dropna(axis="columns", how="any")  # drop columns with any missing values

0
1
2
3
4
5
6


In [12]:
df.dropna(subset=["email"])
# drop rows where email is missing

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@gmail.com,55
2,John,Doe,JohnDoe@gmail.com,63
5,,,Anon@email.com,
6,,Missing,,Missing


In [13]:
df.dropna(subset=["last", "email"])
# drop rows where last OR email is missing

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@gmail.com,55
2,John,Doe,JohnDoe@gmail.com,63
6,,Missing,,Missing


In [14]:
df.dropna(how="all", subset=["last", "email"])
# drop rows where last AND email is missing


Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@gmail.com,55
2,John,Doe,JohnDoe@gmail.com,63
3,Chris,Schafer,,36
5,,,Anon@email.com,
6,,Missing,,Missing


# Custom Missing Values


In [18]:
df.replace(
    {"Missing": np.nan, "NA": np.nan}, inplace=True
)  # replace 'Missing' and 'NA' with NaN
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@gmail.com,55.0
2,John,Doe,JohnDoe@gmail.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anon@email.com,
6,,,,


In [19]:
df.isna()  # return a dataframe of boolean values, True if missing, False if not missing

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [20]:
df.fillna("MISSING")  # fill missing values with 'MISSING'

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@gmail.com,55
2,John,Doe,JohnDoe@gmail.com,63
3,Chris,Schafer,MISSING,36
4,MISSING,MISSING,MISSING,MISSING
5,MISSING,MISSING,Anon@email.com,MISSING
6,MISSING,MISSING,MISSING,MISSING


# DATA CASTING


In [24]:
df2 = df.dropna(
    subset=["age"]
)  # because Nan is a float, so the age column is float, and it cant be converted to int will give error
df2["age"] = df2["age"].astype(int)
df2["age"].mean()  # 46.75

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["age"] = df2["age"].astype(int)


46.75

In [25]:
df["age"] = df["age"].astype("float")
df["age"].mean()


46.75

In [26]:
na_vals = ["NA", "Missing"]
df = pd.read_csv(
    "data/survey_results_public.csv", index_col="Respondent", na_values=na_vals
)
df.head(2)


Unnamed: 0_level_0,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,EduOther,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,"Taught yourself a new language, framework, or ...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina,"Yes, full-time","Secondary school (e.g. American high school, G...",,Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,19.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult


In [31]:
df["YearsCode"].replace({"Less than 1 year": 0, "More than 50 years": 51}, inplace=True)
df["YearsCode"].unique()

array(['4', nan, '3', '16', '13', '6', '8', '12', '2', '5', '17', '10',
       '14', '35', '7', 0, '30', '9', '26', '40', '19', '15', '20', '28',
       '25', '1', '22', '11', '33', '50', '41', '18', '34', '24', '23',
       '42', '27', '21', '36', '32', '39', '38', '31', '37', 51, '29',
       '44', '45', '48', '46', '43', '47', '49'], dtype=object)

In [32]:
df["YearsCode"] = df["YearsCode"].astype(float)
df["YearsCode"].mean()

11.662114216834588

In [33]:
df["YearsCode"].median()

9.0