In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy dataset for the missing-data demo. It mixes genuine missing markers
# (np.nan / None) with string placeholders ('NA', 'Missing') that pandas does
# not treat as missing on its own. Ages are deliberately stored as strings.
people = dict(
    first=['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'],
    last=['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'],
    email=[
        'CoreyMSchafer@gmail.com',
        'JaneDoe@email.com',
        'JohnDoe@email.com',
        None,
        np.nan,
        'Anonymous@email.com',
        'NA',
    ],
    age=['33', '55', '63', '36', None, None, 'Missing'],
)

In [4]:
# Build the demo frame from the dict of columns, then show it as the cell output.
df = pd.DataFrame(data=people)
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [5]:
# With no arguments, dropna() removes every row that contains at least one missing value.
# Row 6 survives because 'NA' / 'Missing' are still ordinary strings at this point.
df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


In [7]:
# how='all': drop a row (index) only when every value in it is missing -- row 4 here.
df.dropna(axis='index', how='all')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [8]:
# how='any': drop a row (index) as soon as it contains a single missing value.
df.dropna(axis='index', how='any')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


In [9]:
# Same rule applied column-wise: every column has at least one missing value,
# so no column survives and only the index is left.
df.dropna(axis='columns', how='any')

0
1
2
3
4
5
6


In [10]:
# Restrict the check to the email column: only rows with a missing email are dropped.
df.dropna(axis='index', how='any', subset=['email'])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [11]:
# With how='all', a row is dropped only when both subset columns (last and email) are missing.
df.dropna(axis='index', how='all', subset=['last', 'email'])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [12]:
# Handling custom missing-value markers: turn the placeholder strings into real
# NaN so that isna() / dropna() / fillna() recognise them.
df.replace(['NA', 'Missing'], np.nan, inplace=True)
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [14]:
# Boolean mask of the frame: True wherever a value is missing (NaN / None).
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [13]:
# Same default dropna() call, but now that the placeholder strings have been
# replaced with NaN, row 6 is dropped as well.
df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63


In [15]:
# Show a copy in which every missing value is replaced by the string 'MISSING'.
# The result is not assigned back, so df itself keeps its NaN values.
df.fillna('MISSING')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,MISSING,36
4,MISSING,MISSING,MISSING,MISSING
5,MISSING,MISSING,Anonymous@email.com,MISSING
6,MISSING,MISSING,MISSING,MISSING


In [16]:
# Every column, including age, is stored as object: the ages are strings, not numbers.
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [18]:
# age holds strings, so df['age'].mean() raises an error at this point.
# Cast to float rather than int because the column contains NaN values.
df['age'] = df['age'].astype('float64')

In [19]:
# Confirm the cast: age should now be float64 while the other columns stay object.
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [20]:
# Mean age over the rows that have one; missing values are skipped by default.
df['age'].mean()

46.75

# Now trying the same techniques on the Stack Overflow survey data

In [23]:
# Strings that should be parsed as missing values while reading the CSV.
na_vals = ['NA', 'Missing']

# Survey answers, indexed by the Respondent column.
df = pd.read_csv(
    'Data/stack-overflow-developer-survey-2019/survey_results_public.csv',
    na_values=na_vals,
    index_col='Respondent',
)

# Schema file, indexed by its 'Column' field.
schema_df = pd.read_csv(
    'Data/stack-overflow-developer-survey-2019/survey_results_schema.csv',
    index_col='Column',
)

In [22]:
# Raise both display limits to 85 so wide/long frames are shown without truncation.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 85)

In [None]:
# Goal: calculate the average number of years of coding experience across all respondents

In [24]:
# Peek at the first 12 answers. The dtype is object, so the values are not numeric yet.
df['YearsCode'].head(12)

Respondent
1       4
2     NaN
3       3
4       3
5      16
6      13
7       6
8       8
9      12
10     12
11      2
12      5
Name: YearsCode, dtype: object

In [30]:
# Calling df['YearsCode'].mean() directly raises an error because the column is not numeric.
# A plain cast fails as well:
#   df['YearsCode'].astype(float)  ->  could not convert string to float: 'Less than 1 year'
# So list the distinct values to find the non-numeric answers.
df['YearsCode'].unique()

array(['4', nan, '3', '16', '13', '6', '8', '12', '2', '5', '17', '10',
       '14', '35', '7', 0, '30', '9', '26', '40', '19', '15', '20', '28',
       '25', '1', '22', '11', '33', '50', '41', '18', '34', '24', '23',
       '42', '27', '21', '36', '32', '39', '38', '31', '37',
       'More than 50 years', '29', '44', '45', '48', '46', '43', '47',
       '49'], dtype=object)

In [31]:
# Map the two free-text answers onto numbers so the column can be cast to float.
# 0 and 51 are the stand-in values chosen for the open-ended answers.
years_code_text_to_number = {'Less than 1 year': 0, 'More than 50 years': 51}

# Assign the result back to the column instead of calling
# replace(..., inplace=True) on the df['YearsCode'] selection: that form is
# chained assignment, which pandas warns about and which leaves df unchanged
# when Copy-on-Write is enabled (so the cast below would then fail).
df['YearsCode'] = df['YearsCode'].replace(years_code_text_to_number).astype(float)

In [32]:
# Mean years of coding experience. Missing answers are skipped by default;
# 'Less than 1 year' and 'More than 50 years' count as 0 and 51 respectively.
df['YearsCode'].mean()

11.662114216834588

In [33]:
# Median years of coding experience, for comparison with the mean.
df['YearsCode'].median()

9.0