In [1]:
import pandas as pd
# Strings that should be parsed as missing values (NaN) while reading the CSVs.
na_vals = ['NA', 'N/A', 'Missing']

# Folder that holds both survey files. Keeping the hard-coded local path in a
# single place means it only has to be edited once on another machine.
DATA_DIR = 'C:/Users/Lenovo/OneDrive/Desktop/LIBRARIES/PANDAS/WORKING MODULES'

# Survey answers and the accompanying schema file, both read with the same
# custom missing-value markers.
df = pd.read_csv(f'{DATA_DIR}/survey_results_public.csv', na_values=na_vals)
schema_df = pd.read_csv(f'{DATA_DIR}/survey_results_schema.csv', na_values=na_vals)

In [2]:
# Let pandas render up to 85 columns and 85 rows before truncating output.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, 85)

In [3]:
import numpy as np

In [4]:
# Use the 'Respondent' column as the row index.
# The guard keeps this cell re-runnable: once the column has been moved into
# the index, calling set_index('Respondent') again would raise a KeyError.
if df.index.name != 'Respondent':
    df = df.set_index('Respondent')

In [5]:
# Use the 'Column' column of the schema frame as its row index.
# The guard keeps this cell re-runnable: once the column has been moved into
# the index, calling set_index('Column') again would raise a KeyError.
if schema_df.index.name != 'Column':
    schema_df = schema_df.set_index('Column')

In [6]:
# Small hand-made data set that mixes real values with several flavours of
# "missing": np.nan, None, and the placeholder strings 'NA', 'N/A', 'Missing'.
people = {
    'first': ['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'],
    'last': ['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'],
    'email': [
        'Corey.Schafer@gmail.com',
        'Jane.Doe@gmail.com',
        'John.Doe@gmail.com',
        None,
        np.nan,
        'N/A',
        'Missing',
    ],
    'age': ['36', '55', '63', '36', None, None, 'Missing'],
}

# Build the frame and swap the placeholder strings for real NaN values so
# pandas recognises them as missing data.
people_df = pd.DataFrame(people).replace(
    {'Missing': np.nan, 'NA': np.nan, 'N/A': np.nan}
)

In [7]:
# isna() gives back a frame of the same shape holding True wherever pandas
# treats the value as missing (NaN/None) and False everywhere else.

pd.isna(people_df)

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,True,True
6,True,True,True,True


Both `None` and `NaN` are treated as missing values.

In [8]:
# Display the current contents of people_df.
people_df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,Corey.Schafer@gmail.com,36.0
1,Jane,Doe,Jane.Doe@gmail.com,55.0
2,John,Doe,John.Doe@gmail.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,,
6,,,,


In [9]:
# how='any' with axis='index': drop every row that has at least one missing value.
people_df.dropna(how='any', axis='index')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,Corey.Schafer@gmail.com,36
1,Jane,Doe,Jane.Doe@gmail.com,55
2,John,Doe,John.Doe@gmail.com,63


In [10]:
# how='all' with axis='index': drop only the rows in which every value is missing.
people_df.dropna(how='all', axis='index')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,Corey.Schafer@gmail.com,36
1,Jane,Doe,Jane.Doe@gmail.com,55
2,John,Doe,John.Doe@gmail.com,63
3,Chris,Schafer,,36


In [11]:
# how='all' with axis='columns': drop a column only when all of its values are missing.
people_df.dropna(how='all', axis='columns')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,Corey.Schafer@gmail.com,36.0
1,Jane,Doe,Jane.Doe@gmail.com,55.0
2,John,Doe,John.Doe@gmail.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,,
6,,,,


In [12]:
# how='any' with axis='columns': drop every column that contains at least one
# missing value. Each column here has one, so only the index is left.
people_df.dropna(how='any', axis='columns')

0
1
2
3
4
5
6


In [13]:
# Drop rows based on missing values in specific columns only:
# the `subset` argument lists the columns that dropna should inspect.

people_df.dropna(subset=['email'], how='any', axis='index')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,Corey.Schafer@gmail.com,36
1,Jane,Doe,Jane.Doe@gmail.com,55
2,John,Doe,John.Doe@gmail.com,63


In [14]:
# With how='all', a row is dropped only when BOTH 'email' and 'last' are missing.
# With how='any' the subset would behave like an "or": a row would be dropped
# as soon as either of the two columns is missing.

people_df.dropna(subset=['email', 'last'], how='all', axis='index')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,Corey.Schafer@gmail.com,36
1,Jane,Doe,Jane.Doe@gmail.com,55
2,John,Doe,John.Doe@gmail.com,63
3,Chris,Schafer,,36


In [15]:
# Treating custom missing values -- there are two ways to do it.
# Because this DataFrame was built by hand, the placeholder strings can simply
# be replaced with NaN in the cell where the frame was created.
# (The other way, shown further down, applies while loading a CSV file.)

people_df.dropna(subset=['email', 'last'], how='all', axis='index')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,Corey.Schafer@gmail.com,36
1,Jane,Doe,Jane.Doe@gmail.com,55
2,John,Doe,John.Doe@gmail.com,63
3,Chris,Schafer,,36


In [16]:
# fillna(<value>) returns a copy of the frame in which every missing value has
# been replaced by <value>. A string works as well, for example:
#     people_df.fillna('this is Missing')

people_df.fillna(value=0)

Unnamed: 0,first,last,email,age
0,Corey,Schafer,Corey.Schafer@gmail.com,36
1,Jane,Doe,Jane.Doe@gmail.com,55
2,John,Doe,John.Doe@gmail.com,63
3,Chris,Schafer,0,36
4,0,0,0,0
5,0,0,0,0
6,0,0,0,0


In [17]:
# Casting data types
# np.nan is a float, while None has its own type (NoneType). Since the
# missing-value marker is a float, a column of numeric strings that contains
# missing values should be cast to float before doing any arithmetic on it.

for missing_marker in (np.nan, None):
    print(type(missing_marker))



<class 'float'>
<class 'NoneType'>


In [23]:
# Look at the 'age' column on its own.
# NOTE(review): this cell's execution count (23) is out of sequence; the
# float64 output suggests it was last run after the cast in the next cell.
people_df.loc[:, 'age']

0    36.0
1    55.0
2    63.0
3    36.0
4     NaN
5     NaN
6     NaN
Name: age, dtype: float64

In [18]:
# Converting the data type of a DataFrame column.
# The column holds missing values (NaN is a float), so it is cast to float;
# the int variant is kept below only for reference:
#     people_df['age'] = people_df['age'].astype(int)

people_df['age'] = people_df['age'].astype('float64')

In [19]:
# Check the 'age' column after the cast: the dtype should now be float64.
people_df.loc[:, 'age']

0    36.0
1    55.0
2    63.0
3    36.0
4     NaN
5     NaN
6     NaN
Name: age, dtype: float64

In [20]:
# Average age; missing values are skipped (skipna=True is the default).
people_df['age'].mean(skipna=True)

47.5

In [21]:
# Handling custom missing values while loading a CSV file:
# pass read_csv a list of the strings that should be treated as missing
# (the na_values argument used when the survey files were loaded).

df['YearsCode'].head(n=10)

Respondent
1       4
2     NaN
3       3
4       3
5      16
6      13
7       6
8       8
9      12
10     12
Name: YearsCode, dtype: object

In [22]:
# First attempt at casting 'YearsCode' to float. It is expected to fail at this
# point because the column still contains text answers such as
# 'Less than 1 year'. The error is caught and shown rather than raised, so
# "Restart & Run All" can carry on to the cells that clean the column.
try:
    df['YearsCode'] = df['YearsCode'].astype(float)
except ValueError as cast_error:
    print(f'{type(cast_error).__name__}: {cast_error}')

ValueError: could not convert string to float: 'Less than 1 year'

In [None]:
# List every distinct value in the column to spot the non-numeric answers.
df.loc[:, 'YearsCode'].unique()

array(['4', nan, '3', '16', '13', '6', '8', '12', '2', '5', '17', '10',
       '14', '35', '7', 'Less than 1 year', '30', '9', '26', '40', '19',
       '15', '20', '28', '25', '1', '22', '11', '33', '50', '41', '18',
       '34', '24', '23', '42', '27', '21', '36', '32', '39', '38', '31',
       '37', 'More than 50 years', '29', '44', '45', '48', '46', '43',
       '47', '49'], dtype=object)

In [None]:
# Map the two text answers to numbers so the column can be cast to float.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['YearsCode']: that form operates on a
# temporary selection, which triggers a chained-assignment FutureWarning in
# recent pandas and leaves df unchanged when Copy-on-Write is enabled.
df['YearsCode'] = df['YearsCode'].replace(
    {'Less than 1 year': 0, 'More than 50 years': 51}
)

In [None]:
# With the text answers replaced by numbers, cast the column to float
# (float rather than int because the column contains NaN).
df['YearsCode'] = df['YearsCode'].astype('float64')

In [None]:
# Average number of years coding; missing answers are skipped (skipna=True is the default).
df['YearsCode'].mean(skipna=True)

11.662114216834588