In [3]:
# Continuation of video 5, part 1.

# Rebuild the small "people" dataset used in the previous video.

import pandas as pd

people = dict(
    first=['srinu', 'sow', 'abc'],
    last=['balireddy', 'yalla', 'abc'],
    email=['abc@abc.com', 'kbc@kbc.com', 'abc@abc.com'],
)

df = pd.DataFrame(data=people)
df

# Four popular methods for updating data in pandas:
#
#   apply
#   map
#   applymap
#   replace

Unnamed: 0,first,last,email
0,srinu,balireddy,abc@abc.com
1,sow,yalla,kbc@kbc.com
2,abc,abc,abc@abc.com


In [4]:
# The apply method

"""
it is used for calling a function on our values and apply can work on either 
a df or a series object. The behaviour might be different for each of those
different objects.
"""

# Let's look at how apply works on a Series:
# it calls the given function once for every value in the Series.
# Here we pass the built-in len to get the length of each email address.

df['email'].apply(len)

# The result holds one length per row: the email in row 0 has 11 characters,
# and so on for the remaining rows.


0    11
1    11
2    11
Name: email, dtype: int64

In [5]:
# updating the values using the apply method
# first, create a quick function that takes a str input and returns it in uppercase
# a function can be as complicated as it needs to be

def update_email(email):
    """Return ``email`` converted to upper case.

    A deliberately tiny example of a function that can be handed to
    ``Series.apply``, which then calls it once per value.
    """
    upper_cased = email.upper()
    return upper_cased


# Pass the function object itself (no parentheses): apply calls update_email
# once per value. Writing update_email() here would call the function
# immediately instead of handing it to apply.
df['email'].apply(update_email)


0    ABC@ABC.COM
1    KBC@KBC.COM
2    ABC@ABC.COM
Name: email, dtype: object

In [7]:
# The previous call only returned a new Series as its result;
# it did not change df.
# To store the change in df, assign the result back to the column.

df['email'] = df['email'].apply(update_email)
df

Unnamed: 0,first,last,email
0,srinu,balireddy,ABC@ABC.COM
1,sow,yalla,KBC@KBC.COM
2,abc,abc,ABC@ABC.COM


In [8]:
# A lambda works just as well as a named function.
# This one lower-cases every email and writes the result back to the column.

df['email']  = df['email'].apply(lambda x:x.lower())
df

Unnamed: 0,first,last,email
0,srinu,balireddy,abc@abc.com
1,sow,yalla,kbc@kbc.com
2,abc,abc,abc@abc.com


In [9]:
# Until now we have used apply on Series objects.
# Let's use apply on a DataFrame now.

"""
when we used apply() on series, it ran a function on all the values in the series
when we use apply() on df, it runs a function on each row and column of that df 
"""

# Example on a Series

df['email'].apply(len)   # length of each email in the Series (not displayed: it is not the cell's last expression)

# apply on a DataFrame

df.apply(len)

# In the result below the function is NOT applied to every individual value in df.
# Instead len is called on each Series of the DataFrame, i.e. on each column.
# So it reports that the 'first' column has length 3 (3 values): the number of rows in each column.

first    3
last     3
email    3
dtype: int64

In [10]:
# We get the same number if we manually check the length of one of these columns.

len(df['email'])

# The result below is what apply(len) computes on the DataFrame,
# except that apply does it for every column.

3

In [11]:
# Changing the axis makes apply work row by row instead:
# with axis='columns' the function is called once per row (across that row's columns).

df.apply(len, axis='columns')

# Here it reports that row 0 has length 3 (3 values, one per column), and likewise for the other rows.

0    3
1    3
2    3
dtype: int64

In [12]:
# apply can also run Series methods such as min (or max) on each Series of the DataFrame.

df.apply(pd.Series.min)

# Here the min function is run on every column, giving one minimum per column.

first            abc
last             abc
email    abc@abc.com
dtype: object

In [13]:
# A lambda can be used with apply on a DataFrame as well.
# Here x is each column (a Series), so x.min() gives that column's minimum.

df.apply(lambda x: x.min())

first            abc
last             abc
email    abc@abc.com
dtype: object

In [14]:
"""
################ APPLY MAP #########################
"""

# Recap:
#   apply on a Series calls the function on every value of the Series;
#   apply on a DataFrame calls the function on every Series in the DataFrame.

# To call a function on every individual value in the DataFrame, use applymap.

# applymap works only on DataFrames; Series objects do not have an applymap method.
# NOTE(review): newer pandas releases (2.1+) deprecate applymap in favour of
# DataFrame.map — confirm against the installed pandas version.

df.applymap(len)

# Here len is applied to each individual value of df.

Unnamed: 0,first,last,email
0,5,9,11
1,3,5,11
2,3,3,11


In [29]:
# Make every value in the DataFrame lower case.

df.applymap(str.lower)    # str.lower is passed without parentheses; applymap calls it with each value as its argument


# str.lower only accepts strings, so if df held numeric values this would raise an error.
# To handle such cases we have to write a custom function.

Unnamed: 0,first,last,email
0,srinu,balireddy,abc@abc.com
1,sow,yalla,kbc@kbc.com
2,abc,abc,abc@abc.com


In [30]:
"""
###################### MAP ###########################
"""

# To substitute some values in a column with others, we can use the map method.
# map accepts a dictionary of {old value: new value} pairs.
# Values that are not keys of the dictionary become NaN in the result.

df['first'].map({'srinu': 'srinu','sow':'Sow'})


0    srinu
1      Sow
2      NaN
Name: first, dtype: object

In [31]:
"""
###################### REPLACE ###########################
"""

# In the map result, the third value became NaN because only two values were passed to map.
# To avoid that, we can use the replace method: values not listed are left unchanged.

df['first'].replace({'srinu': 'SRinu'})

0    SRinu
1      sow
2      abc
Name: first, dtype: object

In [52]:
# Now let's go back to the Stack Overflow survey dataset and manipulate the data using pandas.
# This cell consolidates the CSV-reading code.
# Note: df is rebound here, so it no longer refers to the small people DataFrame.


import pandas as pd

df        = pd.read_csv('data/survey_results_public.csv')
schema_df = pd.read_csv('data/survey_results_schema.csv')

# Display settings: raise the column display limit to 85 and lower the row display limit to 5.
pd.set_option('display.max_columns', 85)
pd.set_option('display.max_rows', 5)

In [45]:
# Rename the ConvertedComp column to SalaryUSD.
# rename returns a new DataFrame; reassigning it to df (rather than using
# inplace=True) keeps the lineage explicit and the call chainable.

df = df.rename(columns={'ConvertedComp': 'SalaryUSD'})

In [46]:
# Display the renamed column to confirm it is now reachable as 'SalaryUSD'.
df['SalaryUSD']

0       NaN
         ..
88882   NaN
Name: SalaryUSD, Length: 88883, dtype: float64

In [53]:
# Change the Hobbyist values from 'Yes'/'No' strings to True/False.

df['Hobbyist'] = df['Hobbyist'].map({'Yes':True,'No':False})

df['Hobbyist']

# Remember: when we use map with a dictionary, any value that is not one of its keys becomes NaN.
# NOTE(review): for the same reason this cell looks non-idempotent — on a second run the
# column holds True/False, which are not keys here, so it would presumably become all NaN.
# Reload df before re-running this cell.

0         True
1        False
         ...  
88881    False
88882     True
Name: Hobbyist, Length: 88883, dtype: bool