In [40]:
import pandas as pd
import os

In [41]:
# Toy input for the examples below: two integer columns, 'A' and 'B', four rows each.
data = dict(A=[2, 3, 4, 3], B=[5, 3, 7, 5])

In [42]:
df = pd.DataFrame(data)
df

Unnamed: 0,A,B
0,2,5
1,3,3
2,4,7
3,3,5


#### apply() --- creating new col 
.apply() is used to apply a function along an axis of a DataFrame or on values in a Series.

In [43]:
# apply with custom function

In [45]:
def fun(df):
    """Return the sum of the 'A' and 'B' entries of ``df``.

    Despite the parameter name, this is called with a single row when it is
    passed to ``DataFrame.apply(..., axis=1)``; any object that supports
    ``obj['A']`` and ``obj['B']`` works.
    """
    a_value, b_value = df['A'], df['B']
    return a_value + b_value

# axis=1 calls fun once per row, passing that row as a Series; the returned values become the new column.
df['row_sum'] = df.apply(fun, axis=1)
df

Unnamed: 0,A,B,row_sum
0,2,5,7
1,3,3,6
2,4,7,11
3,3,5,8


In [8]:
# apply with lambda

In [9]:
df['new_col'] = df['A'].apply(lambda x:x**2)

In [10]:
df

Unnamed: 0,A,B,row_sum,new_col
0,2,5,7,4
1,3,3,6,9
2,4,7,11,16
3,3,5,8,9


### -----------------------------------------------------------------------------------------------------------------------------

#### replace()

#### single value replacement

In [12]:
df.replace(3,10)                  # replace every occurrence of the value 3 with 10 across the whole DataFrame; returns a new DataFrame, df itself is not modified

Unnamed: 0,A,B,row_sum,new_col
0,2,5,7,4
1,10,10,6,9
2,4,7,11,16
3,10,5,8,9


#### multiple value replacement

In [13]:
df.replace([3,2], 10)            # pass a list of values to replace and a single value to replace them all with

Unnamed: 0,A,B,row_sum,new_col
0,10,5,7,4
1,10,10,6,9
2,4,7,11,16
3,10,5,8,9


#### replacement using dictionary

In [14]:
df.replace({11:111, 16:166, 7:777})

Unnamed: 0,A,B,row_sum,new_col
0,2,5,777,4
1,3,3,6,9
2,4,777,111,166
3,3,5,8,9


#### replacement in specific col

In [20]:
df1= pd.read_csv(os.path.join('..','sales.csv'), encoding='latin1')
df1.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [26]:
# Replace values in one column only, then assign the result back to that column.
# Calling replace(..., inplace=True) on the df1['Country'] selection is chained
# assignment: pandas 2.x warns about it, and under Copy-on-Write it no longer
# updates df1 at all.
df1['Country'] = df1['Country'].replace(['United Kingdom', 'France'], 'India')
df1.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,India
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,India
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,India
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,India
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,India


#### Replacement Using Regular Expressions

In [30]:
 df_str = pd.DataFrame({'col': ['apple', 'banapna', 'apricot']})
df_str

Unnamed: 0,col
0,apple
1,banapna
2,apricot


In [39]:
df_str.replace(r'^ap', 'new', regex=True)      # '^' anchors the pattern to the start of the string; without it, 'ap' would be replaced wherever it occurs (e.g. inside 'banapna')

Unnamed: 0,col
0,newple
1,banapna
2,newricot


### -----------------------------------------------------------------------------------------------------------------------------

#### MAP

In [53]:
df['A'].map({3:'three', 2:'two'})                   #map always returns a new series.

0      two
1    three
2      NaN
3    three
Name: A, dtype: object

In [54]:
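# Values not found as keys in the dictionary become NaN in the output Series.
# na_action='ignore' does NOT change that: it only leaves NaN values that are already
# present in the input untouched (they are propagated without being passed to the mapping).
# To keep unmatched values instead of getting NaN, use e.g. df['A'].map(mapping).fillna(df['A']).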

## Please check the `na_action` parameter of `Series.map`

#### another example of map function for more complex transformation

In [55]:
num = pd.Series([10, 25, 5, 40])
num

0    10
1    25
2     5
3    40
dtype: int64

In [56]:
def check(i):
    """Classify a number by size.

    Returns 'Small' for values below 15, 'Medium' for values from 15 up to
    (but not including) 30, and 'Large' for everything else.
    """
    if i < 15:
        return 'Small'
    # Reaching this point already means i is not below 15.
    if i < 30:
        return 'Medium'
    return 'Large'

In [57]:
num.map(check)

0     Small
1    Medium
2     Small
3     Large
dtype: object

### -----------------------------------------------------------------------------------------------------------------------------

#### Assign : used to add new columns to a DataFrame or modify existing ones.

In [73]:
df

Unnamed: 0,A,B,row_sum
0,2,5,7
1,3,3,6
2,4,7,11
3,3,5,8


In [69]:
df.assign(C= df['A'] + df['B'], D=100)           # creating new

Unnamed: 0,A,B,row_sum,C,D
0,2,5,7,7,100
1,3,3,6,6,100
2,4,7,11,11,100
3,3,5,8,8,100


In [75]:
df.assign(A = df['A']+1)                     # modifying existing

Unnamed: 0,A,B,row_sum
0,3,5,7
1,4,3,6
2,5,7,11
3,4,5,8


#### Note: `apply`, `map` and `assign` have no `inplace` parameter; they always return a new DataFrame/Series

#### TASK

In [76]:
rect = pd.DataFrame({'length':[2,3,4], 'width':[3,4,5]})
rect

Unnamed: 0,length,width
0,2,3
1,3,4
2,4,5


#### use apply() and assign() to create new cols 'Area' and 'Perimeter' and return a dataframe

#### apply

In [84]:
def calculate_area(x):
    """Return the product of the 'length' and 'width' entries of ``x``."""
    length = x['length']
    width = x['width']
    return length * width

In [86]:
rect['Area'] = rect.apply(calculate_area, axis=1)
rect

Unnamed: 0,length,width,Area
0,2,3,6
1,3,4,12
2,4,5,20


#### assign

In [91]:
# assign() returns a new DataFrame with the extra column; the result is not stored here, so rect itself stays unchanged.
rect.assign(Perimeter = 2 * (rect['length'] + rect['width']))

Unnamed: 0,length,width,Area,Perimeter
0,2,3,6,10
1,3,4,12,14
2,4,5,20,18


In [92]:
rect

Unnamed: 0,length,width,Area
0,2,3,6
1,3,4,12
2,4,5,20


The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [121]:
df

Unnamed: 0,A,B,row_sum
0,2,5,7
1,3,3,6
2,4,7,11
3,3,5,8


In [126]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [10, 20, 30]})
df

Unnamed: 0,A,B
0,1,10
1,2,20
2,3,30


In [144]:
# Row-wise product of columns A and B, kept as its own Series so that the
# following cell can display it.
df_row_products = df['A'] * df['B']
# assign() returns a new DataFrame and never modifies df in place, so the
# result is bound back to df; otherwise the new column would be discarded and
# the later `df` cell could not show it after a fresh Restart & Run All.
df = df.assign(df_row_products=df_row_products)
df

Unnamed: 0,A,B,df_row_products
0,1,10,10
1,2,20,40
2,3,30,90


In [136]:
df_row_products

0    10
1    40
2    90
dtype: int64

In [137]:
df

Unnamed: 0,A,B,df_row_products
0,1,10,10
1,2,20,40
2,3,30,90
