# 7. Function Application

### Examples

In [18]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Fetch the four seaborn example datasets used throughout this section.
tips, flights, titanic, iris = (
    sns.load_dataset(name) for name in ('tips', 'flights', 'titanic', 'iris')
)

# Report each frame's (rows, columns) so the load can be sanity-checked.
print(
    "Datasets loaded:",
    f"Tips shape: {tips.shape}",
    f"Flights shape: {flights.shape}",
    f"Titanic shape: {titanic.shape}",
    f"Iris shape: {iris.shape}",
    sep="\n",
)

Datasets loaded:
Tips shape: (244, 7)
Flights shape: (144, 3)
Titanic shape: (891, 15)
Iris shape: (150, 5)


In [19]:
# Preview the tips dataset (the rendered table elides the middle rows).
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [20]:
# Preview the flights dataset: one passenger count per (year, month).
flights

Unnamed: 0,year,month,passengers
0,1949,Jan,112
1,1949,Feb,118
2,1949,Mar,132
3,1949,Apr,129
4,1949,May,121
...,...,...,...
139,1960,Aug,606
140,1960,Sep,508
141,1960,Oct,461
142,1960,Nov,390


In [6]:
# Preview the titanic dataset (the rendered table elides the middle rows).
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [7]:
# Preview the iris dataset (the rendered table elides the middle rows).
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


---

In [25]:
# Column-wise maximum of the numeric flights columns.
# Passing the name "max" lets pandas dispatch to its vectorised, NaN-skipping
# Series.max instead of running Python's builtin max() over every element.
max_column = flights.select_dtypes(include=[np.number]).apply("max", axis=0)
max_column

year          1960
passengers     622
dtype: int64

In [7]:
# Column-wise minimum of the numeric flights columns.
# Passing the name "min" lets pandas dispatch to its vectorised, NaN-skipping
# Series.min instead of running Python's builtin min() over every element.
min_column = flights.select_dtypes(include=[np.number]).apply("min", axis=0)
min_column

year          1949
passengers     104
dtype: int64

In [8]:
# Restrict tips to its numeric columns, then reduce each column with np.mean.
numeric_cols = tips.select_dtypes(include=[np.number])

print("Mean of numeric columns:", numeric_cols.apply(np.mean), sep="\n")


Mean of numeric columns:
total_bill    19.785943
tip            2.998279
size           2.569672
dtype: float64


In [9]:
# np.std is called with its default ddof=0 (population standard deviation), so
# these values come out slightly smaller than the sample figures that the
# 'std' string name produces elsewhere in this notebook.
print(
    "\nStandard deviation of numeric columns:",
    numeric_cols.apply(np.std),
    sep="\n",
)


Standard deviation of numeric columns:
total_bill    8.884151
tip           1.380800
size          0.949149
dtype: float64


In [10]:
# Apply custom function
def coefficient_of_variation(series):
    """Return the coefficient of variation of ``series``.

    The coefficient of variation is the standard deviation (as computed by
    ``series.std()``) divided by the mean (``series.mean()``).
    """
    spread = series.std()
    centre = series.mean()
    return spread / centre

# Evaluate the custom helper once per numeric column and show the result.
print(
    "\nCoefficient of Variation:",
    numeric_cols.apply(coefficient_of_variation),
    sep="\n",
)


Coefficient of Variation:
total_bill    0.449936
tip           0.461478
size          0.370125
dtype: float64


In [11]:
# An anonymous function works too: spread of each column as max minus min.
print(
    "\nRange (max - min):",
    numeric_cols.apply(lambda col: col.max() - col.min()),
    sep="\n",
)


Range (max - min):
total_bill    47.74
tip            9.00
size           5.00
dtype: float64


In [8]:
# A list of function names yields one output row per statistic.
print(
    "\nMultiple statistics:",
    numeric_cols.apply(["mean", "std", "min", "max"]),
    sep="\n",
)


Multiple statistics:
      total_bill        tip      size
mean   19.785943   2.998279  2.569672
std     8.902412   1.383638  0.951100
min     3.070000   1.000000  1.000000
max    50.810000  10.000000  6.000000


In [14]:
# List every column name in the titanic dataset.
titanic.columns.to_list()

['survived',
 'pclass',
 'sex',
 'age',
 'sibsp',
 'parch',
 'fare',
 'embarked',
 'class',
 'who',
 'adult_male',
 'deck',
 'embark_town',
 'alive',
 'alone']

In [12]:
# Count the distinct values in every titanic column (axis=0 hands each column
# to the function as a Series).
column_stats = titanic.apply(pd.Series.nunique, axis=0)
print("Unique values per column:", column_stats, sep="\n")


Unique values per column:
survived         2
pclass           3
sex              2
age             88
sibsp            7
parch            7
fare           248
embarked         3
class            3
who              3
adult_male       2
deck             7
embark_town      3
alive            2
alone            2
dtype: int64
Unique values per column:
survived         2
pclass           3
sex              2
age             88
sibsp            7
parch            7
fare           248
embarked         3
class            3
who              3
adult_male       2
deck             7
embark_town      3
alive            2
alone            2
dtype: int64


In [10]:
# NOTE(review): this cell repeats the earlier "Multiple statistics" cell.
# A list of function names yields one output row per statistic.
print(
    "\nMultiple statistics:",
    numeric_cols.apply(["mean", "std", "min", "max"]),
    sep="\n",
)


Multiple statistics:
      total_bill        tip      size
mean   19.785943   2.998279  2.569672
std     8.902412   1.383638  0.951100
min     3.070000   1.000000  1.000000
max    50.810000  10.000000  6.000000


### Worksheet

In [14]:
# Location of the GDP-per-capita workbook. Set the GDP_DATA_FILE environment
# variable to point at your own copy; otherwise the original author's local
# path is used unchanged.
file = os.environ.get(
    "GDP_DATA_FILE",
    r'/Users/teslim/TeslimWorkSpace/TheMainData/TheVariousSources/Country GDP-perCapital 1952 - 2007.xlsx',
)

In [28]:
# Read the workbook at `file` into a DataFrame and preview its first rows.
df = pd.read_excel(file)
df.head()

Unnamed: 0,country,year,pop,continent,lifeExp,gdpPercap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.85303
2,Afghanistan,1962,10267083,Asia,31.997,853.10071
3,Afghanistan,1967,11537966,Asia,34.02,836.197138
4,Afghanistan,1972,13079460,Asia,36.088,739.981106


In [30]:
def change_column_header(df):
    """Normalise the column labels of ``df`` to lower snake_case, in place.

    Each label has surrounding whitespace removed, is lower-cased, and has its
    remaining spaces replaced by underscores. Stripping happens first so that
    leading/trailing spaces are dropped rather than turned into stray
    underscores.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` are string labels; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(" ", "_", regex=False)  # literal space, not a pattern
    )
    return df


# Normalise the headers. The function reassigns `df.columns` on the frame it
# receives, so `df` itself is updated even though the return value is not stored.
change_column_header(df)

Unnamed: 0,country,year,pop,continent,lifeexp,gdppercap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1967,11537966,Asia,34.020,836.197138
4,Afghanistan,1972,13079460,Asia,36.088,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,1987,9216418,Africa,62.351,706.157306
1700,Zimbabwe,1992,10704340,Africa,60.377,693.420786
1701,Zimbabwe,1997,11404948,Africa,46.809,792.449960
1702,Zimbabwe,2002,11926563,Africa,39.989,672.038623


In [31]:
# Confirm the normalised header names.
df.columns.to_list()

['country', 'year', 'pop', 'continent', 'lifeexp', 'gdppercap']

In [32]:
# Inspect each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   year       1704 non-null   int64  
 2   pop        1704 non-null   int64  
 3   continent  1704 non-null   object 
 4   lifeexp    1704 non-null   float64
 5   gdppercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


In [34]:
# Split the frame into numeric and non-numeric parts. np.number matches every
# numeric width (not only float64/int64), so the two halves stay complementary,
# and it matches the selector used for the seaborn examples earlier on.
numerical_columns = df.select_dtypes(include=[np.number])
categorical_columns = df.select_dtypes(exclude=[np.number])

In [35]:
# Preview the numeric subset.
numerical_columns

Unnamed: 0,year,pop,lifeexp,gdppercap
0,1952,8425333,28.801,779.445314
1,1957,9240934,30.332,820.853030
2,1962,10267083,31.997,853.100710
3,1967,11537966,34.020,836.197138
4,1972,13079460,36.088,739.981106
...,...,...,...,...
1699,1987,9216418,62.351,706.157306
1700,1992,10704340,60.377,693.420786
1701,1997,11404948,46.809,792.449960
1702,2002,11926563,39.989,672.038623


In [41]:
# One row per statistic. String names select pandas' own vectorised reductions;
# passing the builtins max/min or np.mean in a list relies on an alias lookup
# that newer pandas versions warn about.
numerical_columns.apply(["max", "min", "mean"], axis=0)

Unnamed: 0,year,pop,lifeexp,gdppercap
max,2007.0,1318683000.0,82.603,113523.1329
min,1952.0,60011.0,23.599,241.165876
mean,1979.5,29601210.0,59.474439,7215.327081


In [42]:
# Per-column standard deviation via np.std, which uses its default ddof=0
# (population form); Series.std() uses ddof=1, which is why the std figures
# from coefficient_of_variation further down are slightly larger.
numerical_columns.apply(np.std, axis=0)

year         1.726026e+01
pop          1.061267e+08
lifeexp      1.291332e+01
gdppercap    9.854562e+03
dtype: float64

In [43]:
# Number of distinct values in each non-numeric column.
column_stats = categorical_columns.apply(pd.Series.nunique, axis=0)
column_stats

country      142
continent      5
dtype: int64

In [45]:
# Number of distinct values in every column of the full frame.
unique = df.apply(pd.Series.nunique, axis=0)
unique

country       142
year           12
pop          1704
continent       5
lifeexp      1626
gdppercap    1704
dtype: int64

In [46]:
def coefficient_of_variation(col):
    """Return ``(std, mean, std / mean)`` for one column.

    NOTE(review): this redefines the helper of the same name from the
    Examples section, which returned only the ratio. After this cell runs,
    the name refers to this tuple-returning version.

    Parameters
    ----------
    col : pandas.Series
        Numeric column to summarise.

    Returns
    -------
    tuple
        ``(col.std(), col.mean(), col.std() / col.mean())`` in that order.
    """
    # Compute each statistic once and reuse it for the ratio.
    spread = col.std()
    centre = col.mean()
    return spread, centre, spread / centre

In [47]:
# Apply the helper to every numeric column, then transpose so each row is one
# source column. The positional labels 0, 1, 2 are the tuple slots returned by
# coefficient_of_variation: std, mean, std / mean.
numerical_columns.apply(coefficient_of_variation, axis=0).T.head()

Unnamed: 0,0,1,2
year,17.26533,1979.5,0.008722
pop,106157900.0,29601210.0,3.586269
lifeexp,12.91711,59.47444,0.217188
gdppercap,9857.455,7215.327,1.366183


In [48]:
# Show the numeric subset again for reference.
numerical_columns


Unnamed: 0,year,pop,lifeexp,gdppercap
0,1952,8425333,28.801,779.445314
1,1957,9240934,30.332,820.853030
2,1962,10267083,31.997,853.100710
3,1967,11537966,34.020,836.197138
4,1972,13079460,36.088,739.981106
...,...,...,...,...
1699,1987,9216418,62.351,706.157306
1700,1992,10704340,60.377,693.420786
1701,1997,11404948,46.809,792.449960
1702,2002,11926563,39.989,672.038623


In [49]:
def change_columns(col, n_largest=5, n_smallest=6):
    """Return the largest and smallest entries of a column.

    Parameters
    ----------
    col : pandas.Series
        Column to inspect.
    n_largest : int, default 5
        How many of the largest values to return.
    n_smallest : int, default 6
        How many of the smallest values to return.
        NOTE(review): the defaults are asymmetric (5 vs 6) because those were
        the originally hard-coded counts; confirm whether 6 was intended.

    Returns
    -------
    tuple of pandas.Series
        ``(col.nlargest(n_largest), col.nsmallest(n_smallest))``.
    """
    largest = col.nlargest(n_largest)
    smallest = col.nsmallest(n_smallest)
    return largest, smallest

In [50]:
# Collect the (largest, smallest) pair for every numeric column; after
# transposing, column 0 holds the nlargest Series and column 1 the nsmallest.
# NOTE(review): the name `ratio` is misleading — no ratio is computed here.
ratio  = numerical_columns.apply(change_columns, axis=0)
ratio.T


Unnamed: 0,0,1
year,11 2007 23 2007 35 2007 47 2007 59...,0 1952 12 1952 24 1952 36 1952 48...
pop,299 1318683096 298 1280400000 297 123...,1296 60011 1297 61325 420 63149 1298...
lifeexp,803 82.603 671 82.208 802 82.000 6...,1292 23.599 0 28.801 552 30.000 3...
gdppercap,853 113523.13290 856 109347.86700 852 ...,334 241.165876 335 277.551859 876 298...
