In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("E:/datasets/nba.csv")
df.head()

Unnamed: 0,id,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,1,Avery Bradley,Boston Celtics,0,PG,25,06-Feb,180,Texas,7730337.0
1,2,Jae Crowder,Boston Celtics,99,SF,25,06-Jun,235,Marquette,6796117.0
2,3,John Holland,Boston Celtics,30,SG,27,06-May,205,Boston University,
3,4,R.J. Hunter,Boston Celtics,28,SG,22,06-May,185,Georgia State,1148640.0
4,5,Jonas Jerebko,Boston Celtics,8,PF,29,06-Oct,231,,5000000.0


**____________________________________________________________________________________________________________________________________________________**

**Pandas.apply()**

**1)Pandas.apply allow the users to pass a function and apply it on every single value of the Pandas series.**

**2)It comes as a huge improvement for the pandas library as this function helps to segregate data according to the conditions required due to which it is efficiently used in data science and machine learning.**

**____________________________________________________________________________________________________________________________________________________**


In [5]:
# defining function to check price
def fun(num):
 
    if num<200:
        return "Low"
 
    elif num>= 200 and num<400:
        return "Normal"
 
    else:
        return "High"
 
# passing function to apply and storing returned series in new
new = df["Weight"].apply(fun)
 
# printing first 3 element
print(new.head(3))
 
# printing elements somewhere near the middle of series
print(new[14], new[15], new[16])
 
# printing last 3 elements
print(new.tail(3))

0       Low
1    Normal
2    Normal
Name: Weight, dtype: object
Normal Normal Low
454       Low
455    Normal
456    Normal
Name: Weight, dtype: object


In [4]:
new = df["Number"].apply(lambda num: num+5)
new

0        5
1      104
2       35
3       33
4       13
      ... 
452     46
453     13
454     30
455     26
456     29
Name: Number, Length: 457, dtype: int64

In [7]:
data = {
        'Age': [1, 2, 3],
        'Weight': [4, 5, 6],
        'Salary': [7, 8, 9]}
 
# Convert the dictionary into DataFrame
df = pd.DataFrame(data)
print("Original DataFrame:\n", df)

def add_values(df):
    return df["Age"] + df["Weight"] + df["Salary"] 

df["add_values"] = df.apply(add_values, axis=1)
df["add_values2"] = df.apply(np.sum, axis=1)
print("\nDataFrame after adding values:\n", df)


Original DataFrame:
    Age  Weight  Salary
0    1       4       7
1    2       5       8
2    3       6       9

DataFrame after adding values:
    Age  Weight  Salary  add_values  add_values2
0    1       4       7          12           24
1    2       5       8          15           30
2    3       6       9          18           36


**Normalizing DataFrame Column Values Using Custom Function in Pandas**

In [26]:
import pandas as pd

def normalize(x,y):
    print("(x - np.mean([x, y]):",(x - np.mean([x, y])))
    print("max(x, y) - min(x, y):", (max(x, y) - min(x, y)))
    X_new = ((x - np.mean([x, y])) /(max(x, y) - min(x, y)))

    return X_new

def main():
    # create a dictionary with three fields each
    data = {
        'X': [1, 2, 3],
        'Y': [45, 65, 89]}
 
    # Convert the dictionary into DataFrame
    df = pd.DataFrame(data)
    print("Original DataFrame:\n", df)
    df["X"] = df.apply(lambda row: normalize(row["X"], row["Y"]),axis =1)
    print("|Normalized dataset:\n",df)
    

if __name__ == "__main__":
    main()


Original DataFrame:
    X   Y
0  1  45
1  2  65
2  3  89
(x - np.mean([x, y]): -22.0
max(x, y) - min(x, y): 44
(x - np.mean([x, y]): -31.5
max(x, y) - min(x, y): 63
(x - np.mean([x, y]): -43.0
max(x, y) - min(x, y): 86
|Normalized dataset:
      X   Y
0 -0.5  45
1 -0.5  65
2 -0.5  89


**Applying Range Generation Function to DataFrame Rows in Pandas**

In [31]:
import pandas as pd
import numpy as np

pd.options.mode.chained_assignment = None

# Function to generate range

def generate_range(n):

	# printing the range for eg:
	# input is 67 output is 60-70
	n = int(n)

	lower_limit = n//10 * 10
	upper_limit = lower_limit + 10

	return str(str(lower_limit) + '-' + str(upper_limit))


def replace(row):
	for i, item in enumerate(row):

		# updating the value of the row
		row[i] = generate_range(item)
	return row

def main():
	# create a dictionary with
	# three fields each
	data = {
		'A': [0, 2, 3],
		'B': [4, 15, 6],
		'C': [47, 8, 19]}

	# Convert the dictionary into DataFrame
	df = pd.DataFrame(data)

	print('Before applying function: ')
	print(df)

	# applying function to each row in
	# dataframe and storing result in a new column
	df = df.apply(lambda row: replace(row))

	print('After Applying Function: ')
	# printing the new dataframe
	print(df)


if __name__ == '__main__':
	main()


Before applying function: 
   A   B   C
0  0   4  47
1  2  15   8
2  3   6  19
After Applying Function: 
      A      B      C
0  0-10   0-10  40-50
1  0-10  10-20   0-10
2  0-10   0-10  10-20


  row[i] = generate_range(item)
  row[i] = generate_range(item)
  row[i] = generate_range(item)


**Pandas Series.apply()**

In [38]:
# importing pandas as pd 
import pandas as pd 
  
# Creating the Series 
sr = pd.Series([11, 21, 8, 18, 65, 18, 32, 10, 5, 32, None]) 
# Create the Index 
# apply yearly frequency 
index_ = pd.date_range('2010-10-09 08:45', periods = 11, freq ='Y')
sr.index=index_
result = sr.apply(lambda x: True if x>30 else False)
result

2010-12-31 08:45:00    False
2011-12-31 08:45:00    False
2012-12-31 08:45:00    False
2013-12-31 08:45:00    False
2014-12-31 08:45:00     True
2015-12-31 08:45:00    False
2016-12-31 08:45:00     True
2017-12-31 08:45:00    False
2018-12-31 08:45:00    False
2019-12-31 08:45:00     True
2020-12-31 08:45:00    False
Freq: A-DEC, dtype: bool

**Pandas aggregate function**

In [39]:
df = pd.read_csv("E:/datasets/nba.csv")
df.head()

Unnamed: 0,id,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,1,Avery Bradley,Boston Celtics,0,PG,25,06-Feb,180,Texas,7730337.0
1,2,Jae Crowder,Boston Celtics,99,SF,25,06-Jun,235,Marquette,6796117.0
2,3,John Holland,Boston Celtics,30,SG,27,06-May,205,Boston University,
3,4,R.J. Hunter,Boston Celtics,28,SG,22,06-May,185,Georgia State,1148640.0
4,5,Jonas Jerebko,Boston Celtics,8,PF,29,06-Oct,231,,5000000.0


In [48]:
df.aggregate({"Number":["sum","min","max"]})

Unnamed: 0,Number
sum,8079
min,0
max,99


**Pandas Series.mad() to calculate Mean Absolute Deviation of a Series**

**![image.png](attachment:75b4cbda-2b15-400b-805e-b472c3cbf08a.png)**

Calculating Mean of series mean = (5+12+1+0+4+22+15+3+9) / 9 = 7.8888


MAD = | (5-7.88)+(12-7.88)+(1-7.88)+(0-7.88)+(4-7.88)+(22-7.88)+(15-7.88)+(3-7.88)+(9-7.88)) | / 9.
00

MAD = (2.88 + 4.12 + 6.88 + 7.88 + 3.88 + 14.12 + 7.12 + 4.88 + 1.12) / 
9.00

MAD = 5.8755 (More accurately = 5.876543209876543)

**Pandas Series.value_counts()**

In [51]:
## importing pandas as pd 
import pandas as pd 
  
# Creating the Series 
sr = pd.Series(['New York', 'Chicago', 'Toronto', 'Lisbon', 'Rio', 'Chicago', 'Lisbon']) 

sr.value_counts()

Chicago     2
Lisbon      2
New York    1
Toronto     1
Rio         1
Name: count, dtype: int64

**Applying Lambda functions to Pandas Dataframe**

In [6]:
keys = ['name', 'age', 'city']
values = ['Alice', 25, 'New York']

combined_dict = dict(zip(keys, values))

print("Combined dictionary:", combined_dict)
# Output: Combined dictionary: {'name': 'Alice', 'age': 25, 'city': 'New York'}


Combined dictionary: {'name': 'Alice', 'age': 25, 'city': 'New York'}


In [7]:
scores = {"Alice": 95, "Bob": 90, "Charlie": 78, "David": 88}
sorted_scores = dict(sorted(scores.items(), key=lambda x: x[1], reverse=True))

print("Sorted scores:", sorted_scores)
# Output: Sorted scores: {'Alice': 95, 'Bob': 90, 'David': 88, 'Charlie': 78}


Sorted scores: {'Alice': 95, 'Bob': 90, 'David': 88, 'Charlie': 78}


In [14]:
dict(sorted(scores.items(),key = lambda x: x[1], reverse=True))

{'Alice': 95, 'Bob': 90, 'David': 88, 'Charlie': 78}