# Broadcasting

Broadcasting simplifies mathematical operations on arrays with different shapes. It enables NumPy to efficiently apply operations element-wise without explicitly copying or reshaping data.

It automatically adjusts the smaller array to match the shape of the larger array by (conceptually) replicating its values along the necessary dimensions. This reduces memory usage and eliminates the need for manual loops, making code more concise and computationally faster, which is essential for handling large datasets and performing complex calculations in Python.


In [3]:
import numpy as np

In [9]:
# Despite its name, array_1d is a 2-D array with shape (2, 3).
array_1d = np.array([
    [2, 34, 54],
    [1, 22, 33],
])

# The scalar 10 is broadcast as if it were [[10, 10, 10], [10, 10, 10]],
# and the multiplication is then applied element-wise.
array_1d * 10


array([[ 20, 340, 540],
       [ 10, 220, 330]])

In [11]:
# A 2-D array of shape (2, 3) and a plain Python scalar.
array_2d = np.array([
    [1, 2, 3],
    [4, 5, 6],
])
scalar = 10

# The scalar is broadcast across every element of array_2d before adding.
result = array_2d + scalar
print(result)

[[11 12 13]
 [14 15 16]]


![image.png](attachment:3b054f43-92c9-44e2-9a0e-868e95786e37.png)

In [15]:
# Padding on the left means prepending 1s to the smaller shape: the scalar's shape () is treated as (1, 1), which is then stretched to (2, 3).

In [17]:
# Broadcasting a 1-D array against a 2-D array.
a2 = np.array([
    [1, 3, 5],
    [7, 9, 11],
])                          # shape (2, 3)
a1 = np.array([2, 4, 6])    # shape (3,)

# a1 is treated as [[2, 4, 6], [2, 4, 6]] so that it lines up with each row of a2.
res = a1 + a2
print(res)

[[ 3  7 11]
 [ 9 13 17]]


In [21]:
# Shapes (3,) and (2,) are incompatible: the trailing dimensions differ and
# neither of them is 1, so NumPy cannot broadcast one array onto the other.
a = np.array([1, 2, 3])
b = np.array([1, 2])

# The error is the point of this cell, so catch and display it; an uncaught
# exception would stop "Restart Kernel -> Run All" at this cell.
try:
    a + b
except ValueError as err:
    # Keep the message as a string: `err` is unbound once the except block ends.
    broadcast_error = str(err)
    print(f"ValueError: {broadcast_error}")

ValueError: operands could not be broadcast together with shapes (3,) (2,) 

In [5]:
# x is a column of shape (3, 1); y is a 1-D array of shape (3,).
y = np.array([4, 5, 6])
x = np.array([
    [1],
    [2],
    [3],
])

In [7]:
x.shape

(3, 1)

In [17]:
y.shape  # 1-D array with 3 elements; a 2-D row vector would have shape (1, 3) instead

(3,)

In [19]:
y

array([4, 5, 6])

In [21]:
x

array([[1],
       [2],
       [3]])

In [31]:
# np.broadcast returns an object describing how x and y broadcast against
# each other; displaying it shows only the broadcast object itself.
b = np.broadcast(x, y)
b

<numpy.broadcast at 0x183292f7900>

In [35]:
out = np.empty(b.shape)
out

array([[6.23042070e-307, 4.67296746e-307, 1.69121096e-306],
       [4.22787460e-307, 2.67019185e-306, 1.42413555e-306],
       [1.78019082e-306, 1.37959740e-306, 2.29178686e-312]])

In [37]:
# b is a one-shot iterator: once it has been consumed, iterating again yields
# nothing. Resetting it first makes this cell safe to re-run.
b.reset()

# Each (u, v) pair is one element of x matched with one element of y; the
# sums are written into out through its flat (1-D) view.
out.flat = [u + v for (u, v) in b]
out

array([[5., 6., 7.],
       [6., 7., 8.],
       [7., 8., 9.]])

# List Comprehension

In [44]:
lst = [1, 2, 3, 4]
# Note: lst * 10 would repeat the list ten times rather than multiply each
# element, so the scaled values are collected with an explicit loop instead.
new = []

for value in lst:
    new += [value * 10]

new

[10, 20, 30, 40]

In [46]:
lst = [1, 2, 3, 4]
# Note: lst * 10 would repeat the list rather than scale its elements.
new = []

# Overwrite every element of lst in place with ten times its value.
for position in range(len(lst)):
    lst[position] *= 10

lst

[10, 20, 30, 40]

In [52]:
[ a*10   for a in range(10) ] # the comprehension directly produces a list holding the computed values

[0, 10, 20, 30, 40, 50, 60, 70, 80, 90]

In [54]:
[ a*10   for a in range(10)  if a%2 == 0]   # a filter-only `if` (no `else`) is written AFTER the `for` clause

[0, 20, 40, 60, 80]

In [58]:
[ a*10  if a%2 == 0 else a*3 for a in range(10)  ] # an `if ... else` expression is written BEFORE the `for` clause, because it chooses the value for each item

[0, 3, 20, 9, 40, 15, 60, 21, 80, 27]

# Dictionary Comprehension

In [13]:
{a: a**2  for a in range(10)}

{0: 0, 1: 1, 2: 4, 3: 9, 4: 16, 5: 25, 6: 36, 7: 49, 8: 64, 9: 81}

# Pandas

## Pandas Data Structures

- Pandas Series: used when dealing with single-column data
- Pandas DataFrame: used when the data has multiple columns
             

# Create Series 

In [17]:
import pandas as pd

In [19]:
# A Series is displayed like a single column, with its index shown on the left.
s1 = pd.Series([12, 13, 145, 53, 43, 23, 65, 64, 43])
s1

0     12
1     13
2    145
3     53
4     43
5     23
6     65
7     64
8     43
dtype: int64

In [21]:
s1.index

RangeIndex(start=0, stop=9, step=1)

In [23]:
s1.values

array([ 12,  13, 145,  53,  43,  23,  65,  64,  43], dtype=int64)

In [25]:
type(s1)

pandas.core.series.Series

In [27]:
s1.dtype

dtype('int64')

## Customize Indices: Labels

Assigning to `s1.index` overwrites the default integer index with custom labels.

In [30]:
s1.index = ['Apples','Eggs','Bread','Banana','Mango','Grapes','Peech','Lemon','Tomato']

In [32]:
s1

Apples     12
Eggs       13
Bread     145
Banana     53
Mango      43
Grapes     23
Peech      65
Lemon      64
Tomato     43
dtype: int64

In [33]:
s1.name = "Fruits"

In [36]:
s1

Apples     12
Eggs       13
Bread     145
Banana     53
Mango      43
Grapes     23
Peech      65
Lemon      64
Tomato     43
Name: Fruits, dtype: int64

## Retrieving Data from Series

In [38]:
s1['Grapes'] # indexing

23

In [41]:
s1.loc["Eggs":"Grapes"]  # label-based slicing with .loc: unlike positional slicing, the end label is included

Eggs       13
Bread     145
Banana     53
Mango      43
Grapes     23
Name: Fruits, dtype: int64

In [42]:
s1.iloc[1:5]  # position-based slicing with .iloc: the stop position is excluded, so 'Grapes' (position 5) is not included

Eggs       13
Bread     145
Banana     53
Mango      43
Name: Fruits, dtype: int64

In [45]:
s1.shape  # vector

(9,)

# DataFrame

In [48]:
df = pd.DataFrame({"name":["Ali","Usman","Raza","Shani"]})

In [50]:
df

Unnamed: 0,name
0,Ali
1,Usman
2,Raza
3,Shani


In [51]:
df.shape

(4, 1)

In [53]:
# Build the frame column by column: each dict key becomes a column name.
df = pd.DataFrame(
    {
        "name": ["Ali", "Usman", "Raza", "Shani"],
        "age": [18, 26, 23, 42],
    }
)

In [56]:
df

Unnamed: 0,name,age
0,Ali,18
1,Usman,26
2,Raza,23
3,Shani,42


In [57]:
df.shape

(4, 2)

## Adding new column from outside

In [61]:
df["Status"] = ["Live","Dead","Live","Live"]

In [62]:
df

Unnamed: 0,name,age,Status
0,Ali,18,Live
1,Usman,26,Dead
2,Raza,23,Live
3,Shani,42,Live


In [63]:
df["Status"] = ["0","0","1","0"]  # overwrites the column

In [65]:
df

Unnamed: 0,name,age,Status
0,Ali,18,0
1,Usman,26,0
2,Raza,23,1
3,Shani,42,0


In [68]:
df["Status"] = ["Live","Dead","Live","Live"]

In [70]:
df["Gender"] = ["Male","Female","Male","Female"]

In [72]:
df

Unnamed: 0,name,age,Status,Gender
0,Ali,18,Live,Male
1,Usman,26,Dead,Female
2,Raza,23,Live,Male
3,Shani,42,Live,Female


## Deleting column

In [76]:
# drop(..., errors="ignore") returns a new frame without the column and does
# not fail if the column is already gone, so this cell can be re-run safely.
# (`del df["Status"]` also removes the column, but in place, and it raises
# KeyError when the cell is run a second time.)
df = df.drop(columns=["Status"], errors="ignore")

In [78]:
df

Unnamed: 0,name,age,Gender
0,Ali,18,Male
1,Usman,26,Female
2,Raza,23,Male
3,Shani,42,Female


# Read Data

In [81]:
data = pd.read_excel("employee.xlsx")

In [82]:
data

Unnamed: 0,Serial,Emp_ID,Designation,Department,Age,Salary
0,1,1101,Manager,Accounts,50.0,200000.0
1,2,1107,Officer,IT,30.0,80000.0
2,3,1203,Officer,HR,28.0,
3,4,1005,Manager,HR,45.0,120000.0
4,5,2123,Office Boy,Accounts,27.0,45000.0
5,6,2451,Accountant,,34.0,100000.0
6,7,1111,Accountant,Accounts,,110000.0
7,8,1001,Officer,IT,25.0,75000.0
8,9,1234,Manager,IT,23.0,
9,10,2156,Engineer,Production,45.0,89000.0


In [83]:
data.shape

(18, 6)

In [84]:
# The default integer index duplicates the Serial column, so re-read the file using Serial as the index instead.

In [85]:
data = pd.read_excel("employee.xlsx", index_col = "Serial")

In [86]:
data

Unnamed: 0_level_0,Emp_ID,Designation,Department,Age,Salary
Serial,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1101,Manager,Accounts,50.0,200000.0
2,1107,Officer,IT,30.0,80000.0
3,1203,Officer,HR,28.0,
4,1005,Manager,HR,45.0,120000.0
5,2123,Office Boy,Accounts,27.0,45000.0
6,2451,Accountant,,34.0,100000.0
7,1111,Accountant,Accounts,,110000.0
8,1001,Officer,IT,25.0,75000.0
9,1234,Manager,IT,23.0,
10,2156,Engineer,Production,45.0,89000.0


## Accessing a single column from data

In [88]:
data['Designation']

Serial
1        Manager
2        Officer
3        Officer
4        Manager
5     Office Boy
6     Accountant
7     Accountant
8        Officer
9        Manager
10      Engineer
11       Officer
12    Accountant
13      Engineer
14       Officer
15       Officer
16       Officer
17      Engineer
18    Office Boy
Name: Designation, dtype: object

In [89]:
type(data['Designation'])  # type is series

pandas.core.series.Series

## Accessing Multiple Columns

In [92]:
data.columns

Index(['Emp_ID', 'Designation', 'Department', 'Age', 'Salary'], dtype='object')

In [94]:
# data["Emp_ID", "Salary"] passes the tuple ("Emp_ID", "Salary") as a single
# column label, which does not exist, so pandas raises KeyError. The error is
# the point of this cell, so catch and display it instead of halting a full run.
try:
    data["Emp_ID", "Salary"]
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: ('Emp_ID', 'Salary')

In [None]:
data[["Emp_ID","Salary"]] # works: passing a list of column labels (note the double brackets) selects multiple columns

## Generate Data from data

In [108]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18 entries, 1 to 18
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Emp_ID       18 non-null     int64  
 1   Designation  18 non-null     object 
 2   Department   16 non-null     object 
 3   Age          17 non-null     float64
 4   Salary       15 non-null     float64
dtypes: float64(2), int64(1), object(2)
memory usage: 864.0+ bytes


In [110]:
data.shape

(18, 5)

In [112]:
data.head()

Unnamed: 0_level_0,Emp_ID,Designation,Department,Age,Salary
Serial,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1101,Manager,Accounts,50.0,200000.0
2,1107,Officer,IT,30.0,80000.0
3,1203,Officer,HR,28.0,
4,1005,Manager,HR,45.0,120000.0
5,2123,Office Boy,Accounts,27.0,45000.0


In [114]:
data.tail()

Unnamed: 0_level_0,Emp_ID,Designation,Department,Age,Salary
Serial,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
14,1098,Officer,IT,25.0,100000.0
15,2912,Officer,IT,31.0,90000.0
16,1222,Officer,,25.0,
17,2156,Engineer,Production,45.0,89000.0
18,2123,Office Boy,Accounts,27.0,45000.0


In [115]:
data.sample(5)

Unnamed: 0_level_0,Emp_ID,Designation,Department,Age,Salary
Serial,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
18,2123,Office Boy,Accounts,27.0,45000.0
16,1222,Officer,,25.0,
4,1005,Manager,HR,45.0,120000.0
1,1101,Manager,Accounts,50.0,200000.0
17,2156,Engineer,Production,45.0,89000.0


In [116]:
data.Salary # the column name contains no spaces, so it can be accessed as an attribute like this as well

Serial
1     200000.0
2      80000.0
3          NaN
4     120000.0
5      45000.0
6     100000.0
7     110000.0
8      75000.0
9          NaN
10     89000.0
11    100000.0
12    123000.0
13     89000.0
14    100000.0
15     90000.0
16         NaN
17     89000.0
18     45000.0
Name: Salary, dtype: float64

In [118]:
data.Salary + data.Salary * 0.10   # won't update in data

Serial
1     220000.0
2      88000.0
3          NaN
4     132000.0
5      49500.0
6     110000.0
7     121000.0
8      82500.0
9          NaN
10     97900.0
11    110000.0
12    135300.0
13     97900.0
14    110000.0
15     99000.0
16         NaN
17     97900.0
18     49500.0
Name: Salary, dtype: float64

In [119]:
data['Salary'] + data['Salary'] * 0.10   #

Serial
1     220000.0
2      88000.0
3          NaN
4     132000.0
5      49500.0
6     110000.0
7     121000.0
8      82500.0
9          NaN
10     97900.0
11    110000.0
12    135300.0
13     97900.0
14    110000.0
15     99000.0
16         NaN
17     97900.0
18     49500.0
Name: Salary, dtype: float64

In [120]:
# Write the 10% raise back into the frame.
# NOTE: this overwrites the Salary column, so every re-run of this cell
# compounds the raise by another 10%.
data["Salary"] = data["Salary"] + data["Salary"] * 0.10

In [123]:
data

Unnamed: 0_level_0,Emp_ID,Designation,Department,Age,Salary
Serial,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1101,Manager,Accounts,50.0,220000.0
2,1107,Officer,IT,30.0,88000.0
3,1203,Officer,HR,28.0,
4,1005,Manager,HR,45.0,132000.0
5,2123,Office Boy,Accounts,27.0,49500.0
6,2451,Accountant,,34.0,110000.0
7,1111,Accountant,Accounts,,121000.0
8,1001,Officer,IT,25.0,82500.0
9,1234,Manager,IT,23.0,
10,2156,Engineer,Production,45.0,97900.0


In [124]:
# Tiered calculation per salary: 10% at 200,000 and above, 15% from 150,000,
# 5% from 100,000; any other value (including NaN, for which every comparison
# is False) is passed through unchanged.
# NOTE(review): the final branch yields the full salary rather than a
# percentage of it, so the list mixes two kinds of values - confirm intent.
[
    pay * 0.10 if pay >= 200000
    else pay * 0.15 if pay >= 150000
    else pay * 0.05 if pay >= 100000
    else pay
    for pay in data.Salary
]

[22000.0,
 88000.0,
 nan,
 6600.0,
 49500.0,
 5500.0,
 6050.0,
 82500.0,
 nan,
 97900.0,
 5500.0,
 6765.0,
 97900.0,
 5500.0,
 99000.0,
 nan,
 97900.0,
 49500.0]