<a href="https://colab.research.google.com/github/RakshithSuresh/A-B-testing-projects/blob/main/Numpy_Learnings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PHASE 1 — Foundations (Core Building Blocks)

In [1]:
import numpy as np

print(np.__version__)

2.0.2


In [2]:
# 🔹 Creating a Simple Array
import numpy as np

data = [10, 20, 30, 40]
arr = np.array(data)

print(arr)          # [10 20 30 40]
print(type(arr))    # <class 'numpy.ndarray'>

[10 20 30 40]
<class 'numpy.ndarray'>


In [3]:
import numpy as np
import time

# Create large dataset
numbers = list(range(1_000_000))
array = np.array(numbers)

# Python list
start = time.time()
[x * 2 for x in numbers]
print("Python list time:", time.time() - start)

# NumPy array
start = time.time()
array * 2
print("NumPy array time:", time.time() - start)


Python list time: 0.0774080753326416
NumPy array time: 0.006215810775756836


In [4]:
# converting list to array
data = [1,2,3,4]
arr = np.array(data)
arr

array([1, 2, 3, 4])

In [5]:
# multiply by 3
print(arr*3)

[ 3  6  9 12]


In [6]:
# Create a NumPy array from [2, 4, 6, 8] and compute its square.
arr = np.array([2,4,6,8])
print(arr*arr)

[ 4 16 36 64]


In [7]:
# Given arr = np.array([3, 6, 9]), double every element without using a loop.
arr = np.array([3,6,9])
print(arr*2)

[ 6 12 18]


In [8]:
# You’re given a list of weekly deliveries completed:
# deliveries = [120, 135, 128, 150, 160]

# Tasks:

# Convert this list into a NumPy array.

# Compute the total deliveries, average per week, and week-over-week growth using np.diff().

# (Bonus) Calculate the % growth between each week.

deliveries = [120, 135, 128, 150, 160]
arr = np.array(deliveries)
print(arr)
print()
print(np.sum(deliveries))
print()
print(np.mean(deliveries))
print()
print(np.diff(deliveries))
print()
print(np.diff(deliveries)/deliveries[:-1]*100)

[120 135 128 150 160]

693

138.6

[15 -7 22 10]

[12.5        -5.18518519 17.1875      6.66666667]


In [9]:
# Topic 2: Creating Arrays

# 🧱 1. Creating Arrays from Python Lists

# The simplest way — converting an existing list or list of lists.

import numpy as np

arr1 = np.array([10, 20, 30])              # 1D array
arr2 = np.array([[1, 2, 3], [4, 5, 6]])    # 2D array

print(arr1)
print(arr2)


[10 20 30]
[[1 2 3]
 [4 5 6]]


In [10]:
# 🧮 2. Creating Arrays Using Range Functions : To simulate time intervals, sample data, or create ID ranges automatically.
# 🔹 np.arange(start, stop, step)

# Like Python’s range(), but returns a NumPy array.

print(np.arange(0, 10, 2))


[0 2 4 6 8]


In [11]:
# 🔹 np.linspace(start, stop, num): Generates evenly spaced values — very useful for sampling or charting.

print(np.linspace(0, 1, 5))
# basically it will divide 1 into 5 parts and seperate from 0


[0.   0.25 0.5  0.75 1.  ]


In [12]:
# 🧊 3. Creating Arrays with Constant Values : To initialize placeholder data, set baseline metrics, or fill missing values.
# 🔹 All zeros

np.zeros((2, 3))


array([[0., 0., 0.],
       [0., 0., 0.]])

In [13]:
# 🔹 All ones

np.ones((3, 2))


array([[1., 1.],
       [1., 1.],
       [1., 1.]])

In [14]:
# 🔹 Custom fill value

np.full((2, 3), 7)

array([[7, 7, 7],
       [7, 7, 7]])

In [15]:
# 🔢 4. Identity and Diagonal Arrays : Useful in linear algebra (correlation matrices, covariance, etc.).
# 🔹 Identity matrix (square)

np.eye(4)


array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [16]:
# Diagonal matrix

np.diag([1, 2, 3])


array([[1, 0, 0],
       [0, 2, 0],
       [0, 0, 3]])

In [17]:
# Create a 1D array from 5 to 25 (inclusive) with a step of 5.
np.arange(5,26,5)

array([ 5, 10, 15, 20, 25])

In [18]:
# Create a 3×3 array of random floats between 0 and 1 and compute its mean.
np.random.rand(3,3)

array([[0.90109844, 0.89532523, 0.57326562],
       [0.4374499 , 0.64807359, 0.04370566],
       [0.30905405, 0.71748652, 0.60834101]])

In [19]:
# create a 10*10 array of integer values between 0 and 100 and compute its mean
testing = np.random.randint(0,100,(10,10))
print(testing)
print()
print(np.mean(testing))

[[22 46 86 34 89 98 22 46 30 62]
 [95 84 46 59 16 89 20  7  4 70]
 [66 43 26  6 70 97  3 71 34 79]
 [18 27 88 85 87 27 59 83 45 12]
 [88 20 29 37 10 30 21 63 59 16]
 [54 85 36 92 54 14 34  3  3 56]
 [41  1 18  4 56 73 76 23 76 77]
 [86 40 36 15 42 42 40 94 89 34]
 [65 77 15 36 19 17 93 34 71 49]
 [48 40 32  2  1 83  0 65 12 90]]

46.67


In [20]:
# Generate a 5×5 diagonal matrix with diagonal values [10, 20, 30, 40, 50].
np.diag([10,20,30,40,50])

array([[10,  0,  0,  0,  0],
       [ 0, 20,  0,  0,  0],
       [ 0,  0, 30,  0,  0],
       [ 0,  0,  0, 40,  0],
       [ 0,  0,  0,  0, 50]])

In [21]:
# mini challenge
days = np.array([1, 2, 3, 4, 5, 6, 7])
np.random.seed(40) # to generate same random numbers
orders = np.random.randint(50,100,size = 7) # generating 7 random order sizes for all the 7 days
print(days)
print(orders)
total_orders = np.sum(orders)
print(f"total orders in the week: {total_orders}")
print()


avgPerDay = np.mean(orders)
print(f"avg order in a day: {avgPerDay}")
print()

dayWithMaxOrders = np.argmax(orders) + 1
print(f"day with max orders: {dayWithMaxOrders}")
print()


[1 2 3 4 5 6 7]
[56 77 57 87 51 62 57]
total orders in the week: 447

avg order in a day: 63.857142857142854

day with max orders: 4



In [22]:
arr_3d = np.array([
    [[1, 2, 3],
     [4, 5, 6]],

    [[7, 8, 9],
     [10, 11, 12]]
])

arr_3d

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

# Reshaping Array

In [23]:
arr = np.array([1,2,3,4,5,6,7,8,9,10,11,12])
print(f"shape of array is {arr.shape}")
print()
print(f"dimension of array is {arr.ndim}")
print()
print(arr)

shape of array is (12,)

dimension of array is 1

[ 1  2  3  4  5  6  7  8  9 10 11 12]


In [24]:
# reshaping the above 1d array to 2d array
arr_2d = arr.reshape(4,3)
print(arr_2d)

[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]


In [25]:
# you can also use -1 to reshape dynamically like below,
arr.reshape(4,-1)

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [26]:
# reshaping the above 1d array to 3d array
arr_3d = arr.reshape(3,1,4)

# 3 → means 3 blocks
# 1 → means each block has 1 row
# 4 → means each row has 4 columns

print(arr_3d)

[[[ 1  2  3  4]]

 [[ 5  6  7  8]]

 [[ 9 10 11 12]]]


In [27]:
# flatening 3d to 2d and then to 1d
arr_3d_to_2d = arr_3d.reshape(-1,4)
print(arr_3d_to_2d)

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]


In [28]:
# flatening 3d to 1d using flatten()
# flatten creates a copy of the array and its slow and memory consuming and the original array value doesn't change if you change the flattened values
flattened = arr_3d.flatten()
print(flattened)
print()
flattened[0] = 10
print(flattened)
print()
print(arr_3d)

[ 1  2  3  4  5  6  7  8  9 10 11 12]

[10  2  3  4  5  6  7  8  9 10 11 12]

[[[ 1  2  3  4]]

 [[ 5  6  7  8]]

 [[ 9 10 11 12]]]


In [29]:
# using ravel()

raveled = arr_3d.ravel()
print(raveled)
print()
raveled[0] = 10000
print(raveled)
print()
print(arr_3d)

# see the difference

[ 1  2  3  4  5  6  7  8  9 10 11 12]

[10000     2     3     4     5     6     7     8     9    10    11    12]

[[[10000     2     3     4]]

 [[    5     6     7     8]]

 [[    9    10    11    12]]]


# Array Indexing & Slicing in NumPy

In [30]:
# syntax- array[start : stop : step]

# working with 1D array
arr = np.array([1,2,3,4,5,6,7,8,9,10,11,12])

# printing array
print(arr)

# printing 2nd element
print(f"second element in the array is {arr[2]}")

# printing first 3 elements
print(f"first 3 elements in the array are {arr[:3]}")

# printing last 3 elements
print(f"last 3 elements in the array are {arr[-3:]}")

# printing Middle range
print(f"middle elements are {arr[3:8]}")

# printing every other element
print(f"every other element is {arr[::2]}")


[ 1  2  3  4  5  6  7  8  9 10 11 12]
second element in the array is 3
first 3 elements in the array are [1 2 3]
last 3 elements in the array are [10 11 12]
middle elements are [4 5 6 7 8]
every other element is [ 1  3  5  7  9 11]


**working with 2d array**

In [31]:

arr2d = np.array([
    [10, 20, 30],
    [40, 50, 60],
    [70, 80, 90]
])

# printing 2d array
print(arr2d)
print()

# accesing single element: 60
print(arr2d[1,2])
print()

# Access entire row
print(arr2d[2]) # last row
print()

# Access entire column
print(arr2d[:,0]) #first column
print()

# Slice multiple rows & columns, get 50,60,80,90
print(arr2d[1:3,1:3])
print()

# Non-contiguous rows/columns
print(arr2d[[0, 2], [1, 2]])

[[10 20 30]
 [40 50 60]
 [70 80 90]]

60

[70 80 90]

[10 40 70]

[[50 60]
 [80 90]]

[20 90]


**Slicing in 3D Arrays**

In [32]:
arr3d = np.array([
  [[1, 2, 3], [4, 5, 6]],
  [[7, 8, 9], [10, 11, 12]]
])

print(arr3d)
print()

print(arr3d.shape)
print()

# Access single block
print(arr3d[0])
print()

# Access single value(9)
print(arr3d[1,0,2])
print()

# Slice across blocks
print(arr3d[:,1,:])

print()

# 🎯 4️⃣ Boolean (Conditional) Slicing

# You can filter values using conditions — just like SQL WHERE.

arr = np.array([10, 15, 20, 25, 30])
arr[arr > 20]

print()

# ⚙️ 6️⃣ Mixing Slicing and Indexing
arr2d = np.array([
  [10, 20, 30, 40],
  [50, 60, 70, 80],
  [90, 100, 110, 120]
])

arr2d[0:2, [1,3]]




[[[ 1  2  3]
  [ 4  5  6]]

 [[ 7  8  9]
  [10 11 12]]]

(2, 2, 3)

[[1 2 3]
 [4 5 6]]

9

[[ 4  5  6]
 [10 11 12]]




array([[20, 40],
       [60, 80]])

# 🧩 Topic 4: Boolean Masking & Conditional Filtering

Boolean masking means using conditions (like >, <, ==) to filter or modify parts of an array.


In [37]:
arr = np.array([10, 20, 30, 40, 50])

mask = arr > 25


print(mask)
print()
arr[mask]


[False False  True  True  True]



array([30, 40, 50])

| Operation          | Example                   | Result                           |                                 |
| ------------------ | ------------------------- | -------------------------------- | ------------------------------- |
| Greater than       | `arr > 25`                | `[False False True True True]`   |                                 |
| Less than          | `arr < 40`                | `[True True True False False]`   |                                 |
| Equal to           | `arr == 30`               | `[False False True False False]` |                                 |
| Not equal          | `arr != 10`               | `[False True True True True]`    |                                 |
| Combine conditions | `(arr > 20) & (arr < 50)` | `[False False True True False]`  |                                 |
| OR condition       | `(arr < 15)               | (arr > 45)`                      | `[True False False False True]` |


In [38]:
# 🧠 Filtering with Boolean Masks
arr = np.array([100, 120, 80, 150, 60])
filtered = arr[arr >= 100]
print(filtered)

[100 120 150]


In [39]:
# 🧩 Replacing Values with Conditions

# You can modify only the values that meet a condition.

arr = np.array([5, 10, 15, 20, 25])
arr[arr < 10] = 0
print(arr)

[ 0 10 15 20 25]


In [40]:
# ⚙️ Using np.where() (Ternary Logic)

# np.where(condition, value_if_true, value_if_false)

arr = np.array([100, 200, 300, 400])
result = np.where(arr > 250, "High", "Low")
print(result)


['Low' 'Low' 'High' 'High']


In [41]:
# 🧠 Combining Multiple Conditions
arr = np.array([100, 200, 300, 400, 500])
arr[(arr > 150) & (arr < 450)]

array([200, 300, 400])

In [57]:
# Create an array [12, 5, 8, 18, 22, 7]

arr = np.array([12, 5, 8, 18, 22, 7])
print(arr)
print()

# Extract all numbers > 10
print(arr[arr>10])
print()


# Using np.where(), label numbers as "Even" or "Odd".
print(np.where(arr%2 == 0, 'Even','Odd'))
print()

# Replace numbers < 10 with 0
arr[arr<10] = 0
print(arr)
print()


# For a 2D array of shape (3,4) with random integers (1–100), extract all values > 50.
arr2d = np.random.randint(1,100,(3,4))
print(arr2d[arr2d>50])
print()

# Combine two conditions: keep values between 20 and 80.
# syntax is usually array[(1st condition) & (2nd condition)]
print(arr2d[(arr2d>20) & (arr2d<80)])


[12  5  8 18 22  7]

[12 18 22]

['Even' 'Odd' 'Even' 'Even' 'Even' 'Odd']

[12  0  0 18 22  0]

[79 72 57 78 87 85 83 92 91 74]

[79 72 57 78 74 28]


# 🧩 Handling Missing or Invalid Data in NumPy

**📘 Concept Explanation**

🔹 Missing / Invalid Data in NumPy

In NumPy, missing or invalid data is usually represented as:

| Type     | Representation        | Meaning                                       |
| -------- | --------------------- | --------------------------------------------- |
| **NaN**  | `np.nan`              | Not a Number (missing numeric value)          |
| **Inf**  | `np.inf` or `-np.inf` | Infinity (overflow or division by zero)       |
| **None** | Python’s null object  | Only works in object arrays, not numeric ones |


NumPy treats np.nan as a float type by default — even if the data was int.

In [63]:
# 💻 1️⃣ Detecting Missing Values
import numpy as np

arr = np.array([10, np.nan, 20, np.nan, 30])
print(arr)

print()

# To detect which values are NaN:

print(np.isnan(arr))
print()
print(arr[np.isnan(arr)])

[10. nan 20. nan 30.]

[False  True False  True False]

[nan nan]


In [61]:
# 💡 2️⃣ Filtering or Counting Missing Values

# Filter only non-missing values:

arr[~np.isnan(arr)]

array([10., 20., 30.])

In [76]:
# 🧩 3️⃣ Replacing Missing Values

# Replace all NaNs with a specific number:

arr = np.array([10, np.nan, 20, np.nan, 30])
arr[np.isnan(arr)] = 0
print(arr)


print()


# replacing with mean
arr = np.array([10, np.nan, 20, np.nan, 30])
arr[np.isnan(arr)] = np.mean(arr[~np.isnan(arr)])
print(arr)

[10.  0. 20.  0. 30.]

[10. 20. 20. 20. 30.]


In [77]:
# 💡 Using np.nan_to_num()

# Quick way to replace NaN and Inf in one go:

arr = np.array([10, np.nan, np.inf, -np.inf])
np.nan_to_num(arr, nan=0, posinf=999, neginf=-999)

array([  10.,    0.,  999., -999.])

# 🧠 4️⃣ Ignoring NaN in Calculations

Regular NumPy functions (np.mean, np.sum, etc.) break if NaN is present.

Using the nan-safe functions

| Regular Function | NaN-Safe Version | Description            |
| ---------------- | ---------------- | ---------------------- |
| `np.sum()`       | `np.nansum()`    | Ignores NaN in sum     |
| `np.mean()`      | `np.nanmean()`   | Ignores NaN in mean    |
| `np.std()`       | `np.nanstd()`    | Ignores NaN in std dev |
| `np.min()`       | `np.nanmin()`    | Ignores NaN in min     |
| `np.max()`       | `np.nanmax()`    | Ignores NaN in max     |


In [90]:
arr = np.array([10, np.nan, 20, np.nan, 30])

print(np.sum(arr))
print(np.nansum(arr))

print()

print(np.mean(arr))
print(np.nanmean(arr))

print()

print(np.std(arr))
print(np.nanstd(arr))

print()


print(np.min(arr))
print(np.nanmin(arr))

print()

print(np.max(arr))
print(np.nanmax(arr))


nan
60.0

nan
20.0

nan
8.16496580927726

nan
10.0

nan
30.0


In [91]:
# ⚙️ 5️⃣ Handling Infinite Values (inf and -inf)

# Infinite values occur in:

# Division by zero

# Log of 0

# Overflow from large computations

# Example:

arr = np.array([1, 0, -2])
result = 10 / arr
print(result)



# ✅ Replace infinities:

result[np.isinf(result)] = np.nan
print(result)



# ✅ Then handle NaNs using the same methods as before.

[10. inf -5.]
[10. nan -5.]


  result = 10 / arr


# Mathematical & Statistical Functions

| Operation         | Code                    | Output |
| ----------------- | ----------------------- | ------ |
| Sum               | `arr.sum()`             | `150`  |
| Mean              | `arr.mean()`            | `30.0` |
| Min               | `arr.min()`             | `10`   |
| Max               | `arr.max()`             | `50`   |
| Range (Max - Min) | `arr.max() - arr.min()` | `40`   |


In [101]:
import numpy as np
arr = np.array([10, 20, 30, 40, 50])
print(arr)

print()

print(arr.sum())
print()

print(arr.min())
print()

print(arr.max())
print()

print(arr.max() - arr.min()) # range

[10 20 30 40 50]

150

10

50

40


# 🧮 2️⃣ Statistical Functions

| Function          | Description        | Example                         |
| ----------------- | ------------------ | ------------------------------- |
| `np.mean()`       | Average            | `np.mean(arr)` → 30.0           |
| `np.median()`     | Middle value       | `np.median(arr)` → 30.0         |
| `np.std()`        | Standard deviation | `np.std(arr)` → 14.14           |
| `np.var()`        | Variance           | `np.var(arr)` → 200.0           |
| `np.percentile()` | Percentile value   | `np.percentile(arr, 90)` → 46.0 |


In [105]:
# 🧠 4️⃣ Correlation and Covariance

# Correlation = relationship between two variables.

x = np.array([10, 20, 30, 40, 50])
y = np.array([2, 4, 6, 8, 10])

print(np.corrcoef(x, y))
print()

# Covariance:

print(np.cov(x, y))

[[1. 1.]
 [1. 1.]]

[[250.  50.]
 [ 50.  10.]]


# 💡 5️⃣ Cumulative Operations

| Function       | Description        | Example              |
| -------------- | ------------------ | -------------------- |
| `np.cumsum()`  | Cumulative sum     | `[10 30 60 100 150]` |
| `np.cumprod()` | Cumulative product | `[10 200 6000 ...]`  |


In [106]:
arr = np.array([10, 20, 30, 40])
print(np.cumsum(arr))
print(np.cumprod(arr))

[ 10  30  60 100]
[    10    200   6000 240000]


# ⚖️ 6️⃣ Rounding and Precision
| Function     | Description         | Example                         |
| ------------ | ------------------- | ------------------------------- |
| `np.round()` | Round to n decimals | `np.round(3.14159, 2)` → `3.14` |
| `np.floor()` | Round down          | `np.floor(3.9)` → `3.0`         |
| `np.ceil()`  | Round up            | `np.ceil(3.1)` → `4.0`          |


In [108]:
# Real Analyst Example: SLA Metrics

sla_times = np.array([28, 35, 42, 31, 39, 48, 30, 33, 29])
avg = np.mean(sla_times)
std_dev = np.std(sla_times)
within_sla = np.sum(sla_times <= 40)
breach_rate = (len(sla_times) - within_sla) / len(sla_times) * 100

print(f"Average Delivery: {avg:.2f}")
print(f"Std Deviation: {std_dev:.2f}")
print(f"SLA Breach Rate: {breach_rate:.2f}%")


Average Delivery: 35.00
Std Deviation: 6.36
SLA Breach Rate: 22.22%


# ⚙️ 3️⃣ Axis Parameter (Row vs Column Calculations)

When working with 2D arrays, axis determines the direction of the calculation.


| Function     | Code                    | Output          | Meaning          |
| ------------ | ----------------------- | --------------- | ---------------- |
| Column means | `np.mean(data, axis=0)` | `[40. 50. 60.]` | Down each column |
| Row means    | `np.mean(data, axis=1)` | `[20. 50. 80.]` | Across each row  |
| Column sum   | `np.sum(data, axis=0)`  | `[120 150 180]` | Total per column |
| Row sum      | `np.sum(data, axis=1)`  | `[60 150 240]`  | Total per row    |


| Operation              | axis=0         | axis=1      |
| ---------------------- | -------------- | ----------- |
| Works **down** columns | ✅              | ❌           |
| Works **across** rows  | ❌              | ✅           |
| Collapses rows         | ✅              | ❌           |
| Collapses columns      | ❌              | ✅           |
| Typical use            | Column summary | Row summary |



In [113]:
data = np.array([
  [10, 20, 30],
  [40, 50, 60],
  [70, 80, 90]
])


print(data)
print()

# row collapse will be done by axis=0
print(np.mean(data, axis=0))
print()

# column collapse will be done by axis=1
print(np.mean(data, axis=1))
print()



[[10 20 30]
 [40 50 60]
 [70 80 90]]

[40. 50. 60.]

[20. 50. 80.]



# Combining & Splitting Arrays


In analytics, you often need to:

Combine datasets (merge rows or columns)

Split large arrays into smaller chunks for separate analysis

NumPy gives you clean, high-speed functions to do this — no loops or manual slicing.

In [118]:
# 💻 1️⃣ Combining Arrays (Merging / Joining)
# 🔹 A. np.concatenate() → General-purpose join

# You can join arrays along an axis (row-wise or column-wise).

import numpy as np

a = np.array([[1, 2],
              [3, 4]])

b = np.array([[5, 6],
              [7, 8]])

print(a)
print()
print(b)
print()

# ➤ Combine vertically (add rows)
print(np.concatenate((a, b), axis=0))
print()

# ➤ Combine horizontally (add columns)
print(np.concatenate((a, b), axis=1))

# ✅ Axis=0 → stack down (rows)
# ✅ Axis=1 → stack across (columns)

[[1 2]
 [3 4]]

[[5 6]
 [7 8]]

[[1 2]
 [3 4]
 [5 6]
 [7 8]]

[[1 2 5 6]
 [3 4 7 8]]


In [124]:
# B. np.vstack() → Vertical Stack (Rows)

# Shortcut for stacking row-wise (axis=0):

print(np.vstack((a, b)))
print()

# np.hstack() → Horizontal Stack (Columns)

# Shortcut for stacking column-wise (axis=1):

print(np.hstack((a, b)))

[[1 2]
 [3 4]
 [5 6]
 [7 8]]

[[1 2 5 6]
 [3 4 7 8]]


In [132]:
# 💻 2️⃣ Splitting Arrays (Breaking Apart)
# 🔹 A. np.split() → Split along an axis
arr = np.array([
  [10, 20, 30, 40],
  [50, 60, 70, 80]
])
print(arr)
print()

# ➤ Split into two equal column parts
print(np.split(arr, 2, axis=1))
print()

# 🔹 B. Unequal splits with indices
print(np.split(arr, [1, 3], axis=1))
print()


# Horizontal split (by columns)
print(np.split(arr, 2, axis=1))
print()

# Vertical split (by rows)
print(np.split(arr, 2, axis=0))
print()

[[10 20 30 40]
 [50 60 70 80]]

[array([[10, 20],
       [50, 60]]), array([[30, 40],
       [70, 80]])]

[array([[10],
       [50]]), array([[20, 30],
       [60, 70]]), array([[40],
       [80]])]

[array([[10, 20],
       [50, 60]]), array([[30, 40],
       [70, 80]])]

[array([[10, 20, 30, 40]]), array([[50, 60, 70, 80]])]



# Broadcasting & Vectorization

It’s a top interview topic too, especially when they ask:

“Why is NumPy faster than Python lists?”

“What is broadcasting in NumPy?”

##🔹 What is Vectorization?

Vectorization means performing operations on entire arrays instead of looping through elements one by one.



In [134]:
import numpy as np
arr = np.array([1, 2, 3])
result = arr * 2
print(arr)
print()
print(result)

# 🧠 NumPy internally performs this using vectorized C operations — it never loops in Python.
# That’s why it’s 50–100x faster.

[1 2 3]

[2 4 6]


In [136]:
# Basic Example: Scalar Broadcasting
arr = np.array([10, 20, 30])
result = arr + 5
print(result)# ✅ The scalar 5 is broadcasted to all elements of the array.

[15 25 35]


In [141]:
# 2️⃣ Broadcasting with 1D and 2D Arrays
a = np.array([[1, 2, 3],
              [4, 5, 6]])
b = np.array([10, 20, 30])
print(a)
print()
print(b)
print()

print(a + b)
print()
print(a*b)

# ✅ NumPy stretches b (1×3) to match a (2×3):

[[1 2 3]
 [4 5 6]]

[10 20 30]

[[11 22 33]
 [14 25 36]]

[[ 10  40  90]
 [ 40 100 180]]


In [145]:
# Column Broadcasting (Vertical Expansion)
a = np.array([[1, 2, 3],
              [4, 5, 6],
              [7, 8, 9]])
b = np.array([[10],
              [20],
              [30]])

print(a)
print()
print(b)
print()

print(a + b)
print()
print(a*b)


[[1 2 3]
 [4 5 6]
 [7 8 9]]

[[10]
 [20]
 [30]]

[[11 12 13]
 [24 25 26]
 [37 38 39]]

[[ 10  20  30]
 [ 80 100 120]
 [210 240 270]]


# 💡 4️⃣ Rules of Broadcasting

When operating on two arrays, NumPy compares their shapes from right to left.

Two dimensions are compatible when:

They are equal, OR

One of them is 1

If not compatible → ❌ ValueError: operands could not be broadcast together

| A Shape   | B Shape   | Works? | Explanation                          |
| --------- | --------- | ------ | ------------------------------------ |
| (3, )     | (3, )     | ✅      | Same size                            |
| (3, )     | (1, )     | ✅      | 1 can broadcast                      |
| (2, 3)    | (3, )     | ✅      | 3 matches, 1 missing axis broadcasts |
| (3, 2)    | (2, 3)    | ❌      | Mismatched dimensions                |
| (4, 1, 3) | (1, 5, 3) | ✅      | 1s broadcast to 4 and 5              |


# ⚔️ Vectorization vs Broadcasting — The Real Difference
| Concept                 | **Vectorization**                                                              | **Broadcasting**                                                           |
| ----------------------- | ------------------------------------------------------------------------------ | -------------------------------------------------------------------------- |
| **What it means**       | Performing operations on *entire arrays* instead of looping element-by-element | Making arrays of *different shapes* compatible for element-wise operations |
| **Purpose**             | Speed — replace loops with optimized array math                                | Flexibility — allow operations between mismatched shapes                   |
| **When it happens**     | When both arrays are already the same shape                                    | When NumPy has to stretch (broadcast) one array to match another           |
| **Example (simple)**    | `arr * 2` — every element multiplied by 2                                      | `matrix + [1, 2, 3]` — 1D array stretched across rows                      |
| **Needs same shape?**   | ✅ Yes (same shape)                                                             | ❌ No (NumPy auto-expands)                                                  |
| **Keyword to remember** | “No loops”                                                                     | “Shape matching”                                                           |
| **Analyst use case**    | Normalization, scaling, KPI math                                               | Adding a row/column average to full dataset                                |


# File Input/Output

🟢 Loading data from a file (CSV, TXT, etc.) → into NumPy arrays

🔵 Saving arrays back to files → for reuse or sharing

NumPy supports both:

Text-based files → .txt, .csv (human-readable)

Binary files → .npy, .npz (fast and compact for internal use)

In [147]:
# 💻 1️⃣ Saving and Loading Binary Files

# This is the fastest and most accurate way to store arrays.

# 🔹 Save a single array
import numpy as np

arr = np.array([[10, 20, 30],
                [40, 50, 60]])

np.save('my_array.npy', arr)

# 🔹 Load it back
loaded = np.load('my_array.npy')
print(loaded)

[[10 20 30]
 [40 50 60]]


In [149]:
# 🔹 Save multiple arrays together
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
np.savez('multi_arrays.npz', first=a, second=b)

# Load it back
data = np.load('multi_arrays.npz')
print(data['first'])
print(data['second'])

# ✅ np.savez() stores multiple arrays in one zipped binary file.

[1 2 3]
[4 5 6]


In [151]:
# 💻 2️⃣ Working with Text Files (CSV or TXT)
# 🔹 Save as a text file
arr = np.array([[10, 20, 30],
                [40, 50, 60]])

np.savetxt('data.csv', arr, delimiter=',', fmt='%d') # ✅ Creates a readable .csv

# 🔹 Load a text file
loaded = np.loadtxt('data.csv', delimiter=',')
print(loaded) #✅ Returns float by default (even if original was int).

[[10. 20. 30.]
 [40. 50. 60.]]


In [152]:
# ⚙️ 3️⃣ Loading Complex CSVs with Missing or Mixed Data

# For text data that’s not perfectly numeric, use np.genfromtxt() — it’s smarter than loadtxt().

arr = np.genfromtxt('data.csv', delimiter=',', filling_values=0)


# ✅ Automatically replaces missing values with 0.

# 🧩 Topic 10: Integration with Pandas

NumPy = low-level numeric engine

Pandas = high-level data manipulation tool (built on top of NumPy)

That means:
➡️ Pandas uses NumPy arrays internally to store data.

➡️ Every Pandas DataFrame or Series is essentially a labeled NumPy array.

In [155]:
# 💻 1️⃣ Converting Between NumPy Arrays and Pandas
# 🔹 A. NumPy → Pandas

# You can create a Pandas DataFrame directly from a NumPy array.

import numpy as np
import pandas as pd

arr = np.array([[10, 20, 30],
                [40, 50, 60],
                [70, 80, 90]])

df = pd.DataFrame(arr, columns=['Q1', 'Q2', 'Q3'])
print(df)
print()
# 🔹 B. Pandas → NumPy

# Any DataFrame or Series can be converted back to a NumPy array using .to_numpy() or .values.

arr2 = df.to_numpy()
print(arr2)
print()

# ⚙️ 2️⃣ Selecting Columns as NumPy Arrays

# You can pull columns from a DataFrame as NumPy arrays.

df['Q1'].to_numpy()
# Output: array([10, 40, 70])

   Q1  Q2  Q3
0  10  20  30
1  40  50  60
2  70  80  90

[[10 20 30]
 [40 50 60]
 [70 80 90]]


# 💡 Common NumPy functions used on Pandas:

| Function     | Example                                  | Description                   |
| ------------ | ---------------------------------------- | ----------------------------- |
| `np.mean()`  | `np.mean(df)`                            | Column averages               |
| `np.sum()`   | `np.sum(df, axis=1)`                     | Row totals                    |
| `np.std()`   | `np.std(df)`                             | Standard deviation per column |
| `np.where()` | `np.where(df['Q1'] > 50, 'High', 'Low')` | Conditional labeling          |
| `np.isnan()` | `df[np.isnan(df['Q2'])]`                 | Detect NaN rows               |


In [165]:
# 🧠 3️⃣ Using NumPy Functions Directly on Pandas
# Because Pandas is built on NumPy, most NumPy functions work directly on DataFrames or Series.
print(df)
print()

print(np.mean(df))
print()

print(np.sum(df))
print()

print(df.sum())
print()

print(np.min(df))
print()

print(df.min())


   Q1  Q2  Q3
0  10  20  30
1  40  50  60
2  70  80  90

50.0

Q1    120
Q2    150
Q3    180
dtype: int64

Q1    120
Q2    150
Q3    180
dtype: int64

10

Q1    10
Q2    20
Q3    30
dtype: int64


  return reduction(axis=axis, out=out, **passkwargs)


# PRACTICE

In [208]:
import pandas as pd
import numpy as np

data = {
    'Region': ['North', 'South', 'East', 'West', 'Central'],
    'Q1_Sales': [120, 85, 90, 150, 110],
    'Q2_Sales': [130, 95, np.nan, 160, 115],
    'Q3_Sales': [140, 100, 120, 170, np.nan],
    'Target': [125, 90, 100, 160, 110]
}

df = pd.DataFrame(data)
print(df)


    Region  Q1_Sales  Q2_Sales  Q3_Sales  Target
0    North       120     130.0     140.0     125
1    South        85      95.0     100.0      90
2     East        90       NaN     120.0     100
3     West       150     160.0     170.0     160
4  Central       110     115.0       NaN     110


In [209]:
# 👉 Calculate the average sales per quarter (Q1, Q2, Q3) ignoring missing values.

print(np.nanmean(df[['Q1_Sales']]))
print(np.nanmean(df[['Q2_Sales']]))
print(np.nanmean(df[['Q3_Sales']]))

111.0
125.0
132.5


In [210]:
# Create a new column "Status" using np.where() such that:

# "Met Target" if average quarterly sales ≥ Target

# "Missed Target" otherwise.

# step 1: calculate quarterly avg

avg_quarterly = np.array(df[['Q1_Sales','Q2_Sales','Q3_Sales']].mean(axis=1))

# step 2: creating new column status
df['status'] = np.where(avg_quarterly >= df['Target'],"Met Target","Missed Target")

df

Unnamed: 0,Region,Q1_Sales,Q2_Sales,Q3_Sales,Target,status
0,North,120,130.0,140.0,125,Met Target
1,South,85,95.0,100.0,90,Met Target
2,East,90,,120.0,100,Met Target
3,West,150,160.0,170.0,160,Met Target
4,Central,110,115.0,,110,Met Target


In [211]:
# ⚙️ Question 3 (Intermediate): Replace Missing Values

# 👉 Replace missing quarterly sales (NaN) with the mean of that specific quarter using NumPy.

# arr[np.isnan(arr)] = np.mean(arr[~np.isnan(arr)])

df['Q1_Sales'].fillna(np.nanmean(df['Q1_Sales']),inplace = True)
df['Q2_Sales'].fillna(np.nanmean(df['Q2_Sales']),inplace = True)
df['Q3_Sales'].fillna(np.nanmean(df['Q3_Sales']),inplace = True)


df


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Q1_Sales'].fillna(np.nanmean(df['Q1_Sales']),inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Q2_Sales'].fillna(np.nanmean(df['Q2_Sales']),inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate o

Unnamed: 0,Region,Q1_Sales,Q2_Sales,Q3_Sales,Target,status
0,North,120,130.0,140.0,125,Met Target
1,South,85,95.0,100.0,90,Met Target
2,East,90,125.0,120.0,100,Met Target
3,West,150,160.0,170.0,160,Met Target
4,Central,110,115.0,132.5,110,Met Target
