### Numpy
A library for fast mathematical operations.
Used for:

matrices

ML computations

vectorization (extremely important in ML)

In [1]:
# Creating an Array
import numpy as np

a = np.array([1, 2, 3])
b = np.array([[1, 2, 3], [4, 5, 6]])

print(a)
print(b)

[1 2 3]
[[1 2 3]
 [4 5 6]]


#### Array Attributes

In [2]:
a.shape

(3,)

In [3]:
b.shape

(2, 3)

In [4]:
a.ndim

1

In [5]:
b.ndim

2

In [6]:
a.dtype

dtype('int64')

In [7]:
b.dtype

dtype('int64')

In [8]:
a.size

3

In [9]:
b.size

6

#### Array Operations 

In [10]:
a

array([1, 2, 3])

In [11]:
a + 2

array([3, 4, 5])

In [12]:
a * 2

array([2, 4, 6])

In [13]:
a ** 2

array([1, 4, 9])

In [14]:
a + a

array([2, 4, 6])

In [15]:
b

array([[1, 2, 3],
       [4, 5, 6]])

In [16]:
b + 2

array([[3, 4, 5],
       [6, 7, 8]])

In [17]:
b * 2

array([[ 2,  4,  6],
       [ 8, 10, 12]])

In [18]:
b ** 2

array([[ 1,  4,  9],
       [16, 25, 36]])

In [19]:
b + b

array([[ 2,  4,  6],
       [ 8, 10, 12]])

### Array Indexing and Slicing

In [20]:
a = np.array([10,20,30,40])
print(a)

[10 20 30 40]


In [21]:
a[1]        # 20

20

In [22]:
a[:2]       # [10,20]

array([10, 20])

In [23]:
a[1:3]      # [20,30]

array([20, 30])

### Useful Methods

In [24]:
np.zeros(2)

array([0., 0.])

In [25]:
np.zeros((2,3))

array([[0., 0., 0.],
       [0., 0., 0.]])

In [26]:
np.ones(3)

array([1., 1., 1.])

In [27]:
np.ones((3,3))

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [28]:
np.arange(0,10,2)

array([0, 2, 4, 6, 8])

In [29]:
np.linspace(1,10,10)

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.])

### Matrix Operations

In [30]:
A = np.array([[1,2],[3,4]])
B = np.array([[5,6],[7,8]])

In [31]:
A.dot(B)

array([[19, 22],
       [43, 50]])

In [32]:
A.T

array([[1, 3],
       [2, 4]])

In [33]:
np.linalg.inv(A)

array([[-2. ,  1. ],
       [ 1.5, -0.5]])

# Numpy Exercises

Task 1:
Create an array of numbers from 1 to 20.

In [34]:
np.array(np.arange(1,21))

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20])

Task 2:
Create a 3×3 matrix of random integers.

In [35]:
np.array([[1,2,3],[2,4,5],[4,5,7]])

array([[1, 2, 3],
       [2, 4, 5],
       [4, 5, 7]])

Task 3:
Given:

arr = np.array([10,20,30,40,50])


Extract first 3 elements

Extract last 2 elements

Extract 20 and 40

In [36]:
arr = np.array([10,20,30,40,50])
arr[:3]

array([10, 20, 30])

In [37]:
arr[-2:]

array([40, 50])

In [38]:
arr[1::2]

array([20, 40])

Task 4:
Compute:

mean

median

standard deviation

In [39]:
np.mean(arr)

30.0

In [40]:
np.median(arr)

30.0

In [41]:
np.std(arr)

14.142135623730951

Task 5:
Create a matrix and compute:

transpose

dot product

In [42]:
mat = np.array([[1,2,3],[4,5,6],[7,8,9]])
mat

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [43]:
mat.T

array([[1, 4, 7],
       [2, 5, 8],
       [3, 6, 9]])

In [44]:
mat.dot(mat)

array([[ 30,  36,  42],
       [ 66,  81,  96],
       [102, 126, 150]])

# Pandas Basics

What is Pandas?

Used for:

tables

data cleaning

EDA

grouping

merging datasets

handling missing data

In [45]:
#DataFrame Creation
import pandas as pd

data = {
    "Name": ["A", "B", "C"],
    "Age": [23, 25, 31],
    "City": ["NY", "LA", "TX"]
}

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,City
0,A,23,NY
1,B,25,LA
2,C,31,TX


In [46]:
#Loading the data
data = pd.read_csv('file.csv')

In [47]:
data.head()

Unnamed: 0,Name,Age,Salary,City
0,A,24,55000,NY
1,B,30,62000,LA
2,C,22,48000,TX
3,D,35,72000,NY
4,E,28,60000,LA


In [48]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    5 non-null      object
 1   Age     5 non-null      int64 
 2   Salary  5 non-null      int64 
 3   City    5 non-null      object
dtypes: int64(2), object(2)
memory usage: 288.0+ bytes


In [49]:
data.describe()

Unnamed: 0,Age,Salary
count,5.0,5.0
mean,27.8,59400.0
std,5.118594,8876.936408
min,22.0,48000.0
25%,24.0,55000.0
50%,28.0,60000.0
75%,30.0,62000.0
max,35.0,72000.0


In [50]:
#Selecting Columns
data['Age']

0    24
1    30
2    22
3    35
4    28
Name: Age, dtype: int64

In [51]:
data[['Age','Salary']]

Unnamed: 0,Age,Salary
0,24,55000
1,30,62000
2,22,48000
3,35,72000
4,28,60000


In [52]:
#Filtering Rows
data[data['Age'] > 30]

Unnamed: 0,Name,Age,Salary,City
3,D,35,72000,NY


In [53]:
data[(data['Age'] > 30) & (data['Salary'] > 50000)]

Unnamed: 0,Name,Age,Salary,City
3,D,35,72000,NY


In [54]:
#Adding & Removing Columns
data['Age_2'] = data['Age'] * 2

In [55]:
data.head()

Unnamed: 0,Name,Age,Salary,City,Age_2
0,A,24,55000,NY,48
1,B,30,62000,LA,60
2,C,22,48000,TX,44
3,D,35,72000,NY,70
4,E,28,60000,LA,56


In [56]:
data.drop('Age_2',axis=1)

Unnamed: 0,Name,Age,Salary,City
0,A,24,55000,NY
1,B,30,62000,LA
2,C,22,48000,TX
3,D,35,72000,NY
4,E,28,60000,LA


In [57]:
#Handling Missing Values
data.isnull().sum()

Name      0
Age       0
Salary    0
City      0
Age_2     0
dtype: int64

In [58]:
data.fillna(data.mean())

Unnamed: 0,Name,Age,Salary,City,Age_2
0,A,24,55000,NY,48
1,B,30,62000,LA,60
2,C,22,48000,TX,44
3,D,35,72000,NY,70
4,E,28,60000,LA,56


In [59]:
data.dropna()

Unnamed: 0,Name,Age,Salary,City,Age_2
0,A,24,55000,NY,48
1,B,30,62000,LA,60
2,C,22,48000,TX,44
3,D,35,72000,NY,70
4,E,28,60000,LA,56


In [60]:
#Groupby
data.groupby('City')['Age'].mean()

City
LA    29.0
NY    29.5
TX    22.0
Name: Age, dtype: float64

In [61]:
data.groupby('City').agg({'Age':['mean','max','min']}).T

Unnamed: 0,City,LA,NY,TX
Age,mean,29.0,29.5,22.0
Age,max,30.0,35.0,22.0
Age,min,28.0,24.0,22.0


In [62]:
#Sorting
data.sort_values('Age',ascending=False)

Unnamed: 0,Name,Age,Salary,City,Age_2
3,D,35,72000,NY,70
1,B,30,62000,LA,60
4,E,28,60000,LA,56
0,A,24,55000,NY,48
2,C,22,48000,TX,44


In [63]:
#Merging
pd.merge(data,data,on='Name',how='inner')

Unnamed: 0,Name,Age_x,Salary_x,City_x,Age_2_x,Age_y,Salary_y,City_y,Age_2_y
0,A,24,55000,NY,48,24,55000,NY,48
1,B,30,62000,LA,60,30,62000,LA,60
2,C,22,48000,TX,44,22,48000,TX,44
3,D,35,72000,NY,70,35,72000,NY,70
4,E,28,60000,LA,56,28,60000,LA,56


### Pandas-Tasks

Task 1:
Create a DataFrame with columns:

Product

Price

Category

Add 5 rows.

In [64]:
data1 = {
    "Product":['P1', 'P2', 'P3', 'P4', 'P5'],
    "Price":[30, 35, 15, 12, 20],
    "Category":['Induction', 'Dryer', 'Mixer', 'Induction', 'Dryer']
}

df = pd.DataFrame(data1)
df.head()

Unnamed: 0,Product,Price,Category
0,P1,30,Induction
1,P2,35,Dryer
2,P3,15,Mixer
3,P4,12,Induction
4,P5,20,Dryer


Task 2:
Filter all products whose price > $20.

In [65]:
df[df['Price'] > 20]

Unnamed: 0,Product,Price,Category
0,P1,30,Induction
1,P2,35,Dryer


Task 3:
Group by Category → find average price.

In [66]:
df.groupby('Category')['Price'].mean()

Category
Dryer        27.5
Induction    21.0
Mixer        15.0
Name: Price, dtype: float64

Task 4:
Add a new column “Discounted Price” with 10% discount.

In [67]:
df['Discounted Price'] = df['Price'] - df['Price'] * .10

In [68]:
df

Unnamed: 0,Product,Price,Category,Discounted Price
0,P1,30,Induction,27.0
1,P2,35,Dryer,31.5
2,P3,15,Mixer,13.5
3,P4,12,Induction,10.8
4,P5,20,Dryer,18.0


Task 5:
Create two small DataFrames and practice merge (inner + left).

In [69]:
data2 = {
    "Product":['P6', 'P7', 'P8', 'P9', 'P10'],
    "Price":[70, 45, 25, 22, 15],
    "Category":['Induction', 'Dryer', 'Mixer', 'Induction', 'Dryer']
}

df2 = pd.DataFrame(data2)
df2.head()

Unnamed: 0,Product,Price,Category
0,P6,70,Induction
1,P7,45,Dryer
2,P8,25,Mixer
3,P9,22,Induction
4,P10,15,Dryer


In [70]:
pd.merge(df,df2,on='Category',how='inner')

Unnamed: 0,Product_x,Price_x,Category,Discounted Price,Product_y,Price_y
0,P1,30,Induction,27.0,P6,70
1,P1,30,Induction,27.0,P9,22
2,P4,12,Induction,10.8,P6,70
3,P4,12,Induction,10.8,P9,22
4,P2,35,Dryer,31.5,P7,45
5,P2,35,Dryer,31.5,P10,15
6,P5,20,Dryer,18.0,P7,45
7,P5,20,Dryer,18.0,P10,15
8,P3,15,Mixer,13.5,P8,25


In [71]:
pd.merge(df,df2,on='Category',how='left')

Unnamed: 0,Product_x,Price_x,Category,Discounted Price,Product_y,Price_y
0,P1,30,Induction,27.0,P6,70
1,P1,30,Induction,27.0,P9,22
2,P2,35,Dryer,31.5,P7,45
3,P2,35,Dryer,31.5,P10,15
4,P3,15,Mixer,13.5,P8,25
5,P4,12,Induction,10.8,P6,70
6,P4,12,Induction,10.8,P9,22
7,P5,20,Dryer,18.0,P7,45
8,P5,20,Dryer,18.0,P10,15


# Mini EDA Project

In [81]:
data = data.drop('Age_2',axis=1)

Perform:

head, info, describe

max salary

min age

avg salary by city

filter salary > 60000

sort by age

add column: salary_in_lakhs = salary / 100000

In [82]:
data.head()

Unnamed: 0,Name,Age,Salary,City
0,A,24,55000,NY
1,B,30,62000,LA
2,C,22,48000,TX
3,D,35,72000,NY
4,E,28,60000,LA


In [83]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    5 non-null      object
 1   Age     5 non-null      int64 
 2   Salary  5 non-null      int64 
 3   City    5 non-null      object
dtypes: int64(2), object(2)
memory usage: 288.0+ bytes


In [84]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,5.0,27.8,5.118594,22.0,24.0,28.0,30.0,35.0
Salary,5.0,59400.0,8876.936408,48000.0,55000.0,60000.0,62000.0,72000.0


In [88]:
data['Salary'].max()

72000

In [89]:
data['Age'].min()

22

In [91]:
data.groupby('City')['Salary'].mean()

City
LA    61000
NY    63500
TX    48000
Name: Salary, dtype: int64

In [92]:
data[data['Salary'] > 60000]

Unnamed: 0,Name,Age,Salary,City
1,B,30,62000,LA
3,D,35,72000,NY


In [94]:
data.sort_values('Age')

Unnamed: 0,Name,Age,Salary,City
2,C,22,48000,TX
0,A,24,55000,NY
4,E,28,60000,LA
1,B,30,62000,LA
3,D,35,72000,NY


In [95]:
data['salary_in_lakhs'] = data['Salary'] / 100000
data.head()

Unnamed: 0,Name,Age,Salary,City,salary_in_lakhs
0,A,24,55000,NY,0.55
1,B,30,62000,LA,0.62
2,C,22,48000,TX,0.48
3,D,35,72000,NY,0.72
4,E,28,60000,LA,0.6
