<div align="center">
    <img src="Pic/TUMS-Logo.png" width="220" height="220">
    <br>
    <font color="#0F5298" size="8">
        Introduction to Machine Learning
    </font>
    <br>
    <br>
    <br>
    <font color="orange" size="5">
        Benyamin Ghanbari
    </font>
</div>


# NumPy & Pandas

In [1]:
import numpy as np
import pandas as pd

**NumPy**

In [5]:
# One-dimensional array built from a plain Python list of integers.
a = np.array([1, 2, 3, 4, 5])
print(f"1D Array: {a}")

1D Array: [1 2 3 4 5]


In [6]:
# Two-dimensional array: each inner list becomes one row (2 rows x 3 columns).
b = np.array([[1, 2, 3], [4, 5, 6]])
print("\n2D Array:", b, sep="\n")


2D Array:
[[1 2 3]
 [4 5 6]]


In [7]:
# Basic metadata of the array `b`: its dimensions, total element count and element type.
print(f"\nArray Shape: {b.shape}")
print(f"Array Size: {b.size}")
print(f"Array Data Type: {b.dtype}")



Array Shape: (2, 3)
Array Size: 6
Array Data Type: int32


In [8]:
# Arithmetic between the array `a` and a scalar is applied to every element.
print(f"\nAddition: {a + 10}")
print(f"Multiplication: {a * 2}")
print(f"Exponentiation: {np.power(a, 2)}")



Addition: [11 12 13 14 15]
Multiplication: [ 2  4  6  8 10]
Exponentiation: [ 1  4  9 16 25]


In [9]:
# Lay out the integers 0..11 as a grid of 3 rows and 4 columns.
c = np.reshape(np.arange(12), (3, 4))
print("\nReshaped Array:", c, sep="\n")



Reshaped Array:
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


In [10]:
# Indexing and slicing on `c` (all indices are zero-based).
print(f"\nSlice: {c[1, 2]}")     # single element at row index 1, column index 2
print(f"Row 2: {c[2, :]}")       # every column of the row with index 2
print(f"Column 3: {c[:, 3]}")    # every row of the column with index 3


Slice: 6
Row 2: [ 8  9 10 11]
Column 3: [ 3  7 11]


In [11]:
# Boolean indexing: the comparison `c > 5` yields a mask, and indexing `c`
# with that mask keeps only the elements where the mask is True.
print(f"\nElements > 5: {c[c > 5]}")


Elements > 5: [ 6  7  8  9 10 11]


In [12]:
# Aggregations over all elements of `c` (no axis is given, so the whole array is reduced).
print("\nSum:", np.sum(c))
print("Mean:", np.mean(c))
print("Standard Deviation:", np.std(c))


Sum: 66
Mean: 5.5
Standard Deviation: 3.452052529534663


In [13]:
# Special arrays: a 2x3 array filled with zeros.
zeros = np.zeros(shape=(2, 3))
print("\nZeros Array:", zeros, sep="\n")


Zeros Array:
[[0. 0. 0.]
 [0. 0. 0.]]


In [14]:
# Special arrays: a 3x3 array filled with ones.
ones = np.ones(shape=(3, 3))
print("\nOnes Array:", ones, sep="\n")


Ones Array:
[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]


In [15]:
# Special arrays: the 4x4 identity matrix (ones on the main diagonal, zeros elsewhere).
identity = np.eye(N=4)
print("\nIdentity Matrix:", identity, sep="\n")


Identity Matrix:
[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]


In [16]:
# Random number generation
# A seeded Generator makes this cell print the same values on every run
# (including after a kernel restart). The previous np.random.rand call drew
# from unseeded global state, so its output changed on every execution.
rng = np.random.default_rng(42)
rand_array = rng.random((2, 2))  # uniform floats in the half-open interval [0, 1)
print("\nRandom Array:")
print(rand_array)



Random Array:
[[0.95456888 0.19161479]
 [0.78425404 0.25771596]]


In [17]:
rand_ints = np.random.randint(0, 10, size=(3, 3))
print("\nRandom Integers Array:")
print(rand_ints)



Random Integers Array:
[[5 0 0]
 [3 5 9]
 [3 3 4]]


In [18]:
# Linear algebra on two 2x2 matrices.
matrix_a = np.array([[1, 2], [3, 4]])
matrix_b = np.array([[5, 6], [7, 8]])

# Matrix product (rows of A times columns of B), not an element-wise product.
matrix_mult = matrix_a @ matrix_b
print("\nMatrix Multiplication:", matrix_mult, sep="\n")

# Transpose: rows and columns of A are swapped.
matrix_transpose = matrix_a.T
print("\nTranspose of Matrix A:", matrix_transpose, sep="\n")


Matrix Multiplication:
[[19 22]
 [43 50]]

Transpose of Matrix A:
[[1 3]
 [2 4]]


In [19]:
# Descriptive statistics computed over all elements of `matrix_a`.
print("\nMin of Array A:", matrix_a.min())
print("Max of Array A:", matrix_a.max())
print("Median of Array A:", np.median(matrix_a))
print("Variance of Array A:", matrix_a.var())



Min of Array A: 1
Max of Array A: 4
Median of Array A: 2.5
Variance of Array A: 1.25


In [20]:
# Broadcasting: a length-3 row plus a 3x1 column expands to a 3x3 result.
# NOTE: this cell rebinds the names `a` and `b`; any later cell that uses
# them sees these arrays.
a = np.array([1, 2, 3])          # shape (3,)
b = np.array([[1], [2], [3]])    # shape (3, 1)
print("\nBroadcasted Addition:", a + b, sep="\n")


Broadcasted Addition:
[[2 3 4]
 [3 4 5]
 [4 5 6]]


In [22]:
# Number of dimensions (axes) of `matrix_a`.
matrix_a.ndim

2

In [23]:
# Vertical stacking: each 1-D input becomes one row of the result.
array1 = np.array([1, 2, 3])
array2 = np.array([4, 5, 6])
vstacked = np.vstack([array1, array2])
print("\nVertically Stacked Arrays:", vstacked, sep="\n")

# Horizontal stacking: turn each 1-D input into a column first, then
# place the columns side by side.
hstacked = np.hstack([array1[:, np.newaxis], array2[:, np.newaxis]])
print("\nHorizontally Stacked Arrays:", hstacked, sep="\n")


Vertically Stacked Arrays:
[[1 2 3]
 [4 5 6]]

Horizontally Stacked Arrays:
[[1 4]
 [2 5]
 [3 6]]


In [24]:
# Convert the array `a` to floating point. astype returns a new array;
# `a` itself keeps its original dtype.
float_array = a.astype(np.float64)
print("\nArray with Float Data Type:", float_array, sep="\n")



Array with Float Data Type:
[1. 2. 3.]


In [26]:
# Searching: locate the positions whose value equals 4.
# With only a condition, np.where returns a tuple holding one index array
# per dimension (a single array here, since `arr` is 1-D).
arr = np.array([1, 2, 3, 4, 5, 4, 4])
x = np.where(np.equal(arr, 4))
print(x)

(array([3, 5, 6], dtype=int64),)


In [28]:
# Joining two 1-D arrays end to end.
arr1 = np.array([1, 2, 3])
arr2 = np.array([4, 5, 6])
arr = np.concatenate([arr1, arr2], axis=0)
print(arr)

[1 2 3 4 5 6]


In [29]:
# Joining two 2-D arrays along axis 1: the rows are extended side by side,
# so two 2x2 inputs give one 2x4 result.
arr1 = np.array([[1, 2], [3, 4]])
arr2 = np.array([[5, 6], [7, 8]])
arr = np.concatenate([arr1, arr2], axis=1)
print(arr)

[[1 2 5 6]
 [3 4 7 8]]


**Pandas**

In [25]:
# Build a DataFrame from a dictionary: each key becomes a column label and
# each list supplies that column's values, one per row.
data = dict(
    Name=["Alice", "Bob", "Charlie"],
    Age=[25, 30, 35],
    City=["New York", "Los Angeles", "Chicago"],
)
df = pd.DataFrame(data)
print("\nDataFrame:", df, sep="\n")


DataFrame:
      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago


In [3]:
# Load the heart-attack dataset from a CSV file (relative path, resolved
# against the current working directory).
# NOTE: this rebinds `df`, replacing whatever DataFrame the name held before.
df = pd.read_csv('data/HeartAttack.csv')
# Display the first 10 rows
df.head(10)

Unnamed: 0,Age,Gender,Heart rate,Systolic blood pressure,Diastolic blood pressure,Blood sugar,CK-MB,Troponin,Result
0,63,1,66,160,83,160.0,1.8,0.012,negative
1,20,1,94,98,46,296.0,6.75,1.06,positive
2,56,1,64,160,77,270.0,1.99,0.003,negative
3,66,1,70,120,55,270.0,13.87,0.122,positive
4,54,1,64,112,65,300.0,1.08,0.003,negative
5,52,0,61,112,58,87.0,1.83,0.004,negative
6,38,0,40,179,68,102.0,0.71,0.003,negative
7,61,1,60,214,82,87.0,300.0,2.37,positive
8,49,0,60,154,81,135.0,2.35,0.004,negative
9,65,1,61,160,95,100.0,2.84,0.011,negative


In [5]:
# Print a structural summary of the DataFrame: index range, column names,
# non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       1319 non-null   int64  
 1   Gender                    1319 non-null   int64  
 2   Heart rate                1319 non-null   int64  
 3   Systolic blood pressure   1319 non-null   int64  
 4   Diastolic blood pressure  1319 non-null   int64  
 5   Blood sugar               1319 non-null   float64
 6   CK-MB                     1319 non-null   float64
 7   Troponin                  1319 non-null   float64
 8   Result                    1319 non-null   object 
dtypes: float64(3), int64(5), object(1)
memory usage: 92.9+ KB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Age,Gender,Heart rate,Systolic blood pressure,Diastolic blood pressure,Blood sugar,CK-MB,Troponin
count,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0
mean,56.193328,0.659591,78.336619,127.170584,72.269143,146.634344,15.274306,0.360942
std,13.638173,0.474027,51.63027,26.12272,14.033924,74.923045,46.327083,1.154568
min,14.0,0.0,20.0,42.0,38.0,35.0,0.321,0.001
25%,47.0,0.0,64.0,110.0,62.0,98.0,1.655,0.006
50%,58.0,1.0,74.0,124.0,72.0,116.0,2.85,0.014
75%,65.0,1.0,85.0,143.0,81.0,169.5,5.805,0.0855
max,103.0,1.0,1111.0,223.0,154.0,541.0,300.0,10.3


In [7]:
# Get the column labels of the DataFrame
df.columns

Index(['Age', 'Gender', 'Heart rate', 'Systolic blood pressure',
       'Diastolic blood pressure', 'Blood sugar', 'CK-MB', 'Troponin',
       'Result'],
      dtype='object')

In [11]:
# Accessing data: a row by integer position, and a column by its label.
print("\nFirst Row:", df.iloc[0], sep="\n")
print("\nAge Column:", df.loc[:, 'Age'], sep="\n")


First Row:
Age                               63
Gender                             1
Heart rate                        66
Systolic blood pressure          160
Diastolic blood pressure          83
Blood sugar                    160.0
CK-MB                            1.8
Troponin                       0.012
Result                      negative
Name: 0, dtype: object

Age Column:
0       63
1       20
2       56
3       66
4       54
        ..
1314    44
1315    66
1316    45
1317    54
1318    51
Name: Age, Length: 1319, dtype: int64


In [15]:
# Add a new column
# Assigning a scalar to a new column label fills every row with that value.
# NOTE(review): despite its name, 'Patient-ID' holds the constant True in
# every row, so it does not identify patients — confirm whether unique IDs
# were intended.
# This mutates `df` in place, so every later cell sees the extra column.

df['Patient-ID'] = True
print("\nUpdated DataFrame:")
df.head()



Updated DataFrame:


Unnamed: 0,Age,Gender,Heart rate,Systolic blood pressure,Diastolic blood pressure,Blood sugar,CK-MB,Troponin,Result,Patient-ID
0,63,1,66,160,83,160.0,1.8,0.012,negative,True
1,20,1,94,98,46,296.0,6.75,1.06,positive,True
2,56,1,64,160,77,270.0,1.99,0.003,negative,True
3,66,1,70,120,55,270.0,13.87,0.122,positive,True
4,54,1,64,112,65,300.0,1.08,0.003,negative,True


In [16]:
# Filtering data: keep only the rows whose Age is strictly greater than 60.
# The comparison builds a boolean Series that .loc uses as a row mask.

print("\nPeople older than 60:")
df.loc[df['Age'] > 60]


People older than 60:


Unnamed: 0,Age,Gender,Heart rate,Systolic blood pressure,Diastolic blood pressure,Blood sugar,CK-MB,Troponin,Result,Patient-ID
0,63,1,66,160,83,160.0,1.80,0.012,negative,True
3,66,1,70,120,55,270.0,13.87,0.122,positive,True
7,61,1,60,214,82,87.0,300.00,2.370,positive,True
9,65,1,61,160,95,100.0,2.84,0.011,negative,True
11,63,0,60,150,83,198.0,2.39,0.013,negative,True
...,...,...,...,...,...,...,...,...,...,...
1299,62,1,90,136,68,141.0,1.83,0.014,negative,True
1310,70,0,80,135,75,351.0,2.21,10.000,positive,True
1311,85,1,112,115,69,114.0,2.19,0.062,positive,True
1313,86,0,40,179,68,147.0,5.22,0.011,negative,True


In [18]:
# Count how many rows there are for each distinct value of the 'Gender' column

df['Gender'].value_counts()

1    870
0    449
Name: Gender, dtype: int64

In [17]:
# Grouping data: split the rows by 'Gender', then average 'Age' within each group.

grouped = df.groupby('Gender')['Age'].mean()
print("\nAverage Age by Gender:", grouped, sep="\n")


Average Age by Gender:
Gender
0    57.968820
1    55.277011
Name: Age, dtype: float64


In [19]:
# Handling missing data: build a small frame in which each column has one
# missing entry (given as None).

missing_data = dict(
    Name=["Alice", "Bob", None],
    Age=[25, None, 35],
    City=["New York", "Los Angeles", None],
)

df_missing = pd.DataFrame(missing_data)
print("\nDataFrame with Missing Values:")

df_missing.head()



DataFrame with Missing Values:


Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,,Los Angeles
2,,35.0,


In [21]:
# Boolean frame with the same shape as df_missing: True where a value is missing
df_missing.isna()

Unnamed: 0,Name,Age,City
0,False,False,False
1,False,True,False
2,True,False,True


In [27]:
# Replace every missing entry with the placeholder string "Unknown".
# fillna returns a new DataFrame; df_missing itself is left unchanged.
# The same placeholder is applied to all columns, including 'Age'.

df_missing_filled = df_missing.fillna(value="Unknown")
print("\nFilled DataFrame:", df_missing_filled, sep="\n")


Filled DataFrame:
      Name      Age         City
0    Alice     25.0     New York
1      Bob  Unknown  Los Angeles
2  Unknown     35.0      Unknown


In [26]:
# Keep only the rows that have no missing value in any column.
# dropna returns a new DataFrame; df_missing itself is left unchanged.

df_missing_dropped = df_missing.dropna(axis=0, how="any")
print("\nDropped Rows DataFrame:", df_missing_dropped, sep="\n")


Dropped Rows DataFrame:
    Name   Age      City
0  Alice  25.0  New York


In [25]:
# Impute missing ages with the column mean (the median or mode could be
# substituted in the same way).

mean = df_missing["Age"].mean()  # missing values are skipped when averaging
# fillna returns a new Series here, so df_missing is not modified;
# passing inplace=True would modify the existing object instead.
df_missing_by_mean = df_missing["Age"].fillna(mean)
df_missing_by_mean.head()

0    25.0
1    30.0
2    35.0
Name: Age, dtype: float64

In [28]:
# Drop the rows whose "Name" is missing; NaN values in other columns are kept.
# The result is assigned back to the name instead of passing inplace=True:
# the previous DataFrame object is left untouched for anything still
# referencing it, and the call stays chainable.
df_missing = df_missing.dropna(subset=["Name"])

In [29]:
# Display the first rows of df_missing (head() shows at most five rows by default)
df_missing.head()

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,,Los Angeles


In [30]:
# Remove duplicate rows: rows identical in every column are collapsed and
# the first occurrence is kept. The index is not renumbered.
# The result is assigned back instead of passing inplace=True, so the
# previous DataFrame object is not mutated.

df = df.drop_duplicates()