<a href="https://colab.research.google.com/github/Riyasingh22600/ml-fundamentals-assignment/blob/main/ch1-2_notes/ch3_numpy_pandas/ch3_numpy_pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 3: NumPy & Pandas

---

# Chapter 3 – NumPy & Pandas

## 3.1 Introduction to NumPy



In [6]:
# --- Install (only in Colab, skip if local Jupyter) ---
# !pip install numpy pandas matplotlib seaborn scikit-learn

# --- Imports ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --- Creating arrays ---
# np.array builds an ndarray from (nested) Python lists.
arr_1d = np.array([1, 2, 3, 4, 5])
arr_2d = np.array([[1, 2, 3], [4, 5, 6]])

print("1D Array:", arr_1d)
print("2D Array:\n", arr_2d)

# --- Array properties ---
print("Shape of 2D array:", arr_2d.shape)      # (rows, columns)
print("Data type:", arr_1d.dtype)              # inferred from the list contents
print("Number of dimensions:", arr_2d.ndim)

# --- Mathematical operations ---
data = np.array([10, 20, 30, 40, 50])
print("Mean:", np.mean(data))
# np.std uses ddof=0 by default, i.e. the population standard deviation.
print("Standard deviation:", np.std(data))
print("Sum:", np.sum(data))

# --- Array creation functions ---
# A seeded Generator makes the "random" array identical on every
# Restart & Run All, so the printed output stays reproducible.
SEED = 42
rng = np.random.default_rng(SEED)

zeros = np.zeros((3, 4))
ones = np.ones((2, 3))
random_array = rng.random((3, 3))  # uniform samples in [0, 1)

print("Zeros array:\n", zeros)
print("Ones array:\n", ones)
print("Random array:\n", random_array)

# --- Array indexing and slicing ---
arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
print("Elements 2 to 5:", arr[2:6])               # stop index 6 is exclusive
print("Every second element:", arr[::2])          # step of 2 over the whole array
print("Elements greater than 5:", arr[arr > 5])   # boolean-mask selection

# --- Matrix operations ---
matrix_a = np.array([[1, 2], [3, 4]])
matrix_b = np.array([[5, 6], [7, 8]])

# `@` is the matrix-product operator; `*` multiplies element by element.
print("Matrix multiplication:\n", matrix_a @ matrix_b)
print("Element-wise multiplication:\n", matrix_a * matrix_b)


1D Array: [1 2 3 4 5]
2D Array:
 [[1 2 3]
 [4 5 6]]
Shape of 2D array: (2, 3)
Data type: int64
Number of dimensions: 2
Mean: 30.0
Standard deviation: 14.142135623730951
Sum: 150
Zeros array:
 [[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
Ones array:
 [[1. 1. 1.]
 [1. 1. 1.]]
Random array:
 [[0.51994795 0.76099953 0.89021778]
 [0.89879658 0.10933903 0.10570493]
 [0.46751723 0.23140152 0.10510482]]
Elements 2 to 5: [2 3 4 5]
Every second element: [0 2 4 6 8]
Elements greater than 5: [6 7 8 9]
Matrix multiplication:
 [[19 22]
 [43 50]]
Element-wise multiplication:
 [[ 5 12]
 [21 32]]



## 3.2 Introduction to Pandas



In [7]:
import pandas as pd
import numpy as np

# Creating DataFrame
# Each dict key becomes a column; each list holds that column's values.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'Age': [25, 30, 35, 28, 32],
    'Salary': [50000, 60000, 70000, 55000, 65000],
    'Department': ['HR', 'IT', 'Finance', 'HR', 'IT']
}
df = pd.DataFrame(data)
print("DataFrame:\n", df)

# Basic DataFrame operations
print("\nDataFrame info:")
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" line.
df.info()
print("\nBasic statistics:")
print(df.describe())

# Data selection and filtering
print("\nNames only:\n", df['Name'])
print("\nEmployees older than 30:\n", df[df['Age'] > 30])          # boolean mask on rows
print("\nIT department employees:\n", df[df['Department'] == 'IT'])

# Data manipulation
df['Bonus'] = df['Salary'] * 0.1  # Add new column (10% of salary)
df_sorted = df.sort_values('Salary', ascending=False)
print("\nDataFrame with bonus column (sorted by Salary):\n", df_sorted)

# Grouping and aggregation
# A list of functions for one column yields a two-level column index.
dept_summary = df.groupby('Department').agg({
    'Age': 'mean',
    'Salary': ['mean', 'max', 'min']
})
print("\nDepartment summary:\n", dept_summary)

# Data cleaning examples
# Handle missing values
# NaN cannot be stored in an int64 column, so cast Age to float explicitly
# (astype returns a new frame; df itself is untouched) instead of relying on
# an implicit int -> float upcast during assignment.
df_with_missing = df.astype({'Age': 'float64'})
df_with_missing.loc[2, 'Age'] = np.nan
print("\nData with missing values:\n", df_with_missing)
# Fill every numeric column's gaps with that column's mean.
df_filled = df_with_missing.fillna(df_with_missing.mean(numeric_only=True))
print("\nAfter filling missing values:\n", df_filled)

# Remove duplicates
# Re-append the first two rows to create exact duplicate records.
df_with_duplicates = pd.concat([df, df.iloc[:2]], ignore_index=True)
print("\nData with duplicates:\n", df_with_duplicates)
df_deduplicated = df_with_duplicates.drop_duplicates()
print("\nAfter removing duplicates:\n", df_deduplicated)


DataFrame:
       Name  Age  Salary Department
0    Alice   25   50000         HR
1      Bob   30   60000         IT
2  Charlie   35   70000    Finance
3    Diana   28   55000         HR
4      Eve   32   65000         IT

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        5 non-null      object
 1   Age         5 non-null      int64 
 2   Salary      5 non-null      int64 
 3   Department  5 non-null      object
dtypes: int64(2), object(2)
memory usage: 292.0+ bytes
None

Basic statistics:
             Age       Salary
count   5.000000      5.00000
mean   30.000000  60000.00000
std     3.807887   7905.69415
min    25.000000  50000.00000
25%    28.000000  55000.00000
50%    30.000000  60000.00000
75%    32.000000  65000.00000
max    35.000000  70000.00000

Names only:
 0      Alice
1        Bob
2    Charlie
3      Diana
4     

## 3.3 NumPy and Pandas Integration Example


In [8]:
import numpy as np
import pandas as pd

# Build the small employee table used throughout this example.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    Age=[25, 30, 35, 28, 32],
    Salary=[50000, 60000, 70000, 55000, 65000],
    Department=['HR', 'IT', 'Finance', 'HR', 'IT'],
)
df = pd.DataFrame(data)

# DataFrame -> NumPy: pull the two numeric columns out as a plain 2-D array.
numeric_columns = ['Age', 'Salary']
numpy_array = df.loc[:, numeric_columns].values
print("NumPy array from DataFrame:\n", numpy_array)

# Column-wise transformations.
# Age is standardised to a z-score (centre on the mean, divide by the
# column's std()); Salary is log-transformed by applying np.log to the Series.
age = df['Age']
age_centered = age - age.mean()
df['Age_normalized'] = age_centered / age.std()
df['Salary_log'] = np.log(df['Salary'])

report_columns = ['Name', 'Age', 'Age_normalized', 'Salary', 'Salary_log']
print("\nDataFrame with NumPy transformations:\n", df[report_columns])

# NumPy statistics computed directly on pandas columns:
# np.corrcoef returns the 2x2 correlation matrix of the two series.
correlation_matrix = np.corrcoef(df['Age'], df['Salary'])
print("\nCorrelation between Age and Salary:\n", correlation_matrix)


NumPy array from DataFrame:
 [[   25 50000]
 [   30 60000]
 [   35 70000]
 [   28 55000]
 [   32 65000]]

DataFrame with NumPy transformations:
       Name  Age  Age_normalized  Salary  Salary_log
0    Alice   25       -1.313064   50000   10.819778
1      Bob   30        0.000000   60000   11.002100
2  Charlie   35        1.313064   70000   11.156251
3    Diana   28       -0.525226   55000   10.915088
4      Eve   32        0.525226   65000   11.082143

Correlation between Age and Salary:
 [[1.         0.99654576]
 [0.99654576 1.        ]]
