- Setup a virtual environment 
```bash
python -m venv env

./env/bin/activate
```

## NUMPY

### Install numpy
```bash
pip install numpy
```

In [9]:
%pip install numpy

Note: you may need to restart the kernel to use updated packages.


In [None]:
import numpy as np
print(np.__version__)

2.3.3


In [36]:
# Python list example
my_list = [1, 2, 3, 4]
print(my_list)
my_list = my_list * 2        # [1, 2, 3, 4, 1, 2, 3, 4]
print(my_list)

# NumPy array example
array = np.array([1, 2, 3, 4])
print(array)
print(type(array))           # <class 'numpy.ndarray'> , here ndarray means n-dimensional array
array = array * 2            # [2 4 6 8]
print(array)

[1, 2, 3, 4]
[1, 2, 3, 4, 1, 2, 3, 4]
[1 2 3 4]
<class 'numpy.ndarray'>
[2 4 6 8]


## Multidimensional Arrays

In [20]:
# 0D array: Just a single scalar value
array0D = np.array('A')
print("0D array value:", array0D)
print("0D array ndim:", array0D.ndim)    # 0

# 1D array: A row of values
array1D = np.array(['A', 'B', 'C'])
print("1D array:", array1D)
print("1D array ndim:", array1D.ndim)    # 1

# 2D array: Matrix (rows and columns)
array2D = np.array([
    ['A', 'B', 'C'],
    ['D', 'E', 'F'],
    ['G', 'H', 'I']
])
print("2D array:\n", array2D)
print("2D array ndim:", array2D.ndim)
print("2D array shape (rows, columns):", array2D.shape)

# 3D array: Layers, rows, columns (depth)
array3D = np.array([
    [['A', 'B', 'C'], ['D', 'E', 'F'], ['G', 'H', 'I']],
    [['J', 'K', 'L'], ['M', 'N', 'O'], ['P', 'Q', 'R']],
    [['S', 'T', 'U'], ['V', 'W', 'X'], ['Y', 'Z', ' ']]
])
print("3D array shape (layers, rows, columns):", array3D.shape)
print("3D array ndim:", array3D.ndim)

# Multidimensional indexing (access specific elements)
print("array3D[0, 0, 0]:", array3D[0, 0, 0])  # First layer, first row, first column
print("array3D[1, 1, 1]:", array3D[1, 1, 1])  # Layer 1, row 1, column 1 (zero-based)

# Forming a three-letter word using indexing and concatenation
word = array3D[0, 0, 1] + array3D[0, 0, 0] + array3D[2, 0, 1]
print("Concatenated word (B, A, T):", word)


0D array value: A
0D array ndim: 0
1D array: ['A' 'B' 'C']
1D array ndim: 1
2D array:
 [['A' 'B' 'C']
 ['D' 'E' 'F']
 ['G' 'H' 'I']]
2D array ndim: 2
2D array shape (rows, columns): (3, 3)
3D array shape (layers, rows, columns): (3, 3, 3)
3D array ndim: 3
array3D[0, 0, 0]: A
array3D[1, 1, 1]: N
Concatenated word (B, A, T): BAT


## Slicing

In [21]:
# Create a 4x4 matrix
array = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
    [13, 14, 15, 16]
])

# Select first row
print("First row:", array[0])

# Select last row using negative index
print("Last row (negative index):", array[-1])

# Row slicing
print("Rows 0 to 2:", array[0:3])
print("Rows 1 to 3:", array[1:4])
print("All rows:", array[:])

# Step - every second row
print("Every 2nd row:", array[::2])

# Reverse rows
print("Rows in reverse:", array[::-1])

# Column selection (all rows, first column)
print("First column (all rows):", array[:, 0])

# Last column (negative index)
print("Last column (all rows):", array[:, -1])

# First 3 columns (all rows)
print("First 3 columns (all rows):", array[:, 0:3])

# Skip first column, show rest
print("All except first column:", array[:, 1:])

# Every 2nd column
print("Every 2nd column:", array[:, ::2])

# Reverse columns
print("Columns in reverse:", array[:, ::-1])

# Quadrant selection (first two rows and first two columns)
print("Top-left 2x2 quadrant:", array[0:2, 0:2])


First row: [1 2 3 4]
Last row (negative index): [13 14 15 16]
Rows 0 to 2: [[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]
Rows 1 to 3: [[ 5  6  7  8]
 [ 9 10 11 12]
 [13 14 15 16]]
All rows: [[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]
 [13 14 15 16]]
Every 2nd row: [[ 1  2  3  4]
 [ 9 10 11 12]]
Rows in reverse: [[13 14 15 16]
 [ 9 10 11 12]
 [ 5  6  7  8]
 [ 1  2  3  4]]
First column (all rows): [ 1  5  9 13]
Last column (all rows): [ 4  8 12 16]
First 3 columns (all rows): [[ 1  2  3]
 [ 5  6  7]
 [ 9 10 11]
 [13 14 15]]
All except first column: [[ 2  3  4]
 [ 6  7  8]
 [10 11 12]
 [14 15 16]]
Every 2nd column: [[ 1  3]
 [ 5  7]
 [ 9 11]
 [13 15]]
Columns in reverse: [[ 4  3  2  1]
 [ 8  7  6  5]
 [12 11 10  9]
 [16 15 14 13]]
Top-left 2x2 quadrant: [[1 2]
 [5 6]]


## Scalar Arithmetic

In [22]:
array = np.array([1, 2, 3])
print("Original:", array)
print("Add 1:", array + 1)
print("Subtract 2:", array - 2)
print("Multiply by 3:", array * 3)
print("Divide by 4:", array / 4)
print("Raise each to power of 5:", array ** 5)


Original: [1 2 3]
Add 1: [2 3 4]
Subtract 2: [-1  0  1]
Multiply by 3: [3 6 9]
Divide by 4: [0.25 0.5  0.75]
Raise each to power of 5: [  1  32 243]


## Vectorized Math Functions

In [23]:
array = np.array([1.01, 2.5, 3.99])
print("Original:", array)
print("Square roots:", np.sqrt(array))
print("Rounded values:", np.round(array))
print("Floored values:", np.floor(array))
print("Ceiling values:", np.ceil(array))
print("Value of pi:", np.pi)

# Area of circles: area = pi * radius^2
radii = np.array([1, 2, 3])
areas = np.pi * radii ** 2
print("Radii:", radii)
print("Areas of circles:", areas)

Original: [1.01 2.5  3.99]
Square roots: [1.00498756 1.58113883 1.99749844]
Rounded values: [1. 2. 4.]
Floored values: [1. 2. 3.]
Ceiling values: [2. 3. 4.]
Value of pi: 3.141592653589793
Radii: [1 2 3]
Areas of circles: [ 3.14159265 12.56637061 28.27433388]


## Elementwise Arithmetic & Comparison

In [24]:
arr1 = np.array([1, 2, 3])
arr2 = np.array([4, 5, 6])
print("Array 1:", arr1)
print("Array 2:", arr2)

print("Elementwise addition:", arr1 + arr2)
print("Elementwise subtraction:", arr1 - arr2)
print("Elementwise multiplication:", arr1 * arr2)
print("Elementwise division:", arr1 / arr2)
print("Elementwise power:", arr1 ** arr2)

scores = np.array([91, 55, 100, 73, 82, 64])
print("Scores:", scores)
print("Is score == 100?:", scores == 100)
print("Passing scores (>= 60):", scores >= 60)
print("Failing scores (< 60):", scores < 60)

# Assign zero to failing scores (< 60)
scores[scores < 60] = 0
print("Scores (fails set to zero):", scores)


Array 1: [1 2 3]
Array 2: [4 5 6]
Elementwise addition: [5 7 9]
Elementwise subtraction: [-3 -3 -3]
Elementwise multiplication: [ 4 10 18]
Elementwise division: [0.25 0.4  0.5 ]
Elementwise power: [  1  32 729]
Scores: [ 91  55 100  73  82  64]
Is score == 100?: [False False  True False False False]
Passing scores (>= 60): [ True False  True  True  True  True]
Failing scores (< 60): [False  True False False False False]
Scores (fails set to zero): [ 91   0 100  73  82  64]


## Broadcasting

In [32]:
arr1 = np.array([[1, 2, 3, 4]])         # Shape: (1,4)
arr2 = np.array([[1], [2], [3], [4]])   # Shape: (4,1)

print("arr1 shape:", arr1.shape)
print("arr2 shape:", arr2.shape)
result = arr1 * arr2
print("Result of arr1 * arr2 (broadcasted):\n", result)

# Matrix multiplication
print("Matrix multiplication (dot product):\n", np.matmul(arr1, arr2), np.dot(arr1, arr2))

# Create multiplication table from 1 to 10
arr1 = np.array([[1,2,3,4,5,6,7,8,9,10]])
arr2 = np.array([[1],[2],[3],[4],[5],[6],[7],[8],[9],[10]])
print("Multiplication table (1-10):\n", arr1 * arr2)


arr1 shape: (1, 4)
arr2 shape: (4, 1)
Result of arr1 * arr2 (broadcasted):
 [[ 1  2  3  4]
 [ 2  4  6  8]
 [ 3  6  9 12]
 [ 4  8 12 16]]
Matrix multiplication (dot product):
 [[30]] [[30]]
Multiplication table (1-10):
 [[  1   2   3   4   5   6   7   8   9  10]
 [  2   4   6   8  10  12  14  16  18  20]
 [  3   6   9  12  15  18  21  24  27  30]
 [  4   8  12  16  20  24  28  32  36  40]
 [  5  10  15  20  25  30  35  40  45  50]
 [  6  12  18  24  30  36  42  48  54  60]
 [  7  14  21  28  35  42  49  56  63  70]
 [  8  16  24  32  40  48  56  64  72  80]
 [  9  18  27  36  45  54  63  72  81  90]
 [ 10  20  30  40  50  60  70  80  90 100]]


## Aggregate Functions

In [None]:
array = np.array([
    [1, 2, 3, 4, 5],
    [6, 7, 8, 9, 10]
])

print("Array:\n", array)
print("Sum (all elements):", np.sum(array))
print("Mean:", np.mean(array))
print("Standard deviation:", np.std(array))
print("Variance:", np.var(array))
print("Minimum value:", np.min(array))
print("Maximum value:", np.max(array))
print("Index of min value:", np.argmin(array))
print("Index of max value:", np.argmax(array))

print("Column sums:", np.sum(array, axis=0))
print("Row sums:", np.sum(array, axis=1))


Array:
 [[ 1  2  3  4  5]
 [ 6  7  8  9 10]]
Sum (all elements): 55
Mean: 5.5
Standard deviation: 2.8722813232690143
Variance: 8.25
Minimum value: 1
Maximum value: 10
Index of min value: 0
Index of max value: 9
Column sums: [ 7  9 11 13 15]
Row sums: [15 40]


## Filtering

In [34]:
ages = np.array([
    [21, 17, 19, 20, 16, 30, 18, 65],
    [39, 22, 15, 99, 18, 19, 20, 21]
])

print("Ages array:\n", ages)

teenagers = ages[ages < 18]
print("Teenagers (<18):", teenagers)

# Adults: 18 <= age < 65
adults = ages[(ages >= 18) & (ages < 65)]
print("Adults (18 to <65):", adults)

seniors = ages[ages >= 65]
print("Seniors (65+):", seniors)

evens = ages[ages % 2 == 0]
print("Even ages:", evens)

odds = ages[ages % 2 != 0]
print("Odd ages:", odds)

# Use np.where to keep original shape, others replaced with 0
adults_full_shape = np.where(ages >= 18, ages, 0)
print("Adults (preserving shape, others set to 0):\n", adults_full_shape)


Ages array:
 [[21 17 19 20 16 30 18 65]
 [39 22 15 99 18 19 20 21]]
Teenagers (<18): [17 16 15]
Adults (18 to <65): [21 19 20 30 18 39 22 18 19 20 21]
Seniors (65+): [65 99]
Even ages: [20 16 30 18 22 18 20]
Odd ages: [21 17 19 65 39 15 99 19 21]
Adults (preserving shape, others set to 0):
 [[21  0 19 20  0 30 18 65]
 [39 22  0 99 18 19 20 21]]


## Random Numbers

In [35]:
# Create random number generator
rng = np.random.default_rng()

# Random integer between 1 and 9
print("Random integer (1-9):", rng.integers(1, 10))

# 1D array of 5 random integers between 1 and 9
print("1D array, 5 random integers:", rng.integers(1, 10, size=5))

# 2x4 array of random integers between 1 and 9
print("2x4 array of random integers:\n", rng.integers(1, 10, size=(2,4)))

# Set seed for reproducibility
rng = np.random.default_rng(seed=1)
print("Random float array (size 5, 0-1):", rng.uniform(0, 1, size=5))

# Shuffle an array
arr = np.array([1,2,3,4,5])
print("Original array before shuffle:", arr)
rng.shuffle(arr)
print("Array after shuffle:", arr)

# Pick random values from array (no replacement)
arr = np.array(['🍎', '🍌', '🍓', '🍇', '🍍'])
choice = rng.choice(arr, size=3, replace=False)
print("Random unique emojis chosen:", choice)


Random integer (1-9): 4
1D array, 5 random integers: [5 3 6 9 9]
2x4 array of random integers:
 [[3 7 4 9]
 [7 6 3 9]]
Random float array (size 5, 0-1): [0.51182162 0.9504637  0.14415961 0.94864945 0.31183145]
Original array before shuffle: [1 2 3 4 5]
Array after shuffle: [3 4 5 2 1]
Random unique emojis chosen: ['🍇' '🍎' '🍌']


# PANDAS

In [2]:
# Install Pandas (run this in your shell/terminal, not Python script!)
# pip install pandas

# Import pandas as the common alias 'pd'
import pandas as pd

# Print Pandas version
print("Pandas version:", pd.__version__)


Pandas version: 2.3.3


In [18]:
# Create Python list and convert to Pandas Series (default integer index)
data = [100, 102, 104]
series_obj = pd.Series(data)
print("Default Series with integer index:\n", series_obj)

# Series with floats
series_obj = pd.Series([100.1, 102.3, 104.5])
print("Series with floats:\n", series_obj)

# Series with strings
series_obj = pd.Series(["A", "B", "C"])
print("Series with strings:\n", series_obj)

# Series with booleans
series_obj = pd.Series([True, False, True])
print("Series with booleans:\n", series_obj)

# Series with custom index labels
series_obj = pd.Series([100, 102, 104], index=["A", "B", "C"])
print("Series with custom labels:\n", series_obj)

# Change labels to apartment numbers (analogy)
series_obj = pd.Series([100, 102, 104], index=["apartment 1", "apartment 2", "apartment 3"])
print("Series with apartment labels:\n", series_obj)

# Accessing values by label
print("Value at label 'A':", pd.Series([100, 102, 104], index=["A", "B", "C"]).loc["A"])
# Accessing by position
print("Value at position 0:", pd.Series([100, 102, 104], index=["A", "B", "C"]).iloc[0])

# Updating value by label
series_obj = pd.Series([100, 102, 104], index=["A", "B", "C"])
print("Original Series:\n", series_obj)

series_obj.loc["C"] = 200
print("Series after updating 'C' to 200:\n", series_obj)

# Filter by value: keep values >= 102
filtered = series_obj[series_obj >= 102]
print("Filtered Series (>= 102):\n", filtered)

# Example with more values and custom labels
series_obj = pd.Series([100, 102, 104, 200, 202], index=["A", "B", "C", "D", "E"])
print("Series with 5 elements:\n", series_obj )
print("Values >= 200:\n", series_obj[series_obj >= 200])
print("Values < 200:\n", series_obj[series_obj < 200])

# Creating Series directly from a dictionary
calories = {"day one": 1750, "day two": 2100, "day three": 1700}
calories_series = pd.Series(calories)
print("Calories series:\n", calories_series)

# Access and update example
print("Calories on day three (before):", calories_series.loc["day three"])
calories_series.loc["day three"] += 500
print("Calories on day three (after adding 500):", calories_series.loc["day three"])

# Filtering days with more than 2000 calories
print("Days above 2000 calories:\n", calories_series[calories_series >= 2000])

# Filtering days below 2000 calories
print("Days below 2000 calories:\n", calories_series[calories_series < 2000])


Default Series with integer index:
 0    100
1    102
2    104
dtype: int64
Series with floats:
 0    100.1
1    102.3
2    104.5
dtype: float64
Series with strings:
 0    A
1    B
2    C
dtype: object
Series with booleans:
 0     True
1    False
2     True
dtype: bool
Series with custom labels:
 A    100
B    102
C    104
dtype: int64
Series with apartment labels:
 apartment 1    100
apartment 2    102
apartment 3    104
dtype: int64
Value at label 'A': 100
Value at position 0: 100
Original Series:
 A    100
B    102
C    104
dtype: int64
Series after updating 'C' to 200:
 A    100
B    102
C    200
dtype: int64
Filtered Series (>= 102):
 B    102
C    200
dtype: int64
Series with 5 elements:
 A    100
B    102
C    104
D    200
E    202
dtype: int64
Values >= 200:
 D    200
E    202
dtype: int64
Values < 200:
 A    100
B    102
C    104
dtype: int64
Calories series:
 day one      1750
day two      2100
day three    1700
dtype: int64
Calories on day three (before): 1700
Calories on da

In [19]:
# Create DataFrame from dictionary (columns as keys, lists as column data)
data = {
    "name": ["Spongebob", "Patrick", "Squidward"],
    "age": [30, 35, 50]
}
df = pd.DataFrame(data)
print("Basic employee DataFrame:\n", df)

# DataFrame with a custom index
df = pd.DataFrame(data, index=["employee 1", "employee 2", "employee 3"])
print("Employee DataFrame with custom index:\n", df)

# Access a row by index label (loc)
print("Data for employee 1 (Spongebob):\n", df.loc["employee 1"])

# Access a row by index position (iloc)
print("First row using iloc:\n", df.iloc[0])

# Add a new column
df["job"] = ["cook", "NA", "cashier"]  # NA means not available
print("DataFrame after adding job column:\n", df)

# Add a new row, recommended method: create a DataFrame and concat
new_row = pd.DataFrame(
    [{"name": "Sandy", "age": 28, "job": "engineer"}],
    index=["employee 4"]
)
df = pd.concat([df, new_row])
print("DataFrame after adding new row (Sandy):\n", df)

# Add multiple new rows at once
new_rows = pd.DataFrame(
    [
        {"name": "Eugene", "age": 60, "job": "manager"}
    ],
    index=["employee 5"]
)
df = pd.concat([df, new_rows])
print("DataFrame after adding Eugene:\n", df)


Basic employee DataFrame:
         name  age
0  Spongebob   30
1    Patrick   35
2  Squidward   50
Employee DataFrame with custom index:
                  name  age
employee 1  Spongebob   30
employee 2    Patrick   35
employee 3  Squidward   50
Data for employee 1 (Spongebob):
 name    Spongebob
age            30
Name: employee 1, dtype: object
First row using iloc:
 name    Spongebob
age            30
Name: employee 1, dtype: object
DataFrame after adding job column:
                  name  age      job
employee 1  Spongebob   30     cook
employee 2    Patrick   35       NA
employee 3  Squidward   50  cashier
DataFrame after adding new row (Sandy):
                  name  age       job
employee 1  Spongebob   30      cook
employee 2    Patrick   35        NA
employee 3  Squidward   50   cashier
employee 4      Sandy   28  engineer
DataFrame after adding Eugene:
                  name  age       job
employee 1  Spongebob   30      cook
employee 2    Patrick   35        NA
employee 3  

In [20]:
# Assuming 'data.csv' and 'data.json' are in your working directory
# Reading CSV file into DataFrame
df_csv = pd.read_csv("data.csv")
print("CSV DataFrame (first 5 rows):\n", df_csv.head())

# Print all data (be careful for big files)
print("CSV DataFrame (all rows):\n", df_csv.to_string())

# Reading JSON file into DataFrame
# df_json = pd.read_json("data.json")
# print("JSON DataFrame (first 5 rows):\n", df_json.head())


CSV DataFrame (first 5 rows):
    no        name  type1   type2  height  weight  legendary
0   1   Bulbasaur  Grass  Poison     0.7     6.9          0
1   2     Ivysaur  Grass  Poison     1.0    13.0          0
2   3    Venusaur  Grass  Poison     2.0   100.0          0
3   4  Charmander   Fire     NaN     0.6     8.5          0
4   5  Charmeleon   Fire     NaN     1.1    19.0          0
CSV DataFrame (all rows):
       no        name     type1     type2  height  weight  legendary
0      1   Bulbasaur     Grass    Poison     0.7     6.9          0
1      2     Ivysaur     Grass    Poison     1.0    13.0          0
2      3    Venusaur     Grass    Poison     2.0   100.0          0
3      4  Charmander      Fire       NaN     0.6     8.5          0
4      5  Charmeleon      Fire       NaN     1.1    19.0          0
5      6   Charizard      Fire    Flying     1.7    90.5          0
6      7    Squirtle     Water       NaN     0.5     9.0          0
7      8   Wartortle     Water       N

In [21]:
# Selecting a column (returns a Series)
print("Names column:\n", df_csv["name"])

# Selecting multiple columns
print("Name, height, and weight columns:\n", df_csv[["name", "height", "weight"]])

# Selection by row label (loc)
df_named = pd.read_csv("data.csv", index_col="name")
print("Data for Pikachu:\n", df_named.loc["Pikachu"])

# Selection by row label and columns
print("Charizard height and weight:\n", df_named.loc["Charizard", ["height", "weight"]])

# Select a range of rows (by name)
print("From Charizard to Blastoise:\n", df_named.loc["Charizard":"Blastoise"])

# Selection by row position (iloc) for first 10 rows
print("First 10 rows:\n", df_csv.iloc[0:10])

# Every second row (step)
print("Every second row (first 10):\n", df_csv.iloc[0:10:2])

# Select columns by position
print("First 3 columns for first 10 rows:\n", df_csv.iloc[0:10, 0:3])

# Search by user input with error handling
pokemon = input("Enter a Pokemon name: ")
try:
    print("Stats for", pokemon, ":\n", df_named.loc[pokemon])
except KeyError:
    print(f"{pokemon} not found.")


Names column:
 0       Bulbasaur
1         Ivysaur
2        Venusaur
3      Charmander
4      Charmeleon
          ...    
145       Moltres
146       Dratini
147     Dragonair
148     Dragonite
149        Mewtwo
Name: name, Length: 150, dtype: object
Name, height, and weight columns:
            name  height  weight
0     Bulbasaur     0.7     6.9
1       Ivysaur     1.0    13.0
2      Venusaur     2.0   100.0
3    Charmander     0.6     8.5
4    Charmeleon     1.1    19.0
..          ...     ...     ...
145     Moltres     2.0    60.0
146     Dratini     1.8     3.3
147   Dragonair     4.0    16.5
148   Dragonite     2.2   210.0
149      Mewtwo     2.0   122.0

[150 rows x 3 columns]
Data for Pikachu:
 no                 25
type1        Electric
type2             NaN
height            0.4
weight            6.0
legendary           0
Name: Pikachu, dtype: object
Charizard height and weight:
 height     1.7
weight    90.5
Name: Charizard, dtype: object
From Charizard to Blastoise:
     

In [22]:
# Filter Pokémon with height >= 2m
tall_pokemon = df_csv[df_csv["height"] >= 2]
print("Tall Pokémon (height >= 2):\n", tall_pokemon)

# Filter Pokémon with weight > 100kg
heavy_pokemon = df_csv[df_csv["weight"] > 100]
print("Heavy Pokémon (weight > 100):\n", heavy_pokemon)

# Filter legendary Pokémon (legendary == 1)
legendary_pokemon = df_csv[df_csv["legendary"] == 1]
print("Legendary Pokémon:\n", legendary_pokemon)

# Filter Pokémon where type1 == "water"
water_pokemon = df_csv[df_csv["type1"] == "water"]
print("Water type Pokémon (type1):\n", water_pokemon)

# Filter Pokémon where type1 == "water" or type2 == "water"
water_any = df_csv[(df_csv["type1"] == "water") | (df_csv["type2"] == "water")]
print("Any water type Pokémon:\n", water_any)

# Filter Pokémon that are fire AND flying types
fire_flying = df_csv[(df_csv["type1"] == "fire") & (df_csv["type2"] == "flying")]
print("Fire & Flying Pokémon:\n", fire_flying)


Tall Pokémon (height >= 2):
       no        name    type1    type2  height  weight  legendary
2      3    Venusaur    Grass   Poison     2.0   100.0          0
22    23       Ekans   Poison      NaN     2.0     6.9          0
23    24       Arbok   Poison      NaN     3.5    65.0          0
94    95        Onix     Rock   Ground     8.8   210.0          0
102  103   Exeggutor    Grass  Psychic     2.0   120.0          0
114  115  Kangaskhan   Normal      NaN     2.2    80.0          0
129  130    Gyarados    Water   Flying     6.5   235.0          0
130  131      Lapras    Water      Ice     2.5   220.0          0
142  143     Snorlax   Normal      NaN     2.1   460.0          0
145  146     Moltres     Fire   Flying     2.0    60.0          1
147  148   Dragonair   Dragon      NaN     4.0    16.5          0
148  149   Dragonite   Dragon   Flying     2.2   210.0          0
149  150      Mewtwo  Psychic      NaN     2.0   122.0          1
Heavy Pokémon (weight > 100):
       no       n

In [None]:
# Aggregate on all numeric columns
print("Means (numeric columns only):\n", df_csv.mean(numeric_only=True))
print("Sums (numeric columns only):\n", df_csv.sum(numeric_only=True))
print("Minimums (numeric columns only):\n", df_csv.min(numeric_only=True))
print("Maximums (numeric columns only):\n", df_csv.max(numeric_only=True))
print("Counts (non-null values):\n", df_csv.count())

# Aggregate on single column (height)
print("Height mean:", df_csv["height"].mean())
print("Height sum:", df_csv["height"].sum())
print("Height min:", df_csv["height"].min())
print("Height max:", df_csv["height"].max())
print("Height count:", df_csv["height"].count())
print("Type2 count (shows missing data):", df_csv["type2"].count())

# Grouping by type1 and aggregate height
grouped = df_csv.groupby("type1")
print("Mean height for each type:\n", grouped["height"].mean())
print("Sum of heights for each type:\n", grouped["height"].sum())
print("Min height for each type:\n", grouped["height"].min())
print("Max height for each type:\n", grouped["height"].max())
print("Number of Pokémon for each type:\n", grouped.size())


Means (numeric columns only):
 no           75.500000
height        1.200000
weight       46.231333
legendary     0.026667
dtype: float64
Sums (numeric columns only):
 no           11325.0
height         180.0
weight        6934.7
legendary        4.0
dtype: float64
Minimums (numeric columns only):
 no           1.0
height       0.2
weight       0.1
legendary    0.0
dtype: float64
Maximums (numeric columns only):
 no           150.0
height         8.8
weight       460.0
legendary      1.0
dtype: float64
Counts (non-null values):
 no           150
name         150
type1        150
type2         67
height       150
weight       150
legendary    150
dtype: int64
Height mean: 1.2
Height sum: 180.0
Height min: 0.2
Height max: 8.8
Height count: 150
Type2 count (shows missing data): 67
Mean height for each type:
 type1
Bug         0.900000
Dragon      2.666667
Electric    0.855556
Fairy       0.950000
Fighting    1.185714
Fire        1.216667
Ghost       1.466667
Grass       1.083333
Ground  

In [23]:
# Drop irrelevant columns
df_cleaned = df_csv.drop(columns=["legendary", "no"])
print("DataFrame after dropping columns:\n", df_cleaned.head())

# Drop rows with missing values in column 'type2'
df_no_missing = df_csv.dropna(subset=["type2"])
print("DataFrame after dropping rows with missing type2:\n", df_no_missing.head())

# Replace missing values in 'type2' with 'None'
df_filled = df_csv.copy()
df_filled["type2"] = df_filled["type2"].fillna("None")
print("DataFrame after replacing missing type2 with 'None':\n", df_filled.head())


DataFrame after dropping columns:
          name  type1   type2  height  weight
0   Bulbasaur  Grass  Poison     0.7     6.9
1     Ivysaur  Grass  Poison     1.0    13.0
2    Venusaur  Grass  Poison     2.0   100.0
3  Charmander   Fire     NaN     0.6     8.5
4  Charmeleon   Fire     NaN     1.1    19.0
DataFrame after dropping rows with missing type2:
     no        name  type1   type2  height  weight  legendary
0    1   Bulbasaur  Grass  Poison     0.7     6.9          0
1    2     Ivysaur  Grass  Poison     1.0    13.0          0
2    3    Venusaur  Grass  Poison     2.0   100.0          0
5    6   Charizard   Fire  Flying     1.7    90.5          0
11  12  Butterfree    Bug  Flying     1.1    32.0          0
DataFrame after replacing missing type2 with 'None':
    no        name  type1   type2  height  weight  legendary
0   1   Bulbasaur  Grass  Poison     0.7     6.9          0
1   2     Ivysaur  Grass  Poison     1.0    13.0          0
2   3    Venusaur  Grass  Poison     2.0   1