This set of exercises focuses on using Pandas, a powerful library for data manipulation and analysis in Python. You'll learn to create and manipulate DataFrames, work with real-world datasets, handle missing values, and perform various data operations. The exercises cover key Pandas functionalities including data loading, cleaning, transformation, and basic analysis.

In [2]:
# pandas supplies the DataFrame and Series objects built in the cells below.
import pandas as pd

# NumPy is used below to build an object array and for the np.float64 dtype.
import numpy as np
# Show which NumPy version this notebook was executed with.
print(np.__version__)


2.2.2


In [None]:
# 1.1 -- build the same 5x3 table twice: once from a NumPy array and once
# from three pandas Series.
print(1.1)
print("NumPy:")

# Row labels shared by every object constructed in this cell.
row_index = [1, 3, 5, 7, 9]

# dtype=object is required here: every row mixes a str, a list and a float,
# so the cells have to be stored as plain Python objects.
data_np = np.array([
    ["Blue", [1, 2], 1.1],
    ["Red", [3, 4], 2.2],
    ["Pink", [5, 6], 3.3],
    ["Grey", [7, 8], 4.4],
    ["Black", [9, 10], 5.5],
], dtype=object)

# DataFrame built from the 2-D object array.
df_np = pd.DataFrame(
    data_np,
    columns=["color", "list", "number"],
    index=row_index,
)

print("DataFrame from NumPy array:")
print(df_np)

# One Series per column, all carrying the same row labels.
colors = pd.Series(["Blue", "Red", "Pink", "Grey", "Black"], index=row_index)
lists = pd.Series([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]], index=row_index)
numbers = pd.Series([1.1, 2.2, 3.3, 4.4, 5.5], index=row_index)

# DataFrame built from a mapping of column name -> Series.
df_series = pd.DataFrame({"color": colors, "list": lists, "number": numbers})

print("\nDataFrame from Pandas Series:")
print(df_series, "\n")

# 1.2 -- report the type of each column, then the type of each first value.
print(1.2)
# Selecting a single column returns a Series, so these three lines show the
# container type of each column rather than the type of the values inside.
print("Column types:")
print(type(df_series["color"]))
print(type(df_series["list"]))
print(type(df_series["number"]))

print("\nFirst value types:")
# The object array stored the numbers as generic Python objects; cast the
# column so that its values are reported as np.float64.
df_np["number"] = df_np["number"].astype(np.float64)
print(type(df_np.iloc[0, 0]), type(df_np.iloc[0, 1]), type(df_np.iloc[0, 2]))



1.1
NumPy:
DataFrame from NumPy array:
   color     list number
1   Blue   [1, 2]    1.1
3    Red   [3, 4]    2.2
5   Pink   [5, 6]    3.3
7   Grey   [7, 8]    4.4
9  Black  [9, 10]    5.5

DataFrame from Pandas Series:
   color     list  number
1   Blue   [1, 2]     1.1
3    Red   [3, 4]     2.2
5   Pink   [5, 6]     3.3
7   Grey   [7, 8]     4.4
9  Black  [9, 10]     5.5 

1.2
First value types:
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>

First value types:
<class 'str'> <class 'list'> <class 'numpy.float64'>


In [32]:

# 2 -- load the household power consumption dataset and trim it down.
url = "https://assets.01-edu.org/ai-branch/piscine-ai/household_power_consumption.txt"

# The file is ';'-separated. low_memory=False makes pandas parse each column
# in one pass instead of in chunks, which avoids mixed-type inference.
df = pd.read_csv(url, delimiter=";", low_memory=False)

# Convert the measurement columns to numbers; entries that cannot be parsed
# become NaN instead of raising.
for column in ("Global_active_power", "Global_reactive_power", "Voltage"):
    df[column] = pd.to_numeric(df[column], errors="coerce")

# 2.1 -- drop the columns that are not needed.
print(2.1)
df = df.drop(columns=["Time", "Sub_metering_2", "Sub_metering_3"])

# 2.2 -- use the 'Date' column as the row index.
print(2.2)
df = df.set_index("Date")

# Display the first few rows of the prepared frame.
print(df.head())



               Date      Time  Global_active_power  Global_reactive_power  \
0        16/12/2006  17:24:00                4.216                  0.418   
1        16/12/2006  17:25:00                5.360                  0.436   
2        16/12/2006  17:26:00                5.374                  0.498   
3        16/12/2006  17:27:00                5.388                  0.502   
4        16/12/2006  17:28:00                3.666                  0.528   
...             ...       ...                  ...                    ...   
2075254  26/11/2010  20:58:00                0.946                  0.000   
2075255  26/11/2010  20:59:00                0.944                  0.000   
2075256  26/11/2010  21:00:00                0.938                  0.000   
2075257  26/11/2010  21:01:00                0.934                  0.000   
2075258  26/11/2010  21:02:00                0.932                  0.000   

         Voltage Global_intensity Sub_metering_1 Sub_metering_2  \
0       