This set of exercises focuses on using Pandas, a powerful library for data manipulation and analysis in Python. You'll learn to create and manipulate DataFrames, work with real-world datasets, handle missing values, and perform various data operations. The exercises cover key Pandas functionalities including data loading, cleaning, transformation, and basic analysis.

In [54]:
import pandas as pd

import numpy as np
# Print the installed NumPy version string.
print(np.__version__)


2.2.2


In [55]:
# Exercise 1: build the same DataFrame from a NumPy array and from pandas
# Series, then inspect the column types and the element types.

# 1.1
print(1.1)
print("NumPy:")

# Row labels shared by both construction routes.
row_labels = [1, 3, 5, 7, 9]

# Create a NumPy array
# dtype=object keeps every cell as the original Python object, so the inner
# two-element lists stay list cells instead of becoming an extra dimension.
data_np = np.array([
    ["Blue", [1, 2], 1.1],
    ["Red", [3, 4], 2.2],
    ["Pink", [5, 6], 3.3],
    ["Grey", [7, 8], 4.4],
    ["Black", [9, 10], 5.5],
], dtype=object)

# Create DataFrame
df_np = pd.DataFrame(
    data_np,
    columns=["color", "list", "number"],
    index=row_labels,
)

print("DataFrame from NumPy array:")
print(df_np)

# Create Series
colors = pd.Series(["Blue", "Red", "Pink", "Grey", "Black"], index=row_labels)
lists = pd.Series([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]], index=row_labels)
numbers = pd.Series([1.1, 2.2, 3.3, 4.4, 5.5], index=row_labels)

# Create DataFrame
df_series = pd.DataFrame({"color": colors, "list": lists, "number": numbers})

print("\nDataFrame from Pandas Series:")
print(df_series, "\n")

# 1.2
print(1.2)
# Selecting a single column yields a Series, so this section reports the
# column container type; it was previously mislabelled "First value types:".
print("Column types:")
print(type(df_series["color"]))
print(type(df_series["list"]))
print(type(df_series["number"]))

print("\nFirst value types:")
# Cast the "number" column to float64 before inspecting its first element.
df_np["number"] = df_np["number"].astype(np.float64)
print(type(df_np.iloc[0, 0]), type(df_np.iloc[0, 1]), type(df_np.iloc[0, 2]))



1.1
NumPy:
DataFrame from NumPy array:
   color     list number
1   Blue   [1, 2]    1.1
3    Red   [3, 4]    2.2
5   Pink   [5, 6]    3.3
7   Grey   [7, 8]    4.4
9  Black  [9, 10]    5.5

DataFrame from Pandas Series:
   color     list  number
1   Blue   [1, 2]     1.1
3    Red   [3, 4]     2.2
5   Pink   [5, 6]     3.3
7   Grey   [7, 8]     4.4
9  Black  [9, 10]     5.5 

1.2
First value types:
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>

First value types:
<class 'str'> <class 'list'> <class 'numpy.float64'>


In [None]:

# Exercise 2: Electric power consumption

url = "https://assets.01-edu.org/ai-branch/piscine-ai/household_power_consumption.txt"
# Load the dataset
# The file is semicolon-separated. low_memory=False parses it in one pass
# rather than in chunks, which avoids mixed-type inference within a column.
df = pd.read_csv(url, delimiter=";", low_memory=False)
# errors="coerce" turns any value that cannot be parsed as a number into NaN
# instead of raising.
for column in ("Global_active_power", "Global_reactive_power", "Voltage"):
    df[column] = pd.to_numeric(df[column], errors="coerce")


# Delete the columns Time, Sub_metering_2 and Sub_metering_3
print(2.1)
df.drop(columns=["Time", "Sub_metering_2", "Sub_metering_3"], inplace=True)
# DataFrame.drop defaults to axis=0 (rows); the earlier message claimed the
# default was axis=1, which is incorrect.
print("DataFrame.drop defaults to axis=0 (rows); the columns= keyword targets columns directly, so axis does not need to be specified.")

# Set Date as index
print("\n2.2")
# Convert 'Date' to datetime (the format string is day/month/four-digit year)
df["Date"] = pd.to_datetime(df["Date"], format="%d/%m/%Y")
# Set 'Date' as the index
df.set_index("Date", inplace=True)
print(df.head().index)



2.1
The axis parameter defaults to axis=1 for columns, so there's no need to explicitly specify it.

2.2
DatetimeIndex(['2006-12-16', '2006-12-16', '2006-12-16', '2006-12-16',
               '2006-12-16'],
              dtype='datetime64[ns]', name='Date', freq=None)


In [59]:
# Exercise 2.3: create a function that takes as input the DataFrame with the
# data set and returns a DataFrame with updated types.
# The print below emits the sub-exercise number as a marker in the cell output.
print(2.3)

def update_types(df):
    """Convert the measurement columns of *df* to numeric dtype.

    Each target column is passed through ``pd.to_numeric`` with
    ``errors="coerce"``, so entries that cannot be parsed as numbers become
    NaN instead of raising.

    Args:
        df: DataFrame containing ``Global_active_power``,
            ``Global_reactive_power``, ``Global_intensity`` and
            ``Sub_metering_1``; ``Voltage`` is converted too when present.

    Returns:
        The same DataFrame object; it is modified in place and returned.

    Raises:
        KeyError: If one of the four required columns is missing.
    """
    required_columns = (
        "Global_active_power",
        "Global_reactive_power",
        "Global_intensity",
        "Sub_metering_1",
    )
    for column in required_columns:
        df[column] = pd.to_numeric(df[column], errors="coerce")

    # Voltage was previously left untouched, so the function only produced
    # all-numeric dtypes when a caller had converted that column beforehand.
    # It is optional so inputs without the column keep working.
    if "Voltage" in df.columns:
        df["Voltage"] = pd.to_numeric(df["Voltage"], errors="coerce")

    return df

# Rebind df to the DataFrame returned by update_types, then list each
# column's dtype to check the conversion.
df = update_types(df)
print(df.dtypes)

2.3
Global_active_power      float64
Global_reactive_power    float64
Voltage                  float64
Global_intensity         float64
Sub_metering_1           float64
dtype: object


In [58]:
# Exercise 2.4: summarise the data set with DataFrame.describe.
print(2.4)

summary_stats = df.describe()
print(summary_stats)


2.4
       Global_active_power  Global_reactive_power       Voltage  \
count         2.049280e+06           2.049280e+06  2.049280e+06   
mean          1.091615e+00           1.237145e-01  2.408399e+02   
std           1.057294e+00           1.127220e-01  3.239987e+00   
min           7.600000e-02           0.000000e+00  2.232000e+02   
25%           3.080000e-01           4.800000e-02  2.389900e+02   
50%           6.020000e-01           1.000000e-01  2.410100e+02   
75%           1.528000e+00           1.940000e-01  2.428900e+02   
max           1.112200e+01           1.390000e+00  2.541500e+02   

       Global_intensity  Sub_metering_1  
count      2.049280e+06    2.049280e+06  
mean       4.627759e+00    1.121923e+00  
std        4.444396e+00    6.153031e+00  
min        2.000000e-01    0.000000e+00  
25%        1.400000e+00    0.000000e+00  
50%        2.600000e+00    0.000000e+00  
75%        6.400000e+00    0.000000e+00  
max        4.840000e+01    8.800000e+01  


In [65]:
# Exercise 2.5: remove every row that contains a missing value.
print(2.5)

# Per-column NaN counts before the rows are removed.
missing_before = df.isna().sum()
print("number of missing values before dropna is:", missing_before, "\n")

# inplace=True mutates df itself rather than returning a new DataFrame.
df.dropna(inplace=True)

# Per-column NaN counts after the drop.
missing_after = df.isna().sum()
print("number of missing values after dropna is:", missing_after)

2.5
number of missing values before dropna is: Global_active_power      0
Global_reactive_power    0
Voltage                  0
Global_intensity         0
Sub_metering_1           0
dtype: int64 

number of missing values after dropna is: Global_active_power      0
Global_reactive_power    0
Voltage                  0
Global_intensity         0
Sub_metering_1           0
dtype: int64
