This set of exercises focuses on using Pandas, a powerful library for data manipulation and analysis in Python. You'll learn to create and manipulate DataFrames, work with real-world datasets, handle missing values, and perform various data operations. The exercises cover key Pandas functionalities including data loading, cleaning, transformation, and basic analysis.

In [30]:
import pandas as pd

import numpy as np
print(np.__version__)


2.2.2


In [31]:
# 1.1
print(1.1)
print("NumPy:")
# Create a NumPy array
data_np = np.array([
    ["Blue", [1, 2], 1.1],
    ["Red", [3, 4], 2.2],
    ["Pink", [5, 6], 3.3],
    ["Grey", [7, 8], 4.4],
    ["Black", [9, 10], 5.5]
], dtype=object)

# Create DataFrame
df_np = pd.DataFrame(
    data_np,
    columns=["color", "list", "number"],
    index=[1, 3, 5, 7, 9]
    )

print("DataFrame from NumPy array:")
print(df_np)

# Create Series
colors = pd.Series(["Blue", "Red", "Pink", "Grey", "Black"], index=[1, 3, 5, 7, 9])
lists = pd.Series([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]], index=[1, 3, 5, 7, 9])
numbers = pd.Series([1.1, 2.2, 3.3, 4.4, 5.5], index=[1, 3, 5, 7, 9])

# Create DataFrame
df_series = pd.DataFrame({"color": colors, "list": lists, "number": numbers})

print("\nDataFrame from Pandas Series:")
print(df_series, "\n")

# 1.2
print(1.2)
print("First value types:")
print(type(df_series["color"]))  
print(type(df_series["list"]))  
print(type(df_series["number"])) 

print("\nFirst value types:")
df_np["number"] = df_np["number"].astype(np.float64)
print(type(df_np.iloc[0, 0]), type(df_np.iloc[0, 1]), type(df_np.iloc[0, 2]))



1.1
NumPy:
DataFrame from NumPy array:
   color     list number
1   Blue   [1, 2]    1.1
3    Red   [3, 4]    2.2
5   Pink   [5, 6]    3.3
7   Grey   [7, 8]    4.4
9  Black  [9, 10]    5.5

DataFrame from Pandas Series:
   color     list  number
1   Blue   [1, 2]     1.1
3    Red   [3, 4]     2.2
5   Pink   [5, 6]     3.3
7   Grey   [7, 8]     4.4
9  Black  [9, 10]     5.5 

1.2
First value types:
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>

First value types:
<class 'str'> <class 'list'> <class 'numpy.float64'>


In [32]:

# Exercise 2: Electric power consumption

url = "https://assets.01-edu.org/ai-branch/piscine-ai/household_power_consumption.txt"
# Load the dataset
df = pd.read_csv(url, delimiter=";", low_memory=False)
df["Global_active_power"] = pd.to_numeric(df["Global_active_power"], errors="coerce")
df["Global_reactive_power"] = pd.to_numeric(df["Global_reactive_power"], errors="coerce")
df["Voltage"] = pd.to_numeric(df["Voltage"], errors="coerce")




# Delete the columns Time, Sub_metering_2 and Sub_metering_3
print(2.1)
df.drop(columns=["Time", "Sub_metering_2", "Sub_metering_3"], inplace=True)
print("The axis parameter defaults to axis=1 for columns, so there's no need to explicitly specify it.")

# Set Date as index
print("\n2.2")
# Convert 'Date' to datetime
df["Date"] = pd.to_datetime(df["Date"], format="%d/%m/%Y")
# Set 'Date' as the index
df.set_index("Date", inplace=True)
print(df.head().index)



2.1
The axis parameter defaults to axis=1 for columns, so there's no need to explicitly specify it.

2.2
DatetimeIndex(['2006-12-16', '2006-12-16', '2006-12-16', '2006-12-16',
               '2006-12-16'],
              dtype='datetime64[ns]', name='Date', freq=None)


In [33]:
# Create a function that takes as input the DataFrame with the data set and returns a DataFrame with updated types:
print(2.3)

def update_types(df):
    # Convert specific columns to their correct types
    df["Global_active_power"] = pd.to_numeric(df["Global_active_power"], errors="coerce")
    df["Global_reactive_power"] = pd.to_numeric(df["Global_reactive_power"], errors="coerce")
    df["Global_intensity"] = pd.to_numeric(df["Global_intensity"], errors="coerce")
    df["Sub_metering_1"] = pd.to_numeric(df["Sub_metering_1"], errors="coerce")
    
    return df

df = update_types(df)
print(df.dtypes)

2.3
Global_active_power      float64
Global_reactive_power    float64
Voltage                  float64
Global_intensity         float64
Sub_metering_1           float64
dtype: object


In [34]:
# Use describe to have an overview on the data set
print(2.4)

print(df.describe())


2.4
       Global_active_power  Global_reactive_power       Voltage  \
count         2.049280e+06           2.049280e+06  2.049280e+06   
mean          1.091615e+00           1.237145e-01  2.408399e+02   
std           1.057294e+00           1.127220e-01  3.239987e+00   
min           7.600000e-02           0.000000e+00  2.232000e+02   
25%           3.080000e-01           4.800000e-02  2.389900e+02   
50%           6.020000e-01           1.000000e-01  2.410100e+02   
75%           1.528000e+00           1.940000e-01  2.428900e+02   
max           1.112200e+01           1.390000e+00  2.541500e+02   

       Global_intensity  Sub_metering_1  
count      2.049280e+06    2.049280e+06  
mean       4.627759e+00    1.121923e+00  
std        4.444396e+00    6.153031e+00  
min        2.000000e-01    0.000000e+00  
25%        1.400000e+00    0.000000e+00  
50%        2.600000e+00    0.000000e+00  
75%        6.400000e+00    0.000000e+00  
max        4.840000e+01    8.800000e+01  


In [35]:
# Delete the rows with missing values
print(2.5)
print("number of missing values before dropna is:", df.isna().sum(), "\n")
df.dropna(inplace=True)
print("number of missing values after dropna is:", df.isna().sum())

2.5
number of missing values before dropna is: Global_active_power      25979
Global_reactive_power    25979
Voltage                  25979
Global_intensity         25979
Sub_metering_1           25979
dtype: int64 

number of missing values after dropna is: Global_active_power      0
Global_reactive_power    0
Voltage                  0
Global_intensity         0
Sub_metering_1           0
dtype: int64


In [36]:
# Modify Sub_metering_1 by adding 1 to it and multiplying the total by 0.06. If x is a row the output is: (x+1)*0.06
print(2.6)
df.loc[:,'Sub_metering_1'] = (df['Sub_metering_1'] + 1) * 0.06

print(df.loc[:,'Sub_metering_1'])

2.6
Date
2006-12-16    0.06
2006-12-16    0.06
2006-12-16    0.06
2006-12-16    0.06
2006-12-16    0.06
              ... 
2010-11-26    0.06
2010-11-26    0.06
2010-11-26    0.06
2010-11-26    0.06
2010-11-26    0.06
Name: Sub_metering_1, Length: 2049280, dtype: float64


In [48]:
# Select all the rows for which the Date is greater or equal than 2008-12-27 and Voltage is greater or equal than 242
print(2.7)
# Ensure index is in datetime format
df.index = pd.to_datetime(df.index, format="%d/%m/%Y")

# Filter rows where Date >= '2008-12-27' and Voltage >= 242
filtered_df = df.loc[(df.index >= "2008-12-27") & (df["Voltage"] >= 242)]

print(filtered_df.head().to_markdown())
print("\nNumber of rows in filtered_df is: ", len(filtered_df))


2.7
| Date                |   Global_active_power |   Global_reactive_power |   Voltage |   Global_intensity |   Sub_metering_1 |
|:--------------------|----------------------:|------------------------:|----------:|-------------------:|-----------------:|
| 2008-12-27 00:00:00 |                 0.996 |                   0.066 |    244.81 |                4   |             0.06 |
| 2008-12-27 00:00:00 |                 1.076 |                   0.162 |    244.78 |                4.4 |             0.06 |
| 2008-12-27 00:00:00 |                 1.064 |                   0.172 |    244.74 |                4.4 |             0.06 |
| 2008-12-27 00:00:00 |                 1.07  |                   0.174 |    245.28 |                4.4 |             0.06 |
| 2008-12-27 00:00:00 |                 0.804 |                   0.184 |    246.3  |                3.4 |             0.06 |

umber of rows in filtered_df is:  449667


In [50]:
# Print the 88888th row.
print(2.8)

print(df.iloc[88887])

2.8
Global_active_power        0.254
Global_reactive_power      0.000
Voltage                  238.100
Global_intensity           1.200
Sub_metering_1             0.060
Name: 2007-02-16 00:00:00, dtype: float64


In [57]:
# What is the date for which the Global_active_power is maximal ?
print(2.9)

# Find the date with the maximal value for 'Global_active_power'
max_global_active_power_row = df.loc[df["Global_active_power"].idxmax()]

# Extract the date corresponding to the maximal global active power
max_global_active_power_date = max_global_active_power_row.index
print(max_global_active_power_date[0])


2.9
2009-02-22 00:00:00


In [67]:
# Sort the first three columns by descending order of Global_active_power and ascending order of Voltage.
print(2.10)

first_three_columns = df.iloc[:, :3]
# Sort the first three columns by descending order of 'Global_active_power' and ascending order of 'Voltage'
sorted_df = first_three_columns.sort_values(by=["Global_active_power", "Voltage"], ascending=[False, True])

print(sorted_df.tail().to_markdown())



2.1
| Date                |   Global_active_power |   Global_reactive_power |   Voltage |
|:--------------------|----------------------:|------------------------:|----------:|
| 2008-08-28 00:00:00 |                 0.076 |                       0 |    234.88 |
| 2008-08-28 00:00:00 |                 0.076 |                       0 |    235.18 |
| 2008-08-28 00:00:00 |                 0.076 |                       0 |    235.4  |
| 2008-08-28 00:00:00 |                 0.076 |                       0 |    235.64 |
| 2008-08-12 00:00:00 |                 0.076 |                       0 |    236.5  |


In [68]:
# Compute the daily average of Global_active_power.
print(2.11)
# Compute the daily average of 'Global_active_power'
daily_avg = df['Global_active_power'].resample('D').mean()

# Print the result
print(daily_avg)


2.11
Date
2006-12-16    3.053475
2006-12-17    2.354486
2006-12-18    1.530435
2006-12-19    1.157079
2006-12-20    1.545658
                ...   
2010-11-22    1.417733
2010-11-23    1.095511
2010-11-24    1.247394
2010-11-25    0.993864
2010-11-26    1.178230
Freq: D, Name: Global_active_power, Length: 1442, dtype: float64


In [100]:
# Exercise 3: E-commerce purchases

import requests
import csv
from io import StringIO

url = "https://learn.01founders.co/git/root/public/src/branch/master/subjects/ai/pandas/data/Ecommerce_purchases.txt"

# Fetch the content using requests
response = requests.get(url)
response.raise_for_status()  # Raise an error if the request fails

print(response.text[:500])  # Print first 500 characters to inspect the format


# Use StringIO to treat the content as a file-like object for pandas to read
# Specify quoting=csv.QUOTE_MINIMAL to handle multiline fields and ensure proper quoting
# Read the response into a pandas dataframe
ecommerce_df = pd.read_csv(StringIO(response.text), sep="\n")


# Check the number of rows
print("Number of rows (if this number isnt 10000 then youve failed):")
print(len(ecommerce_df))

print("Number of columns (if this number isnt 14 then youve failed):")
num_columns = ecommerce_df.shape[1]
print(f"Number of columns: {num_columns}")

<!DOCTYPE html>
<html lang="en-US" class="theme-">
<head>
	<meta charset="utf-8">
	<meta name="viewport" content="width=device-width, initial-scale=1">
	<title>file.title%!(EXTRA string=public/Ecommerce_purchases.txt, string=master) -  public - Gitea</title>
	<link rel="manifest" href="data:application/json;base64,eyJuYW1lIjoiR2l0ZWEiLCJzaG9ydF9uYW1lIjoiR2l0ZWEiLCJzdGFydF91cmwiOiJodHRwczovL2xlYXJuLjAxZm91bmRlcnMuY28vZ2l0LyIsImljb25zIjpbeyJzcmMiOiJodHRwczovL2xlYXJuLjAxZm91bmRlcnMuY28vZ2l0L2Fzc2V0


ParserError: Error tokenizing data. C error: Expected 1 fields in line 5, saw 2
