In [1]:
import numpy as np
import pandas as pd

# Series

In [2]:
# Build one pandas Series from each of Python's basic collection types.
aa = [1, 2, 3, 4]  # List
ab = (5, 6, 7, 8)  # Tuple
ac = {9, 10, 11, 12}  # Set
ad = {'first': 13, 'second': 14, 'third': 15, 'fourth': 16}  # Dictionary

# Creating pandas Series from existing variables
ser1 = pd.Series(aa)
ser2 = pd.Series(ab)
# A set has no defined order and pd.Series does not accept one directly, so it
# is turned into a list first; sorted() (rather than list()) pins the row order
# so the result is the same on every run.
ser3 = pd.Series(sorted(ac))
# For a dict the keys become the index labels and the values become the data.
ser4 = pd.Series(ad)

# Printing the Series
# sep="" stops print() from inserting its default space between the label and
# the Series; that space landed right after the "\n" and pushed the first row
# of each Series one column to the right.
print("ser1:\n", ser1, sep="")
print("\nser2:\n", ser2, sep="")
print("\nser3:\n", ser3, sep="")
print("\nser4:\n", ser4, sep="")

ser1:
 0    1
1    2
2    3
3    4
dtype: int64

ser2:
 0    5
1    6
2    7
3    8
dtype: int64

ser3:
 0     9
1    10
2    11
3    12
dtype: int64

ser4:
 first     13
second    14
third     15
fourth    16
dtype: int64


In [3]:
# Creating a Series with literal values (incorrectly)
# pd.Series expects the data as ONE collection. Extra positional arguments are
# read as index, dtype and name, so 18 is taken as the index and rejected.
# The error is the point of this cell, so it is caught and printed instead of
# being left to abort "Restart Kernel -> Run All".
try:
    ser5 = pd.Series(17, 18, 19, 20)
except TypeError as err:
    ser5_error = f"TypeError: {err}"
    print(ser5_error)

# Expected output:
# TypeError: Index(...) must be called with a collection of some kind, 18 was passed

TypeError: Index(...) must be called with a collection of some kind, 18 was passed

# Data Frames & Series

In [4]:
# Sample data: five integers held in a one-dimensional Series
ser1 = pd.Series(data=[1, 2, 3, 4, 5])

# Wrap the Series in a DataFrame; the unnamed Series becomes a single column
# labelled 0
df1 = pd.DataFrame(data=ser1)

# Bare last expression so the notebook renders the DataFrame
df1

Unnamed: 0,0
0,1
1,2
2,3
3,4
4,5


In [5]:
# Three Series of equal length, each holding a different kind of value
ser1 = pd.Series(data=[1, 2, 3, 4, 5])  # integers
ser2 = pd.Series(data=list('abcde'))  # strings
ser3 = pd.Series(data=[True, False, True, False, True])  # booleans

# Start the DataFrame from the integer Series (it becomes column 0)
df1 = pd.DataFrame(data=ser1)

# Attach the remaining Series as named columns at explicit positions
df1.insert(loc=1, column="Column 2", value=ser2)  # second column
df1.insert(loc=2, column="Column 3", value=ser3)  # third column

# Render the resulting three-column DataFrame
df1

Unnamed: 0,0,Column 2,Column 3
0,1,a,True
1,2,b,False
2,3,c,True
3,4,d,False
4,5,e,True


In [6]:
# Build a four-column DataFrame in which the first three columns have 5 rows
# of data and the last column is given only 3 values.

ser1 = pd.Series(data=[1, 2, 3, 4, 5])
ser2 = pd.Series(data=list('abcde'))
ser3 = pd.Series(data=[True, False, True, False, True])
ser4 = pd.Series(data=[10, 20, 30])  # deliberately shorter than the others

df1 = pd.DataFrame(data=ser1)

# df1.shape[1] is the current column count, so each insert appends on the right
df1.insert(df1.shape[1], "Column 2", ser2)
df1.insert(df1.shape[1], "Column 3", ser3)
df1.insert(df1.shape[1], "Column 4", ser4)  # the shorter Series

# `ser4` has no values for index labels 3 and 4, so pandas fills those two
# cells of "Column 4" with NaN; the column is shown as float (10.0, 20.0, 30.0)
# as a result.

df1  # Render the DataFrame

Unnamed: 0,0,Column 2,Column 3,Column 4
0,1,a,True,10.0
1,2,b,False,20.0
2,3,c,True,30.0
3,4,d,False,
4,5,e,True,


In [7]:
# Rebuild the same four-column DataFrame as in the previous cell
ser1 = pd.Series(data=[1, 2, 3, 4, 5])
ser2 = pd.Series(data=list('abcde'))
ser3 = pd.Series(data=[True, False, True, False, True])
ser4 = pd.Series(data=[10, 20, 30])

df1 = pd.DataFrame(data=ser1)

df1.insert(len(df1.columns), "Column 2", ser2)
df1.insert(len(df1.columns), "Column 3", ser3)
df1.insert(len(df1.columns), "Column 4", ser4)

# Assign a full row through .loc, giving each column a value of a matching type
# (string for 'Column 2', boolean for 'Column 3').
# NOTE: index label 4 already exists, so this OVERWRITES the last row rather
# than adding one - the DataFrame still has 5 rows afterwards. To append a new
# row instead, assign to a label that is not in the index yet, e.g.
# df1.loc[len(df1)].
df1.loc[4] = [5, '9', True, 0]

df1  # Render the updated DataFrame

Unnamed: 0,0,Column 2,Column 3,Column 4
0,1,a,True,10.0
1,2,b,False,20.0
2,3,c,True,30.0
3,4,d,False,
4,5,9,True,0.0


# Reading Data into a DataFrame

In [8]:
# Load the Sakila example CSV; the bare filename is resolved relative to the
# notebook's working directory
df2 = pd.read_csv(filepath_or_buffer='sakila_example.csv')

# Render the DataFrame (the notebook shows a truncated view of the rows)
df2

Unnamed: 0,title,description,category,language,actor_name
0,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,Documentary,English,PENELOPE GUINESS
1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,Documentary,English,CHRISTIAN GABLE
2,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,Documentary,English,LUCILLE TRACY
3,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,Documentary,English,SANDRA PECK
4,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,Documentary,English,JOHNNY CAGE
...,...,...,...,...,...
257,ATTRACTION NEWTON,A Astounding Panorama of a Composer And a Fris...,New,English,GARY PENN
258,ATTRACTION NEWTON,A Astounding Panorama of a Composer And a Fris...,New,English,CHRISTOPHER WEST
259,AUTUMN CROW,A Beautiful Tale of a Dentist And a Mad Cow wh...,Games,English,DUSTIN TAUTOU
260,AUTUMN CROW,A Beautiful Tale of a Dentist And a Mad Cow wh...,Games,English,ANGELA HUDSON


In [9]:
df2.head(n=5)  # First 5 rows of the DataFrame (5 is also the default)

Unnamed: 0,title,description,category,language,actor_name
0,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,Documentary,English,PENELOPE GUINESS
1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,Documentary,English,CHRISTIAN GABLE
2,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,Documentary,English,LUCILLE TRACY
3,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,Documentary,English,SANDRA PECK
4,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,Documentary,English,JOHNNY CAGE


In [10]:
df2.tail(n=10)  # Last 10 rows of the DataFrame

Unnamed: 0,title,description,category,language,actor_name
252,ATTACKS HATE,A Fast-Paced Panorama of a Technical Writer An...,Sci-Fi,English,MILLA KEITEL
253,ATTACKS HATE,A Fast-Paced Panorama of a Technical Writer An...,Sci-Fi,English,GROUCHO DUNST
254,ATTACKS HATE,A Fast-Paced Panorama of a Technical Writer An...,Sci-Fi,English,BURT TEMPLE
255,ATTRACTION NEWTON,A Astounding Panorama of a Composer And a Fris...,New,English,UMA WOOD
256,ATTRACTION NEWTON,A Astounding Panorama of a Composer And a Fris...,New,English,RIP WINSLET
257,ATTRACTION NEWTON,A Astounding Panorama of a Composer And a Fris...,New,English,GARY PENN
258,ATTRACTION NEWTON,A Astounding Panorama of a Composer And a Fris...,New,English,CHRISTOPHER WEST
259,AUTUMN CROW,A Beautiful Tale of a Dentist And a Mad Cow wh...,Games,English,DUSTIN TAUTOU
260,AUTUMN CROW,A Beautiful Tale of a Dentist And a Mad Cow wh...,Games,English,ANGELA HUDSON
261,AUTUMN CROW,A Beautiful Tale of a Dentist And a Mad Cow wh...,Games,English,JAMES PITT


In [11]:
# Prints the index range, each column's non-null count and dtype, and the
# memory usage of the DataFrame
df2.info()  # Displaying summary information about the DataFrame

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        262 non-null    object
 1   description  262 non-null    object
 2   category     262 non-null    object
 3   language     262 non-null    object
 4   actor_name   262 non-null    object
dtypes: object(5)
memory usage: 10.4+ KB


The DataFrame has 262 rows and 5 columns.
All columns have dtype `object`, which here indicates string values.
There are no missing values in the dataset.
The dataset is relatively small, using only 10.4+ KB of memory.

In [12]:
# Every column of df2 has dtype object, so the summary consists of
# count / unique / top / freq rather than numeric statistics
df2.describe()  # Displaying column statistics

Unnamed: 0,title,description,category,language,actor_name
count,262,262,262,262,262
unique,46,46,15,1,145
top,ARABIA DOGMA,A Touching Epistle of a Madman And a Mad Cow w...,Horror,English,OPRAH KILMER
freq,12,12,55,262,8


The 'category' column has 15 unique values, with 'Horror' being the most frequent (appearing 55 times).
The 'actor_name' column has 145 unique values, with 'OPRAH KILMER' appearing most frequently (8 times).

Less useful insights:

The 'title' and 'description' columns have high unique counts, making the 'top' and 'freq' less meaningful.
Because every column has dtype `object`, numerical statistics such as mean, min, and max are not applicable.

In [13]:
# Load the Northwind example CSV; the bare filename is resolved relative to
# the notebook's working directory
df3 = pd.read_csv(filepath_or_buffer='northwind_example.csv')

# Render the DataFrame (the notebook shows a truncated view of the rows)
df3

Unnamed: 0,OrderID,ProductName,UnitPrice,Quantity
0,10248,Queso Cabrales,14.0,12
1,10248,Singaporean Hokkien Fried Mee,9.8,10
2,10248,Mozzarella di Giovanni,34.8,5
3,10249,Manjimup Dried Apples,42.4,40
4,10249,Tofu,18.6,9
...,...,...,...,...
495,10435,Mozzarella di Giovanni,27.8,10
496,10436,Gnocchi di nonna Alice,30.4,40
497,10436,Wimmers gute Semmelkndel,26.6,30
498,10436,Rhnbru Klosterbier,6.2,24


In [14]:
# Only the numeric columns (OrderID, UnitPrice, Quantity) appear in this
# summary; the text column ProductName is left out
df3.describe()  # Displaying column statistics for df3

Unnamed: 0,OrderID,UnitPrice,Quantity
count,500.0,500.0,500.0
mean,10341.364,23.3714,24.33
std,54.442663,27.334436,18.069702
min,10248.0,2.0,1.0
25%,10294.0,10.4,10.0
50%,10341.0,15.8,20.0
75%,10389.0,27.8,32.0
max,10436.0,210.8,120.0


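The dataset contains 500 order line items (one row per product within an order), not 500 distinct orders: the OrderID values run from 10248 to 10436 and several rows share the same OrderID. The average quantity per line item is around 24 units.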
The unit prices vary significantly, with a minimum of 2 and a maximum of 210.8. The standard deviation for 'UnitPrice' (27.33) indicates a wide spread in prices.

Differences in summarizations compared to the Sakila example:

df3 contains numerical data ('OrderID', 'UnitPrice', 'Quantity'), allowing for numerical statistics like mean, min, max, and std.
The Sakila example contained only text data (dtype `object`), so the describe() method reported different statistics there: count, unique, top, and freq.

## Coding a DataFrame

In [15]:
# Column-oriented sample records: each key is a column name and each list
# holds that column's 8 values.
# 'Age' and 'Score' contain np.nan entries and 'Score' contains a -1 -
# presumably deliberate "dirty" values for a later cleaning exercise (confirm).
test_data = dict(
    ID=list(range(1, 9)),
    Name=['Alicia', 'Bob', 'Charlie', 'Dominic', 'Eve', 'Frank', 'Grace', 'Heidi'],
    Age=[25, np.nan, 22, 19, 31, 35, np.nan, 18],
    City=['NY', 'LA', 'Chicago', 'Houston', 'Phoenix', 'Boston', 'Austin', 'San Diego'],
    Score=[85, 92, -1, 65, 78, np.nan, 55, 90],
)

In [16]:
# Turn the test_data dictionary into a DataFrame: one column per key,
# one row per list position
df4 = pd.DataFrame(data=test_data)

# Render the DataFrame
df4

Unnamed: 0,ID,Name,Age,City,Score
0,1,Alicia,25.0,NY,85.0
1,2,Bob,,LA,92.0
2,3,Charlie,22.0,Chicago,-1.0
3,4,Dominic,19.0,Houston,65.0
4,5,Eve,31.0,Phoenix,78.0
5,6,Frank,35.0,Boston,
6,7,Grace,,Austin,55.0
7,8,Heidi,18.0,San Diego,90.0


In [18]:
# Write the DataFrame to 'test_data.csv' in the working directory;
# index=False leaves the row index out of the file
df4.to_csv(path_or_buf='test_data.csv', index=False)