## Essentials of Pandas

In [1]:
# Install package
!pip install pandas



In [2]:
# Import packages
import pandas as pd
import numpy as np

In [4]:
# Build a one-dimensional NumPy array, then wrap it in a Series
# (pandas assigns the default 0..n-1 integer index)
arr1d = np.array(
    [10, 20, 30, 40, 50]
)

num_series = pd.Series(data=arr1d)
num_series

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [5]:
# Inspect the Python type of the object: a pandas Series
type(num_series)

pandas.core.series.Series

In [7]:
# Turn a plain Python list of strings into a named Series
fruit_list = [
    "Orange",
    "Apple",
    "Banana",
    "Grapes",
    "Mango",
]

fruits = pd.Series(data=fruit_list, name="fruits")
fruits

0    Orange
1     Apple
2    Banana
3    Grapes
4     Mango
Name: fruits, dtype: object

In [8]:
# Show the Series' name attribute (set via name= when the Series was created)
fruits.name

'fruits'

In [9]:
# Create a 2D NumPy array; each inner list becomes one DataFrame row
arr2d = np.array([
    [10, 20, 30],
    [40, 50, 60],
])

# Label the three columns explicitly
num_df = pd.DataFrame(data=arr2d, columns=list("ABC"))
num_df

Unnamed: 0,A,B,C
0,10,20,30
1,40,50,60


In [10]:
# Same fruit values as before, but labelled "a".."e" instead of 0..4
fruits = pd.Series(
    data=["Orange", "Apple", "Banana", "Grapes", "Mango"],
    index=list("abcde"),
    name="fruits",
)
fruits

a    Orange
b     Apple
c    Banana
d    Grapes
e     Mango
Name: fruits, dtype: object

In [12]:
# Integer-only input: pandas infers an integer dtype for the Series
int_numbers = pd.Series(data=[10, 20, 30], name="numbers")
int_numbers

0    10
1    20
2    30
Name: numbers, dtype: int64

In [15]:
# Mixing integers with a string forces the generic object dtype
int_str = pd.Series(
    data=[10, 20, 30, "Orange"],
    name="mixed",
)
int_str

0        10
1        20
2        30
3    Orange
Name: mixed, dtype: object

In [17]:
# Rebuild the integer Series as the baseline for the explicit-dtype example
int_numbers = pd.Series(
    data=[10, 20, 30],
    name="numbers",
)
int_numbers

0    10
1    20
2    30
Name: numbers, dtype: int64

In [18]:
# Same numbers, but request an 8-bit integer dtype instead of the inferred one
# NOTE(review): name="mixed" looks carried over from the mixed-type example
# even though this Series holds only integers; confirm the intended name.
int_numbers = pd.Series(
    data=[10, 20, 30],
    dtype="int8",
    name="mixed",
)
int_numbers

0    10
1    20
2    30
Name: mixed, dtype: int8

In [19]:
# Unnamed integer Series used as the input for the dtype conversion below
num_series = pd.Series(data=[10, 20, 30, 40])
num_series

0    10
1    20
2    30
3    40
dtype: int64

In [20]:
# Convert the Series to 64-bit floats; astype returns a new Series,
# which is bound back to the same name
num_series = num_series.astype(dtype="float64")
num_series

0    10.0
1    20.0
2    30.0
3    40.0
dtype: float64

In [24]:
# Column-oriented dictionary: each key is a column name, each list its values
data = {
    "Name": ["Sander", "Pieter", "Charlie", "David"],
    "Age": [25, 32, 18, 47],
    "Salary": [50000, 80000, 20000, 120000],
}

# Build the DataFrame straight from the dictionary
df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Age,Salary
0,Sander,25,50000
1,Pieter,32,80000
2,Charlie,18,20000
3,David,47,120000


In [25]:
# Three parallel lists, one per column
names = ["Sander", "Pieter", "Charlie", "David"]
age = [25, 32, 18, 47]
salary = [50000, 80000, 20000, 120000]

# Assemble the lists into a DataFrame, naming each column
df = pd.DataFrame(dict(Names=names, Age=age, Salary=salary))
df

Unnamed: 0,Names,Age,Salary
0,Sander,25,50000
1,Pieter,32,80000
2,Charlie,18,20000
3,David,47,120000


In [27]:
# Same people as before, but with one unknown age and one unknown salary.
# None in a numeric column is stored as NaN, so both columns become floats.
names = ["Sander", "Pieter", "Charlie", "David"]
age = [25, None, 18, 47]
salary = [50000, 80000, None, 120000]

df = pd.DataFrame(dict(Names=names, Age=age, Salary=salary))
df

Unnamed: 0,Names,Age,Salary
0,Sander,25.0,50000.0
1,Pieter,,80000.0
2,Charlie,18.0,
3,David,47.0,120000.0


In [29]:
# Check for missing values: returns a same-shaped frame of booleans,
# True wherever a cell is missing
df.isna()

Unnamed: 0,Names,Age,Salary
0,False,False,False
1,False,True,False
2,False,False,True
3,False,False,False


In [30]:
# Count the missing values in each column (True counts as 1 when summed)
df.isna().sum(axis=0)

Names     0
Age       1
Salary    1
dtype: int64

In [32]:
# Drop every row that contains at least one missing value.
# axis="index" is the default (same as axis=0); df itself is not modified.
df.dropna(axis="index")

Unnamed: 0,Names,Age,Salary
0,Sander,25.0,50000.0
3,David,47.0,120000.0


In [33]:
# Drop every column that contains at least one missing value
# (axis="columns" is the same as axis=1)
df.dropna(axis="columns")

Unnamed: 0,Names
0,Sander
1,Pieter
2,Charlie
3,David


In [36]:
# Eight people and their ages, as two parallel lists
names = [
    "Alice", "Bob", "Charlie", "David",
    "John", "Mpho", "Steve", "Ben",
]
age = [25, 29, 33, 21, 57, 66, 50, 30]

# Build the DataFrame from the two lists
df = pd.DataFrame(dict(Names=names, Age=age))

# head() defaults to the first five rows
df.head(n=5)

Unnamed: 0,Names,Age
0,Alice,25
1,Bob,29
2,Charlie,33
3,David,21
4,John,57


In [37]:
# First two rows only
df.head(n=2)

Unnamed: 0,Names,Age
0,Alice,25
1,Bob,29


In [39]:
# tail() defaults to the last five rows
df.tail(n=5)

Unnamed: 0,Names,Age
3,David,21
4,John,57
5,Mpho,66
6,Steve,50
7,Ben,30


In [40]:
# Last two rows only
df.tail(n=2)

Unnamed: 0,Names,Age
6,Steve,50
7,Ben,30


In [41]:
# Summary of the DataFrame: index range, column names, non-null counts,
# dtypes and memory usage (printed, not returned)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Names   8 non-null      object
 1   Age     8 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 260.0+ bytes


In [42]:
# Descriptive statistics (count, mean, std, min, quartiles, max);
# only the numeric column "Age" is summarised here
df.describe()

Unnamed: 0,Age
count,8.0
mean,38.875
std,16.522171
min,21.0
25%,28.0
50%,31.5
75%,51.75
max,66.0


In [44]:
# Sales records; product names repeat so they can be grouped later
Products = ["Computers", "Phones", "Shoes", "Computers", "Phones"]
Sales = [2500, 3000, 1400, 2100, 2800]

# Build the DataFrame from the two lists
df = pd.DataFrame(dict(Products=Products, Sales=Sales))

# Ask for up to 10 rows; the frame only has 5, so all of them are shown
df.head(n=10)

Unnamed: 0,Products,Sales
0,Computers,2500
1,Phones,3000
2,Shoes,1400
3,Computers,2100
4,Phones,2800


In [45]:
# Total sales per product: group rows on "Products", then sum "Sales".
# SQL equivalent: SELECT Products, SUM(Sales) FROM df GROUP BY Products
groupy_products = (
    df
    .groupby(by="Products")["Sales"]
    .sum()
)
groupy_products

Products
Computers    4600
Phones       5800
Shoes        1400
Name: Sales, dtype: int64

In [46]:
# Without an aggregation, groupby only returns a lazy SeriesGroupBy object;
# nothing is computed until a method such as sum() is called on it
groupy_products = df.groupby(by="Products")["Sales"]
groupy_products

<pandas.core.groupby.generic.SeriesGroupBy object at 0x10ff5deb0>

In [48]:
# Sales per product (one row per product)
Products = ["Computers", "Phones", "Shoes"]
Sales = [2500, 3000, 1400]

# Build the sales DataFrame and display it
df_sales = pd.DataFrame(dict(Products=Products, Sales=Sales))
df_sales

Unnamed: 0,Products,Sales
0,Computers,2500
1,Phones,3000
2,Shoes,1400


In [49]:
# Costs per product (same product keys as the sales records)
Products = ["Computers", "Phones", "Shoes"]
costs = [1800, 2300, 1000]

# Build the costs DataFrame and display it
df_costs = pd.DataFrame(dict(Products=Products, Costs=costs))
df_costs

Unnamed: 0,Products,Costs
0,Computers,1800
1,Phones,2300
2,Shoes,1000


In [51]:
# Left join: keep every sales row and attach the matching cost by product
merged_df = df_sales.merge(
    right=df_costs,
    on="Products",
    how="left",
)
merged_df

Unnamed: 0,Products,Sales,Costs
0,Computers,2500,1800
1,Phones,3000,2300
2,Shoes,1400,1000


In [52]:
# Add a "Profit" column: element-wise Sales minus Costs
merged_df["Profit"] = merged_df["Sales"].sub(merged_df["Costs"])
merged_df

Unnamed: 0,Products,Sales,Costs,Profit
0,Computers,2500,1800,700
1,Phones,3000,2300,700
2,Shoes,1400,1000,400


In [54]:
# Add a "ProfitPerc" column: profit as a percentage of sales, to 2 decimals
merged_df["ProfitPerc"] = (
    merged_df["Profit"] / merged_df["Sales"] * 100
).round(2)
merged_df

Unnamed: 0,Products,Sales,Costs,Profit,ProfitPerc
0,Computers,2500,1800,700,28.0
1,Phones,3000,2300,700,23.33
2,Shoes,1400,1000,400,28.57


In [56]:
# Small people table used by the row/column selection examples
names = ["Alice", "Bob", "Charlie", "David"]
age = [25, 32, 18, 47]
salary = [50000, 80000, 20000, 120000]

df = pd.DataFrame(dict(Names=names, Age=age, Salary=salary))
df

Unnamed: 0,Names,Age,Salary
0,Alice,25,50000
1,Bob,32,80000
2,Charlie,18,20000
3,David,47,120000


In [59]:
# Label-based slice: .loc includes BOTH endpoints, so labels 1, 2 and 3
# are returned (all columns)
three_rows = df.loc[1:3, :]
three_rows

Unnamed: 0,Names,Age,Salary
1,Bob,32,80000
2,Charlie,18,20000
3,David,47,120000


In [62]:
# .loc[row_labels, column_labels]:
#   rows    -> labels 0 and 3
#   columns -> "Names" and "Salary"
alice_and_david = df.loc[
    [0, 3],
    ["Names", "Salary"],
]
alice_and_david

Unnamed: 0,Names,Salary
0,Alice,50000
3,David,120000


In [63]:
# Display the full DataFrame again for reference
df

Unnamed: 0,Names,Age,Salary
0,Alice,25,50000
1,Bob,32,80000
2,Charlie,18,20000
3,David,47,120000


In [66]:
# Boolean-mask selection: keep only the rows where Age is greater than 30
over_30 = df.loc[df["Age"] > 30, :]
over_30

Unnamed: 0,Names,Age,Salary
1,Bob,32,80000
3,David,47,120000


In [68]:
# Position-based slice with .iloc: the stop position is exclusive,
# so [:2] returns positions 0 and 1 (the first two rows)
alice_and_bob = df.iloc[:2]
alice_and_bob

Unnamed: 0,Names,Age,Salary
0,Alice,25,50000
1,Bob,32,80000


In [69]:
# Display the full DataFrame again for reference
df

Unnamed: 0,Names,Age,Salary
0,Alice,25,50000
1,Bob,32,80000
2,Charlie,18,20000
3,David,47,120000


In [None]:
# .iloc[row_positions, column_positions]:
#   rows    -> positions 0 and 1 (stop of 2 is exclusive)
#   columns -> positions 0 and 2 ("Names" and "Salary")
alice_bob_salaries = df.iloc[0:2, [0, 2]]
alice_bob_salaries

Unnamed: 0,Names,Salary
0,Alice,50000
1,Bob,80000
