# Basic Pandas Data Structure

In [108]:
import pandas as pd

In [109]:
# A Series can be built from a Python dict, a list/array, or a scalar value.
# The constructor used for this is pd.Series().

# Build a Series from a plain Python list; the index defaults to 0, 1, 2, ...
s = pd.Series(data=[1, 3, 5, 7, 9])
print(s)

0    1
1    3
2    5
3    7
4    9
dtype: int64


In [110]:
# Build a Series from a dict: the keys become the index labels
d = dict(a=1, b=3, c=5, d=7, e=9)
s = pd.Series(data=d)
s

a    1
b    3
c    5
d    7
e    9
dtype: int64

In [111]:
# pd.Series() also accepts an explicit `index` (the row labels) and a `name` for the Series.
s = pd.Series(
    data=[1, 2, 3, 4, 5],
    index=list('abcde'),
    name='example series',
)
s

a    1
b    2
c    3
d    4
e    5
Name: example series, dtype: int64

In [112]:
# A DataFrame can be built from a Python dict, a list, a 2D array, Series objects, etc.
# The constructor used for this is pd.DataFrame().

# Start from an empty DataFrame, then attach the columns one at a time.
df1 = pd.DataFrame()
print(df1)

new_columns = {
    'name': ['John', 'Paul', 'George', 'Ringo'],
    'grade': [90, 80, 70, 60],
}
for column_name, column_values in new_columns.items():
    df1[column_name] = column_values
df1

Empty DataFrame
Columns: []
Index: []


Unnamed: 0,name,grade
0,John,90
1,Paul,80
2,George,70
3,Ringo,60


In [113]:
# Build a DataFrame from a dict: each key is a column name, each list holds that column's values
dictionary = dict(
    name=['John', 'Paul', 'George', 'Ringo'],
    grade=[90, 80, 70, 60],
)

df2 = pd.DataFrame(data=dictionary)
df2

Unnamed: 0,name,grade
0,John,90
1,Paul,80
2,George,70
3,Ringo,60


In [114]:
# Build a DataFrame from a list of lists: every inner list becomes one row,
# and `columns` supplies the column names.
a, b, c = [10, 20, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]

df3 = pd.DataFrame(data=[a, b, c], columns=['a', 'b', 'c', 'd'])

df3

Unnamed: 0,a,b,c,d
0,10,20,30,40
1,50,60,70,80
2,90,100,110,120


In [115]:
# Get summary statistics of the DataFrame.
# .describe() reports count, mean, std, min, the 25%/50%/75% quantiles and max
# for each of the columns shown.
df3.describe()

Unnamed: 0,a,b,c,d
count,3.0,3.0,3.0,3.0
mean,50.0,60.0,70.0,80.0
std,40.0,40.0,40.0,40.0
min,10.0,20.0,30.0,40.0
25%,30.0,40.0,50.0,60.0
50%,50.0,60.0,70.0,80.0
75%,70.0,80.0,90.0,100.0
max,90.0,100.0,110.0,120.0


In [116]:
# DataFrame.info() prints metadata about the DataFrame: the index range, the column
# names with their non-null counts and dtypes, and the memory usage.
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   a       3 non-null      int64
 1   b       3 non-null      int64
 2   c       3 non-null      int64
 3   d       3 non-null      int64
dtypes: int64(4)
memory usage: 224.0 bytes


# Working with Rows and Columns

In [117]:
# Build a dummy dataset of random integers with the NumPy package.
import numpy as np

np.random.seed(0)  # fixed seed so the same numbers are generated on every run
n_rows, n_cols = 5, 5
data = np.random.randint(low=0, high=100, size=(n_rows, n_cols))

# Wrap the array in a DataFrame whose columns are named 'a' through 'e'
df = pd.DataFrame(data=data, columns=['a', 'b', 'c', 'd', 'e'])
df

Unnamed: 0,a,b,c,d,e
0,44,47,64,67,67
1,9,83,21,36,87
2,70,88,88,12,58
3,65,39,87,46,88
4,81,37,25,77,72


In [118]:
# Inspect the row labels (index) and then the column labels of the DataFrame
for axis_labels in (df.index, df.columns):
    print(axis_labels)

RangeIndex(start=0, stop=5, step=1)
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')


## .drop()

In [119]:
# Show the DataFrame as it is before anything is dropped
print('Original DataFrame')
df

Original DataFrame


Unnamed: 0,a,b,c,d,e
0,44,47,64,67,67
1,9,83,21,36,87
2,70,88,88,12,58
3,65,39,87,46,88
4,81,37,25,77,72


In [120]:
# Drop the row labelled 4 and the columns 'c' and 'd' in a single call.
# .drop() returns a new DataFrame; assigning it back to `df` (instead of
# inplace=True) makes the rebinding explicit and keeps the call chainable.
df = df.drop(index=[4], columns=['c', 'd'])

In [121]:
# Show the DataFrame again: row 4 and columns 'c' and 'd' are gone
print('DataFrame after dropping rows and columns')
df

DataFrame after dropping rows and columns


Unnamed: 0,a,b,e
0,44,47,67
1,9,83,87
2,70,88,58
3,65,39,88


# Indexing, Slicing, and Selecting a Subset of Data

In [135]:
# Re-seed and build a fresh 10x4 frame of random integers for the indexing examples
np.random.seed(0)
data = np.random.randint(low=0, high=100, size=(10, 4))  # 10 rows, 4 columns
df = pd.DataFrame(data=data, columns=['a', 'b', 'c', 'd'])
df

Unnamed: 0,a,b,c,d
0,44,47,64,67
1,67,9,83,21
2,36,87,70,88
3,88,12,58,65
4,39,87,46,88
5,81,37,25,77
6,72,9,20,80
7,69,79,47,64
8,82,99,88,49
9,29,19,19,14


## selecting columns df['column_name']

In [136]:
# Square brackets with a single column name return that column as a Series
df['a']

0    44
1    67
2    36
3    88
4    39
5    81
6    72
7    69
8    82
9    29
Name: a, dtype: int32

## selecting multiple columns df[['column_name1', 'column_name2']]

In [137]:
# A list of column names inside the brackets returns a DataFrame with just those columns
df[['a', 'c']]

Unnamed: 0,a,c
0,44,64
1,67,83
2,36,70
3,88,58
4,39,46
5,81,25
6,72,20
7,69,47
8,82,88
9,29,19


## selecting rows by label df.loc['row_label']

In [138]:
# .loc selects by index label. This frame uses the default integer index,
# so the label 1 refers to the second row; the row comes back as a Series.
df.loc[1]

a    67
b     9
c    83
d    21
Name: 1, dtype: int32

## selecting slice of rows

In [141]:
# .iloc slices by integer position; the stop position is excluded,
# so 1:8 returns the rows at positions 1 through 7
df.iloc[1:8]

Unnamed: 0,a,b,c,d
1,67,9,83,21
2,36,87,70,88
3,88,12,58,65
4,39,87,46,88
5,81,37,25,77
6,72,9,20,80
7,69,79,47,64


## selecting rows by boolean vector

In [146]:
# Select rows where column 'a' is greater than 50.
# df['a'] > 50 produces a boolean Series; indexing with it keeps the rows where it is True.
df[df['a'] > 50]

Unnamed: 0,a,b,c,d
1,67,9,83,21
3,88,12,58,65
5,81,37,25,77
6,72,9,20,80
7,69,79,47,64
8,82,99,88,49


In [147]:
# Select rows where column 'a' is greater than 50 and column 'b' is greater than 50.
# The two masks are combined element-wise with &; each comparison is parenthesized
# because & binds more tightly than > in Python.
df[(df['a'] > 50) & (df['b'] > 50)]

Unnamed: 0,a,b,c,d
7,69,79,47,64
8,82,99,88,49


# Splitting data

# Split the DataFrame using .iloc[] by rows

In [151]:
# Show the full DataFrame that the following cells split into parts
print('default DataFrame')
df

default DataFrame


Unnamed: 0,a,b,c,d
0,44,47,64,67
1,67,9,83,21
2,36,87,70,88
3,88,12,58,65
4,39,87,46,88
5,81,37,25,77
6,72,9,20,80
7,69,79,47,64
8,82,99,88,49
9,29,19,19,14


# Split the DataFrame using .iloc[] by rows

In [161]:
# First half of the rows; the lone : after the comma keeps every column
df1 = df.iloc[:5, :] # :5 takes the rows at positions 0 to 4 (the stop value 5 is excluded)
df1

Unnamed: 0,a,b,c,d
0,44,47,64,67
1,67,9,83,21
2,36,87,70,88
3,88,12,58,65
4,39,87,46,88


In [162]:
# Remaining rows; the lone : after the comma keeps every column
df2 = df.iloc[5:, :] # 5: takes the rows from position 5 to the end
df2

Unnamed: 0,a,b,c,d
5,81,37,25,77
6,72,9,20,80
7,69,79,47,64
8,82,99,88,49
9,29,19,19,14


# Split the DataFrame using .iloc[] by columns

In [163]:
# The lone : before the comma keeps every row
df3 = df.iloc[:, :2] # :2 takes the columns at positions 0 and 1 (the stop value 2 is excluded)
df3

Unnamed: 0,a,b
0,44,47
1,67,9
2,36,87
3,88,12
4,39,87
5,81,37
6,72,9
7,69,79
8,82,99
9,29,19


In [164]:
# The lone : before the comma keeps every row
df4 = df.iloc[:, 2:] # 2: takes the columns from position 2 to the end
df4

Unnamed: 0,c,d
0,64,67
1,83,21
2,70,88
3,58,65
4,46,88
5,25,77
6,20,80
7,47,64
8,88,49
9,19,14


## Split the dictionary data

In [195]:
# Column-oriented data: every key is a column and every list has one entry per company
tech_companies = dict(
    brand=['Apple', 'Google', 'Microsoft', 'Amazon', 'Facebook', 'Twitter', 'Alibaba', 'Tencent'],
    founded=[1976, 1998, 1975, 1994, 2004, 2006, 1999, 1998],
    ceo=['Tim Cook', 'Sundar Pichai', 'Satya Nadella', 'Jeff Bezos', 'Mark Zuckerberg', 'Jack Dorsey', 'Daniel Zhang', 'Ma Huateng'],
    industry=['Hardware', 'Search Engine', 'Software', 'E-commerce', 'Social Media', 'Social Media', 'E-commerce', 'Social Media'],
)

df = pd.DataFrame(data=tech_companies)
df

Unnamed: 0,brand,founded,ceo,industry
0,Apple,1976,Tim Cook,Hardware
1,Google,1998,Sundar Pichai,Search Engine
2,Microsoft,1975,Satya Nadella,Software
3,Amazon,1994,Jeff Bezos,E-commerce
4,Facebook,2004,Mark Zuckerberg,Social Media
5,Twitter,2006,Jack Dorsey,Social Media
6,Alibaba,1999,Daniel Zhang,E-commerce
7,Tencent,1998,Ma Huateng,Social Media


In [197]:
# Split the DataFrame using .iloc[] by rows
df1 = df.iloc[:2, :] # :2 takes the rows at positions 0 and 1 (the stop value 2 is excluded)
df1

Unnamed: 0,brand,founded,ceo,industry
0,Apple,1976,Tim Cook,Hardware
1,Google,1998,Sundar Pichai,Search Engine


In [198]:
df2 = df.iloc[2:, :] # 2: takes the rows from position 2 to the end
df2

Unnamed: 0,brand,founded,ceo,industry
2,Microsoft,1975,Satya Nadella,Software
3,Amazon,1994,Jeff Bezos,E-commerce
4,Facebook,2004,Mark Zuckerberg,Social Media
5,Twitter,2006,Jack Dorsey,Social Media
6,Alibaba,1999,Daniel Zhang,E-commerce
7,Tencent,1998,Ma Huateng,Social Media


In [199]:
# Split the DataFrame using .iloc[] by columns
df3 = df.iloc[:, :2] # :2 takes the columns at positions 0 and 1 (the stop value 2 is excluded)
df3

Unnamed: 0,brand,founded
0,Apple,1976
1,Google,1998
2,Microsoft,1975
3,Amazon,1994
4,Facebook,2004
5,Twitter,2006
6,Alibaba,1999
7,Tencent,1998


In [200]:
# The lone : before the comma keeps every row
df4 = df.iloc[:, 2:] # 2: takes the columns from position 2 to the end
df4

Unnamed: 0,ceo,industry
0,Tim Cook,Hardware
1,Sundar Pichai,Search Engine
2,Satya Nadella,Software
3,Jeff Bezos,E-commerce
4,Mark Zuckerberg,Social Media
5,Jack Dorsey,Social Media
6,Daniel Zhang,E-commerce
7,Ma Huateng,Social Media


## split DataFrame using .groupby()

In [201]:
# Group the rows by the value in the 'industry' column.
# .groups maps each industry to the index labels of the rows in that group.
grouped = df.groupby('industry')
grouped.groups

{'E-commerce': [3, 6], 'Hardware': [0], 'Search Engine': [1], 'Social Media': [4, 5, 7], 'Software': [2]}

In [202]:
# Retrieve the rows of a single group as a DataFrame
grouped.get_group('E-commerce')

Unnamed: 0,brand,founded,ceo,industry
3,Amazon,1994,Jeff Bezos,E-commerce
6,Alibaba,1999,Daniel Zhang,E-commerce


In [203]:
# Same lookup for another group key
grouped.get_group('Social Media')

Unnamed: 0,brand,founded,ceo,industry
4,Facebook,2004,Mark Zuckerberg,Social Media
5,Twitter,2006,Jack Dorsey,Social Media
7,Tencent,1998,Ma Huateng,Social Media


## Split the DataFrame using .sample() (shuffle rows)

In [204]:
# n = number of rows to return
# The rows are picked at random. No random_state is passed here; pass one
# (e.g. random_state=0) if the same sample must come back on every run.
df6 = df.sample(n=2)
df6

Unnamed: 0,brand,founded,ceo,industry
2,Microsoft,1975,Satya Nadella,Software
4,Facebook,2004,Mark Zuckerberg,Social Media


In [206]:
# frac means the fraction of rows to return (0.5 of the 8 rows gives 4 rows).
# As above, the rows are picked at random and no random_state is passed.
df7 = df.sample(frac=0.5)
df7

Unnamed: 0,brand,founded,ceo,industry
6,Alibaba,1999,Daniel Zhang,E-commerce
7,Tencent,1998,Ma Huateng,Social Media
1,Google,1998,Sundar Pichai,Search Engine
4,Facebook,2004,Mark Zuckerberg,Social Media


# Merge data

## Merge two DataFrames using .merge()
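Use .merge() to combine two DataFrames by matching rows on a shared key.
To match on a common column, pass its name with the on parameter (the example
below merges on employee_id). To match on the index instead, set both the
left_index and right_index parameters of .merge() to True.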

In [209]:
# Employee master data: one row per employee
df_employees = pd.DataFrame(
    data={
        'employee_id': [1, 2, 3, 4],
        'name': ['Alice', 'Bob', 'Charlie', 'David'],
        'department': ['HR', 'IT', 'Finance', 'Marketing'],
    }
)

# Salary data keyed by the same employee_id values
df_salaries = pd.DataFrame(
    data={
        'employee_id': [1, 2, 3, 4],
        'salary': [70000, 80000, 90000, 85000],
    }
)

# Print both inputs, each under its own title
for title, frame in (('Employee DataFrame', df_employees), ('Salaries DataFrame', df_salaries)):
    print(title)
    print(frame)

# Join the two frames on their shared 'employee_id' column
df_merged = df_employees.merge(df_salaries, on='employee_id')
df_merged

Employee DataFrame
   employee_id     name department
0            1    Alice         HR
1            2      Bob         IT
2            3  Charlie    Finance
3            4    David  Marketing
Salaries DataFrame
   employee_id  salary
0            1   70000
1            2   80000
2            3   90000
3            4   85000


Unnamed: 0,employee_id,name,department,salary
0,1,Alice,HR,70000
1,2,Bob,IT,80000
2,3,Charlie,Finance,90000
3,4,David,Marketing,85000


.concat() in pandas combines DataFrames across rows
or columns. We can concatenate two or more DataFrames either along
rows (axis=0) or along columns (axis=1).

In [216]:
# Left frame: columns A and B, rows labelled K0, K1, K2
left = pd.DataFrame(
    data={"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]},
    index=["K0", "K1", "K2"],
)

# Right frame: columns C and D, rows labelled K0, K2, K3
right = pd.DataFrame(
    data={"C": ["C0", "C2", "C3"], "D": ["D0", "D2", "D3"]},
    index=["K0", "K2", "K3"],
)

# With no axis given, the frames are stacked vertically (one below the other)
frames = [left, right]
concat = pd.concat(frames)
concat

Unnamed: 0,A,B,C,D
K0,A0,B0,,
K1,A1,B1,,
K2,A2,B2,,
K0,,,C0,D0
K2,,,C2,D2
K3,,,C3,D3


In [215]:
# axis=0 means concatenate vertically: the rows of `right` are placed below the rows of `left`.
# Columns that a frame does not have are left as missing values in its rows.
concat1 = pd.concat([left, right], axis=0)
concat1

Unnamed: 0,A,B,C,D
K0,A0,B0,,
K1,A1,B1,,
K2,A2,B2,,
K0,,,C0,D0
K2,,,C2,D2
K3,,,C3,D3


In [213]:
# axis=1 means concatenate horizontally: the frames are placed side by side and
# rows are aligned on their index labels. A label that exists in only one frame
# (K1, K3) gets missing values in the other frame's columns.
concat2 = pd.concat([left, right], axis=1)
concat2

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K1,A1,B1,,
K2,A2,B2,C2,D2
K3,,,C3,D3
