## Building a simple ETL pipeline ( CSV into Postgre Database )

<!-- Step 1: Extract the data from a CSV file and load into a Pandas dataframe
Step 2: Transform the data ( Remove duplicates, missing data, run calculations etc)
Step 3: Create a database
Step 4: Load the transformed data into the database -->

In [3]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install psycopg2

Collecting psycopg2
  Downloading psycopg2-2.9.10-cp313-cp313-win_amd64.whl.metadata (4.8 kB)
Downloading psycopg2-2.9.10-cp313-cp313-win_amd64.whl (2.6 MB)
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---- ----------------------------------- 0.3/2.6 MB ? eta -:--:--
   ---- ----------------------------------- 0.3/2.6 MB ? eta -:--:--
   -------- ------------------------------- 0.5/2.6 MB 788.2 kB/s eta 0:00:03
   ------------ --------------------------- 0.8/2.6 MB 843.2 kB/s eta 0:00:03
   ------------ --------------------------- 0.8/2.6 MB 843.2 kB/s eta 0:00:03
   ---------------- ----------------------- 1.0/2.6 MB 845.7 kB/s eta 0:00:02
   -------------------- ------------------- 1.3/2.6 MB 889.6 kB/s eta 0:00:02
   -------------------- ------------------- 1.3/2.6 MB 889.6 kB/s eta 0:00:02
   ------------------------ --

In [6]:
pip install sqlalchemy

Collecting sqlalchemy
  Downloading sqlalchemy-2.0.41-cp313-cp313-win_amd64.whl.metadata (9.8 kB)
Collecting greenlet>=1 (from sqlalchemy)
  Downloading greenlet-3.2.3-cp313-cp313-win_amd64.whl.metadata (4.2 kB)
Downloading sqlalchemy-2.0.41-cp313-cp313-win_amd64.whl (2.1 MB)
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   ---- ----------------------------------- 0.3/2.1 MB ? eta -:--:--
   ---- ----------------------------------- 0.3/2.1 MB ? eta -:--:--
   --------- ------------------------------ 0.5/2.1 MB 703.8 kB/s eta 0:00:03
   -------------- ------------------------- 0.8/2.1 MB 771.5 kB/s eta 0:00:02
   ------------------- -------------------- 1.0/2.1 MB 813.3 kB/s eta 0:00:02
   ------------------- -------------------- 1.0/2.1 MB 813.3 kB/s eta 0:00:02
   ------------------------ --------------- 1.3/2.1 MB 796.0 kB/s 

In [1]:
#import libraries
import pandas as pd
import psycopg2 #for connecting python to postgresql
from sqlalchemy import create_engine #to efficiently manage and reuse database connection

In [9]:
df=pd.read_csv(r"C:\Users\achar\Desktop\Utilities\DataEngineering\ETL_Project\MOCK_DATA.csv")

In [10]:
df.head()

Unnamed: 0,id,first_name,last_name,email,gender,ip_address
0,1,Ede,Keneleyside,ekeneleyside0@ustream.tv,Female,160.207.123.12
1,2,Shelia,Swanson,sswanson1@engadget.com,Bigender,131.29.109.71
2,3,Porty,Jochanany,pjochanany2@shutterfly.com,Male,16.236.109.154
3,4,Ellis,Ovill,eovill3@shareasale.com,Male,199.86.130.56
4,5,Gerrie,Malecky,gmalecky4@dell.com,Female,82.184.43.12


In [11]:
df.tail()

Unnamed: 0,id,first_name,last_name,email,gender,ip_address
4995,4996,Kym,Antham,kanthamrn@unicef.org,Female,138.50.208.39
4996,4997,Mitch,Glasser,mglasserro@comsenz.com,Male,4.101.187.211
4997,4998,Niko,Fairrie,nfairrierp@java.com,Male,241.78.241.116
4998,4999,Ally,Moyle,amoylerq@4shared.com,Female,96.69.154.170
4999,5000,Bartholomeo,Letham,blethamrr@oaic.gov.au,Male,213.157.56.9


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          5000 non-null   int64 
 1   first_name  5000 non-null   object
 2   last_name   5000 non-null   object
 3   email       4423 non-null   object
 4   gender      5000 non-null   object
 5   ip_address  4670 non-null   object
dtypes: int64(1), object(5)
memory usage: 234.5+ KB


In [20]:
df.isna().sum()

id              0
first_name      0
last_name       0
email         577
gender          0
ip_address    330
dtype: int64

In [24]:
#missing data in email column

df[df['email'].isna()].head()

Unnamed: 0,id,first_name,last_name,email,gender,ip_address
1004,1005,Cooper,Winspire,,Male,133.5.122.84
1008,1009,Rodolfo,Baird,,Male,121.125.252.79
1010,1011,Caren,Joncic,,Female,37.166.247.72
1019,1020,Florenza,Lawton,,Female,83.21.54.135
1028,1029,Davina,Poluzzi,,Female,238.108.161.201


In [26]:
#fill in the missing data in email column

df['email'].fillna('Unknown',inplace=True)

In [29]:
#fill in the missing data in ip_address column

df['ip_address'].fillna(0,inplace=True)

In [28]:
df.isna().sum()

id            0
first_name    0
last_name     0
email         0
gender        0
ip_address    0
dtype: int64

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          5000 non-null   int64 
 1   first_name  5000 non-null   object
 2   last_name   5000 non-null   object
 3   email       5000 non-null   object
 4   gender      5000 non-null   object
 5   ip_address  5000 non-null   object
dtypes: int64(1), object(5)
memory usage: 234.5+ KB


In [32]:
#Load the dataset into the database

#Database credentials
db_username = 'postgres'
db_password = 'postgres'
db_host = 'localhost'
db_port = 5432
db_name = 'postgres'

In [34]:
#Establish a connection using SQLAlchemy engine

connection = create_engine(f'postgresql://{db_username}:{db_password}@{db_port}/{db_name}')

In [35]:
# Load the dataset into the postgre database

df.to_sql('emp_table',connection, if_exists='replace', index=False)

connection.dispose()

OperationalError: (psycopg2.OperationalError) could not translate host name "5432" to address: Name or service not known

(Background on this error at: https://sqlalche.me/e/20/e3q8)

In [42]:
import psycopg2

try:
    conn = psycopg2.connect(
        db_username = 'postgres',
        db_password = 'postgres',
        db_host = 'localhost',
        db_port = 5432,
        db_name = 'postgres',
    )
    print("Connection successful.")
    conn.close()
except Exception as e:
    print("Connection failed:", e)


SyntaxError: invalid syntax. Perhaps you forgot a comma? (1082844597.py, line 5)