## In this notebook we build a small data warehouse. First we connect PostgreSQL to the Jupyter notebook for data cleaning.
## After some data cleaning, we connect the Jupyter notebook to BigQuery to build the data warehouse.

## Import libraries

In [5]:
import datetime                  # Work with dates and times
import getpass                   # Prompt for secrets without echoing them
import os                        # Read configuration from environment variables
import warnings

import pandas as pd              # Data manipulation with DataFrames
from google.cloud import bigquery  # Interact with BigQuery
from google.oauth2 import service_account  # Authenticate with Google Cloud
from sqlalchemy import create_engine, text  # DB connection and executing SQL
from sqlalchemy.engine import URL  # Build connection URLs with safe escaping

warnings.filterwarnings('ignore')  # Suppress warning messages


## Connect to PostgreSQL

In [6]:
# Connection parameters.
# Each value can be overridden with the standard libpq environment variable;
# the defaults match the local development database used by this notebook.
db_user = os.environ.get("PGUSER", "postgres")        # your PostgreSQL username
db_host = os.environ.get("PGHOST", "localhost")       # host (use '127.0.0.1' if localhost doesn't work)
db_port = os.environ.get("PGPORT", "5432")            # default PostgreSQL port
db_name = os.environ.get("PGDATABASE", "sales_db")    # your database name

# Never store the password in the notebook: take it from PGPASSWORD, or
# prompt for it (without echo) when the variable is unset or empty.
db_password = os.environ.get("PGPASSWORD") or getpass.getpass("PostgreSQL password: ")

# Build the connection URL from its components rather than an f-string so
# that special characters in the user name or password (e.g. '@', '/', ':')
# are escaped correctly.
connection_string = URL.create(
    drivername="postgresql",
    username=db_user,
    password=db_password,
    host=db_host,
    port=int(db_port),
    database=db_name,
)

# Create engine (no connection is opened until the engine is first used)
engine = create_engine(connection_string)

## Test the connection

In [7]:

# Smoke test: open a connection, ask the server for its version string and
# print it. Any failure is reported as a message instead of a traceback.
version_query = text("SELECT version();")
try:
    with engine.connect() as connection:
        for record in connection.execute(version_query):
            print("Connected to:", record[0])
except Exception as err:
    print("Connection failed:", err)


Connected to: PostgreSQL 17.2 on x86_64-windows, compiled by msvc-19.42.34435, 64-bit


In [16]:
# Wrap the raw SQL in SQLAlchemy's text() construct so it can be handed to
# the engine / pandas. This selects every column of the `items` table.
query = text("SELECT * FROM items;")
query  # shows the TextClause object only; the SQL is not executed in this cell


<sqlalchemy.sql.elements.TextClause object at 0x000001FE4F51C320>

In [17]:
# Execute the query through the SQLAlchemy engine and collect the result set
# into a pandas DataFrame
df = pd.read_sql(sql=query, con=engine)

In [18]:
# Preview the first five rows to sanity-check the columns and values
df.head()

Unnamed: 0,item_id,item_name,item_type,item_price,amount_in_stock
0,1,Shirt,Clothing,20.99,50
1,2,Pants,Clothing,29.99,40
2,3,Shoes,Shoes,49.99,30
3,4,Hat,Accessories,9.99,60
4,5,Socks,Accessories,3.99,70


## Data cleaning 

In [19]:
# Count missing values per column
df.isnull().sum()

item_id            0
item_name          0
item_type          0
item_price         0
amount_in_stock    0
dtype: int64

In [20]:
# Show the dtype pandas assigned to each column
df.dtypes

item_id              int64
item_name           object
item_type           object
item_price         float64
amount_in_stock      int64
dtype: object

In [21]:
# Count rows that are exact duplicates of an earlier row
df.duplicated().sum()

0

In [22]:
# Print a structural summary: index range, non-null count and dtype per column, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   item_id          60 non-null     int64  
 1   item_name        60 non-null     object 
 2   item_type        60 non-null     object 
 3   item_price       60 non-null     float64
 4   amount_in_stock  60 non-null     int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 2.5+ KB


In [24]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,item_id,item_price,amount_in_stock
count,60.0,60.0,60.0
mean,30.5,25.24,42.5
std,17.464249,15.494805,18.537639
min,1.0,3.99,5.0
25%,15.75,12.74,28.75
50%,30.5,22.99,45.0
75%,45.25,34.99,56.25
max,60.0,69.99,75.0


# Connect with BigQuery

In [16]:
# Location of the service-account key file. The conventional
# GOOGLE_APPLICATION_CREDENTIALS environment variable takes precedence so the
# notebook can run on another machine without editing this cell; otherwise
# fall back to the key file expected in the notebook's working directory.
key_path = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "datawarehouse8-550da3f60e12.json",
)

# Build explicit credentials from the key file and create the BigQuery client
# for the project recorded in that key.
credentials = service_account.Credentials.from_service_account_file(key_path)
client = bigquery.Client(credentials=credentials, project=credentials.project_id)

print("Connected to BigQuery project:", client.project)


Connected to BigQuery project: datawarehouse8


In [25]:
# Describe the target dataset by its fully qualified ID ("<project>.<dataset>").
dataset_ref = bigquery.Dataset(f"{client.project}.datawarehouse8")
# exists_ok=True makes this cell safe to re-run: an already existing dataset
# is returned instead of raising a conflict error.
dataset = client.create_dataset(dataset_ref, exists_ok=True)
print("✅ Dataset created or already exists:", dataset.dataset_id)


✅ Dataset created or already exists: datawarehouse8


In [26]:
dataset_id = "datawarehouse8"          # your dataset name
table_name = "sales_data"              # your table name
table_id = f"{client.project}.{dataset_id}.{table_name}"

# Load jobs append by default, so every re-run of this cell would add another
# full copy of the rows. `df` is a complete snapshot of the source table, so
# replace the destination table instead to keep the cell idempotent.
# NOTE(review): switch back to WRITE_APPEND if incremental loads are intended.
job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

# Load DataFrame to BigQuery
job = client.load_table_from_dataframe(df, table_id, job_config=job_config)
job.result()  # block until the load job completes; raises if the job failed

print("✅ Data successfully loaded to BigQuery:", table_id)


✅ Data successfully loaded to BigQuery: datawarehouse8.datawarehouse8.sales_data
