In [2]:
import pandas as pd
import numpy as np

In [44]:
!pip install google-cloud-storage
!pip install gcsfs
!pip install sqlalchemy pandas psycopg2
!pip install python-dotenv
!pip install kaggle


Collecting kaggle
  Downloading kaggle-1.5.16.tar.gz (83 kB)
     ---------------------------------------- 83.6/83.6 kB 2.4 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py): started
  Building wheel for kaggle (setup.py): finished with status 'done'
  Created wheel for kaggle: filename=kaggle-1.5.16-py3-none-any.whl size=110686 sha256=09cce12812b0e14374c05c83f388565c4b838e396e7b5fea85c11b1044cb5dca
  Stored in directory: c:\users\18572\appdata\local\pip\cache\wheels\d2\ed\a5\da3a0cfb13373d1ace41cafa4f2467d858c55c52473ba72799
Successfully built kaggle
Installing collected packages: kaggle
Successfully installed kaggle-1.5.16


In [31]:
# Monthly Data Ingestion

from google.cloud import storage
import os
import pandas as pd
import os
import psycopg2
from concurrent.futures import ThreadPoolExecutor
from psycopg2.extensions import register_adapter, AsIs
register_adapter(np.int64, AsIs)

# set key credentials file path
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r'C:/Disk_D/Course Work/Data Warehousing/Project-2/Keys/alien-grove-405422-bb57fda72219.json'

def connect_db_bulk(df,tbname):
    #Get Credentials
    db_host = os.getenv("DB_HOST")
    db_port = os.getenv("DB_PORT")
    db_name = os.getenv("DB_NAME")
    db_user = os.getenv("DB_USER")
    db_password = os.getenv("DB_PASSWORD")

    # Use the variables in your database connection logic
    connection_params = {
        'host': db_host,
        'port': db_port,
        'database': db_name,
        'user': db_user,
        'password': db_password,
    }
    # Establish a connection to your PostgreSQL database
    connection = psycopg2.connect(**connection_params)

    # Create a cursor to execute SQL statements
    cursor = connection.cursor()
    table_name = tbname
    schema_name = 'public'
    
    insert_query = f"INSERT INTO {schema_name}.{table_name} VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"

    records = df.to_records(index=False)
    values = [tuple(record) for record in records]

    cursor.executemany(insert_query, values)

    connection.commit()
    cursor.close()
    connection.close()

def connect_db_bulk_parallel(df):
    # Split the DataFrame into chunks for parallel processing
    chunk_size = 5000  # Adjust the chunk size as needed
    chunks = [df[i:i + chunk_size] for i in range(0, len(df), chunk_size)]

    # Create a ThreadPoolExecutor with the desired number of threads
    with ThreadPoolExecutor(max_workers=4) as executor:  # Adjust max_workers based on your system capacity
        # Submit each chunk for parallel processing
        futures = [executor.submit(connect_db_bulk, chunk,'monthly') for chunk in chunks]

        # Wait for all tasks to complete
        for future in futures:
            future.result()

def list_cs_files(bucket_name,prefix,delimiter=None): 
    storage_client = storage.Client()
    data_dict = {"daily": [], "monthly": [], "weekly": []}
    file_list = storage_client.list_blobs(bucket_name,prefix=prefix,delimiter=delimiter)
    file_list = [file.name for file in file_list]
    new_list = [s.split('/')[-1] for s in file_list]
    monthly_files = [file for file in new_list if 'monthly' in file]
    for file in monthly_files:
        gcs_path = f'gs://{bucket_name}/{prefix}/{file}'  
        company = file.split('_')[0]
        data_dict["monthly"].append(pd.read_csv(gcs_path).assign(company=company))
    
    monthly_data = pd.concat(data_dict["monthly"], ignore_index=True)
    # daily_data['Date'] = pd.to_datetime(daily_data['Date'])
    # daily_data['Volumn'] = daily_data['Volume'].astype('Float64')
    column_mapping = {
    'Date': 'date',
    'Open': 'open',
    'High': 'high',
    'Low': 'low',
    'Close': 'close',
    'Adj Close': 'adj_close',
    'Volume': 'volumn',
    'Company': 'company'
    }
    # Map DataFrame columns to PostgreSQL columns
    df_monthly_mapped = monthly_data.rename(columns=column_mapping)
    connect_db_bulk_parallel(df_monthly_mapped)
    return df_monthly_mapped.head()

print(list_cs_files('dwdi-de-project','datasets'))

         date      open      high       low     close  adj_close      volumn  \
0  1997-06-01  0.075521  0.085417  0.068750  0.077083   0.077083  1063272000   
1  1997-07-01  0.077083  0.128646  0.075521  0.119792   0.119792  2172288000   
2  1997-08-01  0.117188  0.120833  0.096875  0.116927   0.116927   638136000   
3  1997-09-01  0.117188  0.240625  0.115625  0.216927   0.216927  2157240000   
4  1997-10-01  0.221875  0.275000  0.176042  0.254167   0.254167  2103744000   

  company  
0  AMAZON  
1  AMAZON  
2  AMAZON  
3  AMAZON  
4  AMAZON  


In [30]:
# Weekly Data Ingestion

from google.cloud import storage
import os
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor
from psycopg2.extensions import register_adapter, AsIs
register_adapter(np.int64, AsIs)

# set key credentials file path
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r'C:/Disk_D/Course Work/Data Warehousing/Project-2/Keys/alien-grove-405422-bb57fda72219.json'

def connect_db_bulk_parallel(df):
    # Split the DataFrame into chunks for parallel processing
    chunk_size = 5000  # Adjust the chunk size as needed
    chunks = [df[i:i + chunk_size] for i in range(0, len(df), chunk_size)]

    # Create a ThreadPoolExecutor with the desired number of threads
    with ThreadPoolExecutor(max_workers=4) as executor:  # Adjust max_workers based on your system capacity
        # Submit each chunk for parallel processing
        futures = [executor.submit(connect_db_bulk, chunk,'weekly') for chunk in chunks]

        # Wait for all tasks to complete
        for future in futures:
            future.result()

def list_cs_files(bucket_name,prefix,delimiter=None): 
    storage_client = storage.Client()
    data_dict = {"daily": [], "monthly": [], "weekly": []}
    file_list = storage_client.list_blobs(bucket_name,prefix=prefix,delimiter=delimiter)
    file_list = [file.name for file in file_list]
    new_list = [s.split('/')[-1] for s in file_list]
    weekly_files = [file for file in new_list if 'weekly' in file]
    for file in weekly_files:
        gcs_path = f'gs://{bucket_name}/{prefix}/{file}'  
        company = file.split('_')[0]
        data_dict["weekly"].append(pd.read_csv(gcs_path).assign(company=company))
    
    weekly_data = pd.concat(data_dict["weekly"], ignore_index=True)
    column_mapping = {
    'Date': 'date',
    'Open': 'open',
    'High': 'high',
    'Low': 'low',
    'Close': 'close',
    'Adj Close': 'adj_close',
    'Volume': 'volumn',
    'Company': 'company'
    }
    # Map DataFrame columns to PostgreSQL columns
    df_weekly_mapped = weekly_data.rename(columns=column_mapping)
    columns = ['open','high','low','close','adj_close']
    connect_db_bulk_parallel(df_weekly_mapped)
    return df_weekly_mapped.head()

print(list_cs_files('dwdi-de-project','datasets'))

         date      open      high       low     close  adj_close      volumn  \
0  1997-05-12  0.121875  0.125000  0.085417  0.086458   0.086458  1737120000   
1  1997-05-19  0.088021  0.088542  0.065625  0.075000   0.075000  1162824000   
2  1997-05-26  0.075521  0.082292  0.072917  0.075000   0.075000   386784000   
3  1997-06-02  0.075521  0.085417  0.068750  0.082813   0.082813   366696000   
4  1997-06-09  0.082813  0.085417  0.076563  0.079167   0.079167   226488000   

  company  
0  AMAZON  
1  AMAZON  
2  AMAZON  
3  AMAZON  
4  AMAZON  


In [34]:
# Daily Data Ingestion

from google.cloud import storage
import os
import pandas as pd
import os
import psycopg2
from concurrent.futures import ThreadPoolExecutor
from psycopg2.extensions import register_adapter, AsIs
from dotenv import load_dotenv
from urllib.parse import quote_plus

load_dotenv()
register_adapter(np.int64, AsIs)


# set key credentials file path
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r'C:/Disk_D/Course Work/Data Warehousing/Project-2/Keys/alien-grove-405422-bb57fda72219.json'

def connect_db_bulk(df,tbname):
    # Access database details
    db_host = os.getenv("DB_HOST")
    db_port = os.getenv("DB_PORT")
    db_name = os.getenv("DB_NAME")
    db_user = os.getenv("DB_USER")
    db_password = os.getenv("DB_PASSWORD")

    # Use the variables in your database connection logic
    connection_params = {
        'host': db_host,
        'port': db_port,
        'database': db_name,
        'user': db_user,
        'password': db_password,
    }

    # Establish a connection to your PostgreSQL database
    connection = psycopg2.connect(**connection_params)

    # Create a cursor to execute SQL statements
    cursor = connection.cursor()
    table_name = tbname
    schema_name = 'public'

    insert_query = f"INSERT INTO {schema_name}.{table_name} VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"

    records = df.to_records(index=False)
    values = [tuple(record) for record in records]

    cursor.executemany(insert_query, values)

    connection.commit()
    cursor.close()
    connection.close()

def connect_db_bulk_parallel(df):
    # Split the DataFrame into chunks for parallel processing
    chunk_size = 5000  # Adjust the chunk size as needed
    chunks = [df[i:i + chunk_size] for i in range(0, len(df), chunk_size)]

    # Create a ThreadPoolExecutor with the desired number of threads
    with ThreadPoolExecutor(max_workers=6) as executor:  # Adjust max_workers based on your system capacity
        # Submit each chunk for parallel processing
        futures = [executor.submit(connect_db_bulk, chunk,'daily') for chunk in chunks]

        # Wait for all tasks to complete
        for future in futures:
            future.result()

def list_cs_files(bucket_name,prefix,delimiter=None): 
    storage_client = storage.Client()
    data_dict = {"daily": [], "monthly": [], "weekly": []}
    file_list = storage_client.list_blobs(bucket_name,prefix=prefix,delimiter=delimiter)
    file_list = [file.name for file in file_list]
    new_list = [s.split('/')[-1] for s in file_list]
    daily_files = [file for file in new_list if 'daily' in file]
    for file in daily_files:
        gcs_path = f'gs://{bucket_name}/{prefix}/{file}'  
        company = file.split('_')[0]
        data_dict["daily"].append(pd.read_csv(gcs_path).assign(company=company))
    
    daily_data = pd.concat(data_dict["daily"], ignore_index=True)
    column_mapping = {
    'Date': 'date',
    'Open': 'open',
    'High': 'high',
    'Low': 'low',
    'Close': 'close',
    'Adj Close': 'adj_close',
    'Volume': 'volumn',
    'Company': 'company'
    }
    # Map DataFrame columns to PostgreSQL columns
    df_daily_mapped = daily_data.rename(columns=column_mapping)
    connect_db_bulk_parallel(df_daily_mapped)
    return df_daily_mapped.head()

print(list_cs_files('dwdi-de-project','datasets'))

         date      open      high       low     close  adj_close      volumn  \
0  1997-05-15  0.121875  0.125000  0.096354  0.097917   0.097917  1443120000   
1  1997-05-16  0.098438  0.098958  0.085417  0.086458   0.086458   294000000   
2  1997-05-19  0.088021  0.088542  0.081250  0.085417   0.085417   122136000   
3  1997-05-20  0.086458  0.087500  0.081771  0.081771   0.081771   109344000   
4  1997-05-21  0.081771  0.082292  0.068750  0.071354   0.071354   377064000   

  company  
0  AMAZON  
1  AMAZON  
2  AMAZON  
3  AMAZON  
4  AMAZON  


In [49]:
# Monthly Data Ingestion check if data is already exists in the database

from google.cloud import storage
import os
import pandas as pd
import os
import psycopg2
from concurrent.futures import ThreadPoolExecutor
from psycopg2.extensions import register_adapter, AsIs
register_adapter(np.int64, AsIs)

# set key credentials file path
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r'C:/Disk_D/Course Work/Data Warehousing/Project-2/Keys/alien-grove-405422-bb57fda72219.json'

def connect_db_bulk(df,tbname):
    #Get Credentials
    db_host = os.getenv("DB_HOST")
    db_port = os.getenv("DB_PORT")
    db_name = os.getenv("DB_NAME")
    db_user = os.getenv("DB_USER")
    db_password = os.getenv("DB_PASSWORD")

    # Use the variables in your database connection logic
    connection_params = {
        'host': db_host,
        'port': db_port,
        'database': db_name,
        'user': db_user,
        'password': db_password,
    }
    # Establish a connection to your PostgreSQL database
    connection = psycopg2.connect(**connection_params)

    # Create a cursor to execute SQL statements
    cursor = connection.cursor()
    table_name = tbname
    schema_name = 'public'
    df['date'] = pd.to_datetime(df['date'])
    max_dates_by_company = df.groupby('company')['date'].max()
    # max_dates_by_company_df = max_dates_by_company.reset_index()
    max_dates_by_company = max_dates_by_company.reset_index().sort_values(by='company')
    sql_query = "SELECT company, MAX(date) as date FROM monthly GROUP BY company ORDER BY company;"
    # Execute the query and store the result in a DataFrame
    db_result = pd.read_sql_query(sql_query, connection)
    non_existence_check = ~df[
        df[['date', 'company']].apply(
            lambda row: (row['date'], row['company']) in db_result[['date', 'company']].values.tolist(),
            axis=1
        )
    ]
    non_existence_check_df = pd.DataFrame(non_existence_check, columns=df.columns)
    if not non_existence_check_df.empty:
        updated_df = pd.DataFrame()
        updated_df = updated_df.assign(result = df.date.isin(non_existence_check_df.date))
        insert_query = f"INSERT INTO {schema_name}.{table_name} VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
        records = updated_df.to_records(index=False)
        values = [tuple(record) for record in records]
        cursor.executemany(insert_query, values)

    connection.commit()
    cursor.close()
    connection.close()

def connect_db_bulk_parallel(df):
    # Split the DataFrame into chunks for parallel processing
    chunk_size = 5000  # Adjust the chunk size as needed
    chunks = [df[i:i + chunk_size] for i in range(0, len(df), chunk_size)]

    # Create a ThreadPoolExecutor with the desired number of threads
    with ThreadPoolExecutor(max_workers=4) as executor:  # Adjust max_workers based on your system capacity
        # Submit each chunk for parallel processing
        futures = [executor.submit(connect_db_bulk, chunk,'monthly') for chunk in chunks]

        # Wait for all tasks to complete
        for future in futures:
            future.result()

def list_cs_files(bucket_name,prefix,delimiter=None): 
    storage_client = storage.Client()
    data_dict = {"daily": [], "monthly": [], "weekly": []}
    file_list = storage_client.list_blobs(bucket_name,prefix=prefix,delimiter=delimiter)
    file_list = [file.name for file in file_list]
    new_list = [s.split('/')[-1] for s in file_list]
    monthly_files = [file for file in new_list if 'monthly' in file]
    for file in monthly_files:
        gcs_path = f'gs://{bucket_name}/{prefix}/{file}'  
        company = file.split('_')[0]
        data_dict["monthly"].append(pd.read_csv(gcs_path).assign(company=company))
    
    monthly_data = pd.concat(data_dict["monthly"], ignore_index=True)
    # daily_data['Date'] = pd.to_datetime(daily_data['Date'])
    # daily_data['Volumn'] = daily_data['Volume'].astype('Float64')
    column_mapping = {
    'Date': 'date',
    'Open': 'open',
    'High': 'high',
    'Low': 'low',
    'Close': 'close',
    'Adj Close': 'adj_close',
    'Volume': 'volumn',
    'Company': 'company'
    }
    # Map DataFrame columns to PostgreSQL columns
    df_monthly_mapped = monthly_data.rename(columns=column_mapping)
    connect_db_bulk_parallel(df_monthly_mapped)
    return df_monthly_mapped.head()

print(list_cs_files('dwdi-de-project','datasets'))

  db_result = pd.read_sql_query(sql_query, connection)


         date      open      high       low     close  adj_close      volumn  \
0  1997-06-01  0.075521  0.085417  0.068750  0.077083   0.077083  1063272000   
1  1997-07-01  0.077083  0.128646  0.075521  0.119792   0.119792  2172288000   
2  1997-08-01  0.117188  0.120833  0.096875  0.116927   0.116927   638136000   
3  1997-09-01  0.117188  0.240625  0.115625  0.216927   0.216927  2157240000   
4  1997-10-01  0.221875  0.275000  0.176042  0.254167   0.254167  2103744000   

  company  
0  AMAZON  
1  AMAZON  
2  AMAZON  
3  AMAZON  
4  AMAZON  


In [50]:
# Weekly Data Ingestion check if data is already exists in the database

from google.cloud import storage
import os
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor
from psycopg2.extensions import register_adapter, AsIs
register_adapter(np.int64, AsIs)

# set key credentials file path
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r'C:/Disk_D/Course Work/Data Warehousing/Project-2/Keys/alien-grove-405422-bb57fda72219.json'

def connect_db_bulk(df,tbname):
    #Get Credentials
    db_host = os.getenv("DB_HOST")
    db_port = os.getenv("DB_PORT")
    db_name = os.getenv("DB_NAME")
    db_user = os.getenv("DB_USER")
    db_password = os.getenv("DB_PASSWORD")

    # Use the variables in your database connection logic
    connection_params = {
        'host': db_host,
        'port': db_port,
        'database': db_name,
        'user': db_user,
        'password': db_password,
    }
    # Establish a connection to your PostgreSQL database
    connection = psycopg2.connect(**connection_params)

    # Create a cursor to execute SQL statements
    cursor = connection.cursor()
    table_name = tbname
    schema_name = 'public'
    df['date'] = pd.to_datetime(df['date'])
    max_dates_by_company = df.groupby('company')['date'].max()
    # max_dates_by_company_df = max_dates_by_company.reset_index()
    max_dates_by_company = max_dates_by_company.reset_index().sort_values(by='company')
    sql_query = "SELECT company, MAX(date) as date FROM weekly GROUP BY company ORDER BY company;"
    # Execute the query and store the result in a DataFrame
    db_result = pd.read_sql_query(sql_query, connection)
    non_existence_check = ~df[
        df[['date', 'company']].apply(
            lambda row: (row['date'], row['company']) in db_result[['date', 'company']].values.tolist(),
            axis=1
        )
    ]
    non_existence_check_df = pd.DataFrame(non_existence_check, columns=df.columns)
    if not non_existence_check_df.empty:
        updated_df = pd.DataFrame()
        updated_df = updated_df.assign(result = df.date.isin(non_existence_check_df.date))
        insert_query = f"INSERT INTO {schema_name}.{table_name} VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
        records = updated_df.to_records(index=False)
        values = [tuple(record) for record in records]
        cursor.executemany(insert_query, values)

    connection.commit()
    cursor.close()
    connection.close()

def connect_db_bulk_parallel(df):
    # Split the DataFrame into chunks for parallel processing
    chunk_size = 5000  # Adjust the chunk size as needed
    chunks = [df[i:i + chunk_size] for i in range(0, len(df), chunk_size)]

    # Create a ThreadPoolExecutor with the desired number of threads
    with ThreadPoolExecutor(max_workers=4) as executor:  # Adjust max_workers based on your system capacity
        # Submit each chunk for parallel processing
        futures = [executor.submit(connect_db_bulk, chunk,'weekly') for chunk in chunks]

        # Wait for all tasks to complete
        for future in futures:
            future.result()

def list_cs_files(bucket_name,prefix,delimiter=None): 
    storage_client = storage.Client()
    data_dict = {"daily": [], "monthly": [], "weekly": []}
    file_list = storage_client.list_blobs(bucket_name,prefix=prefix,delimiter=delimiter)
    file_list = [file.name for file in file_list]
    new_list = [s.split('/')[-1] for s in file_list]
    weekly_files = [file for file in new_list if 'weekly' in file]
    for file in weekly_files:
        gcs_path = f'gs://{bucket_name}/{prefix}/{file}'  
        company = file.split('_')[0]
        data_dict["weekly"].append(pd.read_csv(gcs_path).assign(company=company))
    
    weekly_data = pd.concat(data_dict["weekly"], ignore_index=True)
    column_mapping = {
    'Date': 'date',
    'Open': 'open',
    'High': 'high',
    'Low': 'low',
    'Close': 'close',
    'Adj Close': 'adj_close',
    'Volume': 'volumn',
    'Company': 'company'
    }
    # Map DataFrame columns to PostgreSQL columns
    df_weekly_mapped = weekly_data.rename(columns=column_mapping)
    columns = ['open','high','low','close','adj_close']
    connect_db_bulk_parallel(df_weekly_mapped)
    return df_weekly_mapped.head()

print(list_cs_files('dwdi-de-project','datasets'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = pd.to_datetime(df['date'])
  db_result = pd.read_sql_query(sql_query, connection)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = pd.to_datetime(df['date'])
  db_result = pd.read_sql_query(sql_query, connection)


         date      open      high       low     close  adj_close      volumn  \
0  1997-05-12  0.121875  0.125000  0.085417  0.086458   0.086458  1737120000   
1  1997-05-19  0.088021  0.088542  0.065625  0.075000   0.075000  1162824000   
2  1997-05-26  0.075521  0.082292  0.072917  0.075000   0.075000   386784000   
3  1997-06-02  0.075521  0.085417  0.068750  0.082813   0.082813   366696000   
4  1997-06-09  0.082813  0.085417  0.076563  0.079167   0.079167   226488000   

  company  
0  AMAZON  
1  AMAZON  
2  AMAZON  
3  AMAZON  
4  AMAZON  


In [51]:
# Daily Data Ingestion if data exists in the database

from google.cloud import storage
import os
import pandas as pd
import os
import psycopg2
from concurrent.futures import ThreadPoolExecutor
from psycopg2.extensions import register_adapter, AsIs
from dotenv import load_dotenv
from urllib.parse import quote_plus

load_dotenv()
register_adapter(np.int64, AsIs)


# set key credentials file path
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r'C:/Disk_D/Course Work/Data Warehousing/Project-2/Keys/alien-grove-405422-bb57fda72219.json'

def connect_db_bulk(df,tbname):
    # Access database details
    db_host = os.getenv("DB_HOST")
    db_port = os.getenv("DB_PORT")
    db_name = os.getenv("DB_NAME")
    db_user = os.getenv("DB_USER")
    db_password = os.getenv("DB_PASSWORD")

    # Use the variables in your database connection logic
    connection_params = {
        'host': db_host,
        'port': db_port,
        'database': db_name,
        'user': db_user,
        'password': db_password,
    }

    # Establish a connection to your PostgreSQL database
    connection = psycopg2.connect(**connection_params)

    # Create a cursor to execute SQL statements
    cursor = connection.cursor()
    table_name = tbname
    schema_name = 'public'
    df['date'] = pd.to_datetime(df['date'])
    max_dates_by_company = df.groupby('company')['date'].max()
    # max_dates_by_company_df = max_dates_by_company.reset_index()
    max_dates_by_company = max_dates_by_company.reset_index().sort_values(by='company')
    sql_query = "SELECT company, MAX(date) as date FROM daily GROUP BY company ORDER BY company;"
    # Execute the query and store the result in a DataFrame
    db_result = pd.read_sql_query(sql_query, connection)
    non_existence_check = ~df[
        df[['date', 'company']].apply(
            lambda row: (row['date'], row['company']) in db_result[['date', 'company']].values.tolist(),
            axis=1
        )
    ]
    non_existence_check_df = pd.DataFrame(non_existence_check, columns=df.columns)
    if not non_existence_check_df.empty:
        updated_df = pd.DataFrame()
        updated_df = updated_df.assign(result = df.date.isin(non_existence_check_df.date))
        insert_query = f"INSERT INTO {schema_name}.{table_name} VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
        records = updated_df.to_records(index=False)
        values = [tuple(record) for record in records]
        cursor.executemany(insert_query, values)

    connection.commit()
    cursor.close()
    connection.close()

def connect_db_bulk_parallel(df):
    # Split the DataFrame into chunks for parallel processing
    chunk_size = 5000  # Adjust the chunk size as needed
    chunks = [df[i:i + chunk_size] for i in range(0, len(df), chunk_size)]

    # Create a ThreadPoolExecutor with the desired number of threads
    with ThreadPoolExecutor(max_workers=6) as executor:  # Adjust max_workers based on your system capacity
        # Submit each chunk for parallel processing
        futures = [executor.submit(connect_db_bulk, chunk,'daily') for chunk in chunks]

        # Wait for all tasks to complete
        for future in futures:
            future.result()

def list_cs_files(bucket_name,prefix,delimiter=None): 
    storage_client = storage.Client()
    data_dict = {"daily": [], "monthly": [], "weekly": []}
    file_list = storage_client.list_blobs(bucket_name,prefix=prefix,delimiter=delimiter)
    file_list = [file.name for file in file_list]
    new_list = [s.split('/')[-1] for s in file_list]
    daily_files = [file for file in new_list if 'daily' in file]
    for file in daily_files:
        gcs_path = f'gs://{bucket_name}/{prefix}/{file}'  
        company = file.split('_')[0]
        data_dict["daily"].append(pd.read_csv(gcs_path).assign(company=company))
    
    daily_data = pd.concat(data_dict["daily"], ignore_index=True)
    column_mapping = {
    'Date': 'date',
    'Open': 'open',
    'High': 'high',
    'Low': 'low',
    'Close': 'close',
    'Adj Close': 'adj_close',
    'Volume': 'volumn',
    'Company': 'company'
    }
    # Map DataFrame columns to PostgreSQL columns
    df_daily_mapped = daily_data.rename(columns=column_mapping)
    connect_db_bulk_parallel(df_daily_mapped)
    return df_daily_mapped.head()

print(list_cs_files('dwdi-de-project','datasets'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = pd.to_datetime(df['date'])
  db_result = pd.read_sql_query(sql_query, connection)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = pd.to_datetime(df['date'])
  db_result = pd.read_sql_query(sql_query, connection)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = pd

         date      open      high       low     close  adj_close      volumn  \
0  1997-05-15  0.121875  0.125000  0.096354  0.097917   0.097917  1443120000   
1  1997-05-16  0.098438  0.098958  0.085417  0.086458   0.086458   294000000   
2  1997-05-19  0.088021  0.088542  0.081250  0.085417   0.085417   122136000   
3  1997-05-20  0.086458  0.087500  0.081771  0.081771   0.081771   109344000   
4  1997-05-21  0.081771  0.082292  0.068750  0.071354   0.071354   377064000   

  company  
0  AMAZON  
1  AMAZON  
2  AMAZON  
3  AMAZON  
4  AMAZON  


In [125]:
# Monthly Data Ingestion check if data is already exists in the database
from google.cloud import storage
import os
import pandas as pd
import os
import psycopg2
from concurrent.futures import ThreadPoolExecutor
from psycopg2.extensions import register_adapter, AsIs
register_adapter(np.int64, AsIs)

# set key credentials file path
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r'C:/Disk_D/Course Work/Data Warehousing/Project-2/Keys/alien-grove-405422-bb57fda72219.json'

def connect_db_bulk(df,tbname):
    #Get Credentials
    db_host = os.getenv("DB_HOST")
    db_port = os.getenv("DB_PORT")
    db_name = os.getenv("DB_NAME")
    db_user = os.getenv("DB_USER")
    db_password = os.getenv("DB_PASSWORD")

    # Use the variables in your database connection logic
    connection_params = {
        'host': db_host,
        'port': db_port,
        'database': db_name,
        'user': db_user,
        'password': db_password,
    }
    # Establish a connection to your PostgreSQL database
    connection = psycopg2.connect(**connection_params)

    # Create a cursor to execute SQL statements
    cursor = connection.cursor()
    table_name = tbname
    schema_name = 'public'
    df['date'] = pd.to_datetime(df['date']).dt.date
    max_dates_by_company = df.groupby('company')['date'].max()
    # max_dates_by_company_df = max_dates_by_company.reset_index()
    max_dates_by_company = max_dates_by_company.reset_index().sort_values(by='company')
    # print(max_dates_by_company)
    sql_query = "SELECT company, MAX(date) as date FROM public.testmonthly GROUP BY company ORDER BY company;"
    # Execute the query and store the result in a DataFrame
    db_result = pd.read_sql_query(sql_query, connection)
    db_result['date'] = pd.to_datetime(db_result['date']).dt.date
    res_cd = pd.DataFrame()
    if len(db_result) == 0:
        # res_cd = max_dates_by_company
        insert_query = f"INSERT INTO {schema_name}.{table_name} VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
        records = df.to_records(index=False)
        values = [tuple(record) for record in records]
        cursor.executemany(insert_query, values)
    else:
        db_result['date'] = pd.to_datetime(db_result['date']).dt.date
        for i in range(len(db_result)):
            if db_result.loc[i, "date"] < max_dates_by_company.loc[i, "date"]:
                res_cd = res_cd.append(db_result.loc[i])
        # res_cd['date'] = pd.to_datetime(res_cd['date']).dt.date
        res = pd.DataFrame()
        for i in range(len(res_cd)):
            company_res = df[(df['company'] == res_cd.loc[i, 'company']) & (df['date'] > res_cd.loc[i, 'date'])]
            # print("company_res",company_res)
            res = res.append(company_res)
        
        print("result: \n",res)
        if(len(res)!=0):
            insert_query = f"INSERT INTO {schema_name}.{table_name} VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
            records = res.to_records(index=False)
            values = [tuple(record) for record in records]
            cursor.executemany(insert_query, values)

    connection.commit()
    cursor.close()
    connection.close()

def connect_db_bulk_parallel(df):
    # Split the DataFrame into chunks for parallel processing
    chunk_size = 5000  # Adjust the chunk size as needed
    chunks = [df[i:i + chunk_size] for i in range(0, len(df), chunk_size)]

    # Create a ThreadPoolExecutor with the desired number of threads
    with ThreadPoolExecutor(max_workers=4) as executor:  # Adjust max_workers based on your system capacity
        # Submit each chunk for parallel processing
        futures = [executor.submit(connect_db_bulk, chunk,'testmonthly') for chunk in chunks]

        # Wait for all tasks to complete
        for future in futures:
            future.result()

def list_cs_files(bucket_name,prefix,delimiter=None): 
    storage_client = storage.Client()
    data_dict = {"daily": [], "monthly": [], "weekly": []}
    file_list = storage_client.list_blobs(bucket_name,prefix=prefix,delimiter=delimiter)
    file_list = [file.name for file in file_list]
    new_list = [s.split('/')[-1] for s in file_list]
    monthly_files = [file for file in new_list if 'monthly' in file]
    for file in monthly_files:
        gcs_path = f'gs://{bucket_name}/{prefix}/{file}'  
        company = file.split('_')[0]
        data_dict["monthly"].append(pd.read_csv(gcs_path).assign(company=company))
    
    monthly_data = pd.concat(data_dict["monthly"], ignore_index=True)
    # daily_data['Date'] = pd.to_datetime(daily_data['Date'])
    # daily_data['Volumn'] = daily_data['Volume'].astype('Float64')
    column_mapping = {
    'Date': 'date',
    'Open': 'open',
    'High': 'high',
    'Low': 'low',
    'Close': 'close',
    'Adj Close': 'adj_close',
    'Volume': 'volumn',
    'Company': 'company'
    }
    # Map DataFrame columns to PostgreSQL columns
    df_monthly_mapped = monthly_data.rename(columns=column_mapping)
    connect_db_bulk_parallel(df_monthly_mapped)
    return df_monthly_mapped.head()

print(list_cs_files('dwdi-de-project','datasets'))

result: 
 Empty DataFrame
Columns: []
Index: []
         date      open      high       low     close  adj_close      volumn  \
0  1997-06-01  0.075521  0.085417  0.068750  0.077083   0.077083  1063272000   
1  1997-07-01  0.077083  0.128646  0.075521  0.119792   0.119792  2172288000   
2  1997-08-01  0.117188  0.120833  0.096875  0.116927   0.116927   638136000   
3  1997-09-01  0.117188  0.240625  0.115625  0.216927   0.216927  2157240000   
4  1997-10-01  0.221875  0.275000  0.176042  0.254167   0.254167  2103744000   

  company  
0  AMAZON  
1  AMAZON  
2  AMAZON  
3  AMAZON  
4  AMAZON  


  db_result = pd.read_sql_query(sql_query, connection)


In [63]:
!pip install kaggle --upgrade



In [127]:
# Download the dataset
from kaggle.api.kaggle_api_extended import KaggleApi
import shutil
# Set your Kaggle API key
api = KaggleApi()
api.authenticate()
api.dataset_download_files(dataset="nikhil1e9/netflix-stock-price", path="datasets/", unzip=True)

def connect_db_bulk(df,tbname):
    #Get Credentials
    db_host = os.getenv("DB_HOST")
    db_port = os.getenv("DB_PORT")
    db_name = os.getenv("DB_NAME")
    db_user = os.getenv("DB_USER")
    db_password = os.getenv("DB_PASSWORD")

    # Use the variables in your database connection logic
    connection_params = {
        'host': db_host,
        'port': db_port,
        'database': db_name,
        'user': db_user,
        'password': db_password,
    }
    # Establish a connection to your PostgreSQL database
    connection = psycopg2.connect(**connection_params)

    # Create a cursor to execute SQL statements
    cursor = connection.cursor()
    table_name = tbname
    schema_name = 'public'
    df['date'] = pd.to_datetime(df['date']).dt.date
    max_dates_by_company = df.groupby('company')['date'].max()
    # max_dates_by_company_df = max_dates_by_company.reset_index()
    max_dates_by_company = max_dates_by_company.reset_index().sort_values(by='company')
    # print(max_dates_by_company)
    sql_query = "SELECT company, MAX(date) as date FROM public.testmonthly GROUP BY company ORDER BY company;"
    # Execute the query and store the result in a DataFrame
    db_result = pd.read_sql_query(sql_query, connection)
    db_result['date'] = pd.to_datetime(db_result['date']).dt.date
    res_cd = pd.DataFrame()
    if len(db_result) == 0:
        # res_cd = max_dates_by_company
        insert_query = f"INSERT INTO {schema_name}.{table_name} VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
        records = df.to_records(index=False)
        values = [tuple(record) for record in records]
        cursor.executemany(insert_query, values)
    else:
        db_result['date'] = pd.to_datetime(db_result['date']).dt.date
        for i in range(len(db_result)):
            if db_result.loc[i, "date"] < max_dates_by_company.loc[i, "date"]:
                res_cd = res_cd.append(db_result.loc[i])
        # res_cd['date'] = pd.to_datetime(res_cd['date']).dt.date
        res = pd.DataFrame()
        for i in range(len(res_cd)):
            company_res = df[(df['company'] == res_cd.loc[i, 'company']) & (df['date'] > res_cd.loc[i, 'date'])]
            # print("company_res",company_res)
            res = res.append(company_res)
        
        # print("result: \n",res)
        if(len(res)!=0):
            insert_query = f"INSERT INTO {schema_name}.{table_name} VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
            records = res.to_records(index=False)
            values = [tuple(record) for record in records]
            cursor.executemany(insert_query, values)

    connection.commit()
    cursor.close()
    connection.close()

def connect_db_bulk_parallel(df):
    # Split the DataFrame into chunks for parallel processing
    chunk_size = 5000  # Adjust the chunk size as needed
    chunks = [df[i:i + chunk_size] for i in range(0, len(df), chunk_size)]

    # Create a ThreadPoolExecutor with the desired number of threads
    with ThreadPoolExecutor(max_workers=4) as executor:  # Adjust max_workers based on your system capacity
        # Submit each chunk for parallel processing
        futures = [executor.submit(connect_db_bulk, chunk,'testmonthly') for chunk in chunks]

        # Wait for all tasks to complete
        for future in futures:
            future.result()

def list_csv_files(): 
    downloaded_files = os.listdir('datasets')
    data_dict = {"daily": [], "monthly": [], "weekly": []}
    file_list = [file for file in downloaded_files]
    new_list = [s.split('/')[-1] for s in file_list]
    monthly_files = [file for file in new_list if 'monthly' in file]
    for file in monthly_files:
        gcs_path = 'datasets/'  
        company = file.split('_')[0]
        data_dict["monthly"].append(pd.read_csv(gcs_path+file).assign(company=company))
    monthly_data = pd.concat(data_dict["monthly"], ignore_index=True)

    # for file_name in file_list:
    #     file_path = os.path.join(os.listdir('datasets'), file_name)
    directory_path = 'datasets'
    shutil.rmtree(directory_path)
    
    column_mapping = {
    'Date': 'date',
    'Open': 'open',
    'High': 'high',
    'Low': 'low',
    'Close': 'close',
    'Adj Close': 'adj_close',
    'Volume': 'volumn',
    'Company': 'company'
    }
    # Map DataFrame columns to PostgreSQL columns
    df_monthly_mapped = monthly_data.rename(columns=column_mapping)
    connect_db_bulk_parallel(df_monthly_mapped)
    return df_monthly_mapped.head()

print(list_csv_files())


  db_result = pd.read_sql_query(sql_query, connection)
  res_cd = res_cd.append(db_result.loc[i])
  res_cd = res_cd.append(db_result.loc[i])
  res_cd = res_cd.append(db_result.loc[i])
  res_cd = res_cd.append(db_result.loc[i])
  res_cd = res_cd.append(db_result.loc[i])
  res = res.append(company_res)
  res = res.append(company_res)
  res = res.append(company_res)
  res = res.append(company_res)
  res = res.append(company_res)


result: 
             date        open        high         low       close   adj_close  \
316   2023-10-01  127.279999  134.479996  118.349998  133.089996  133.089996   
317   2023-11-01  133.960007  149.259995  133.710007  147.729996  147.729996   
783   2023-10-01  171.220001  182.339996  165.669998  170.770004  170.545319   
784   2023-11-01  171.000000  192.929993  170.119995  189.789993  189.540283   
1014  2023-10-01  132.154999  142.380005  121.459999  125.300003  125.300003   
1015  2023-11-01  125.339996  141.100006  124.925003  138.050003  138.050003   
1152  2023-10-01  302.739990  330.540009  279.399994  301.269989  301.269989   
1153  2023-11-01  301.850006  342.920013  301.850006  334.700012  334.700012   
1410  2023-10-01  377.480011  418.839996  344.730011  411.690002  411.690002   
1411  2023-11-01  414.769989  482.700012  414.179993  479.170013  479.170013   

          volumn  company  
316   1224564700   AMAZON  
317    876754600   AMAZON  
783   1172719600    APPLE

In [130]:
!pip install --upgrade pandas

Collecting pandas
  Downloading pandas-2.1.3-cp39-cp39-win_amd64.whl (10.8 MB)
     --------------------------------------- 10.8/10.8 MB 24.2 MB/s eta 0:00:00
Collecting tzdata>=2022.1
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
     ------------------------------------- 341.8/341.8 kB 20.7 MB/s eta 0:00:00
Installing collected packages: tzdata, pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.5.3
    Uninstalling pandas-1.5.3:
      Successfully uninstalled pandas-1.5.3


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\18572\\anaconda3\\Lib\\site-packages\\~andas\\_libs\\algos.cp39-win_amd64.pyd'
Consider using the `--user` option or check the permissions.

