### You have a Python script that processes millions of records in a single thread. How would you optimize it to leverage multiple cores and reduce the execution time? Provide a sample code snippet

In [1]:
import pandas as pd
import numpy as np
import datetime
import time
import multiprocessing as mp

In [2]:
x = datetime.datetime(1993, 12, 9, 0, 0)

simulated_dates = []

for i in range(2000000):
    x += datetime.timedelta(minutes=60)
    simulated_dates.append(x)

df = pd.DataFrame(simulated_dates, columns=['Dates'])

print(df.shape)

(2000000, 1)


In [3]:
# Function to add an hour to a given chunk of data
def add_hour(dates):
    return dates + pd.Timedelta(hours=1)

# Without multiprocessing
start_time = time.time()
df['Dates'] = df['Dates'].apply(add_hour)
end_time = time.time()

print("Without multiprocessing:", end_time - start_time, "seconds")

Without multiprocessing: 16.446693420410156 seconds


In [4]:
# With multiprocessing
def process_chunk(chunk):
    return add_hour(chunk)

start_time = time.time()
num_cores = mp.cpu_count()
chunk_size = len(df) // num_cores
chunks = [df['Dates'][i:i+chunk_size] for i in range(0, len(df), chunk_size)]

with mp.Pool(processes=num_cores) as pool:
    df['Dates'] = pd.concat(pool.map(process_chunk, chunks))
end_time = time.time()

print("With multiprocessing:", end_time - start_time, "seconds")


With multiprocessing: 0.24020624160766602 seconds


###  Write a Python script using the Azure SDK that uploads a file to an Azure Blob Storage container. Ensure the script checks if the container exists and creates it if it does not.

In [5]:
from azure.storage.blob import BlobServiceClient
from azure.core.exceptions import ResourceExistsError
from azure.core.exceptions import ResourceExistsError
from dotenv import load_dotenv
import os

load_dotenv('../.env')

# Azure Storage account details
connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
container_name = "container"
local_file_path = "/home/saim/dtu_test_case/test.txt"
blob_name = "test_blob.txt"


blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service_client.get_container_client(container_name)

try:
    container_client.create_container()
    print(f"Container '{container_name}' created successfully.")
except ResourceExistsError:
    print(f"Container '{container_name}' already exists.")

blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)

with open(local_file_path, "rb") as data:
    blob_client.upload_blob(data, overwrite=True)

print(f"File '{local_file_path}' uploaded to Azure Blob Storage as '{blob_name}' in container '{container_name}'.")


Container 'container' already exists.
File '/home/saim/dtu_test_case/test.txt' uploaded to Azure Blob Storage as 'test_blob.txt' in container 'container'.


### Write a Python script to download logs from Azure (e.g. events from a specific resource)

In [7]:
import zipfile
import subprocess

# Azure subscription and resource details
resource_group_name = "Zoomcamp"
app_name = "my-app"

# Download the logs using Azure CLI
command = f"az webapp log download --resource-group {resource_group_name} --name {app_name} --log-file azure_webapp_logs.zip"

subprocess.run(command, shell=True, check=True)

with zipfile.ZipFile('azure_webapp_logs.zip', 'r') as zip_ref:
    zip_ref.extractall('logs')

print("Logs have been downloaded and extracted.")




Logs have been downloaded and extracted.
