In [31]:
import pandas as pd
import zipfile
import os
import shutil
from io import StringIO
import requests
import json
import time
import glob
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient

In [32]:
# URL of the zip file
zip_url = "https://www.fns.usda.gov/sites/default/files/resource-files/historical-snap-retailer-locator-data-2023.12.31.zip"

# Download the zip file
response = requests.get(zip_url, timeout=30)

# Read the zip file contents
dfs = []  # List to store DataFrames from individual CSV files

with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
    # Iterate through each file in the zip archive
    for filename in zf.namelist():
        if filename.lower().endswith('.csv'):
            # Read the CSV file into a Pandas DataFrame
            with zf.open(filename) as f:
                df = pd.read_csv(f)
                dfs.append(df)

# Combine all DataFrames into a single DataFrame
raw_df = pd.concat(dfs, ignore_index=True)

# # Save the combined DataFrame to a new CSV file
# raw_df.to_csv('1_raw_snapdata.csv', index=False, mode='w')
# print(combined_df.head())
# print(combined_df.info())

In [33]:
# # Load the combined DataFrame from the CSV file
# raw_df = pd.read_csv('1_raw_snapdata.csv')

# Columns where leading/trailing spaces should be removed
columns_to_strip = ['Store Name', 'Store Type', 'Street Number', 'Street Name', 
                    'Additional Address', 'City', 'State', 'Zip4', 'County']

# Apply strip() to remove leading and trailing spaces in specified columns
raw_df[columns_to_strip] = raw_df[columns_to_strip].apply(lambda x: x.str.strip())

# Filter rows based on 'Street Number' column
raw_df['Street Number'] = raw_df['Street Number'].str.replace(r'\D+', '', regex=True)  # Keep only digits

# Convert 'Street Number' column to numeric type, errors='coerce' will convert non-numeric values to NaN
raw_df['Street Number'] = pd.to_numeric(raw_df['Street Number'], errors='coerce')

# Drop rows where 'Street Number' is not a valid integer
raw_df = raw_df.dropna(subset=['Street Number'], axis=0)

# Define the list of columns to check for null values
required_columns = ['Street Number', 'Street Name', 'City', 'State', 'Zip Code']

# Drop rows where the required columns contain null values
raw_df = raw_df.dropna(subset=required_columns, thresh=5)

# # Save the cleaned DataFrame to a new CSV file, replacing if already exists
# raw_df.to_csv("2_raw_clean_null.csv", index=False, mode='w')
# print(raw_df.info())

In [34]:
# # Load your DataFrame from the "raw_cleaned.csv" file
# raw_df = pd.read_csv("2_raw_clean_null.csv")

# Convert column to datetime format
raw_df["Authorization Date"] = pd.to_datetime(raw_df["Authorization Date"], errors="coerce")
raw_df["End Date"] = pd.to_datetime(raw_df["End Date"], errors="coerce")

# Extract year from "Authorization Date" and assign it to "Authorization Year" column
raw_df["Authorization Year"] = raw_df["Authorization Date"].dt.year

# Create the "Address" column by concatenating address components
raw_df["Address"] = (
    raw_df["Street Number"].astype(str)
    + " "
    + raw_df["Street Name"]
    + " "
    + raw_df["Additional Address"].fillna("")
    + ", "
    + raw_df["City"]
    + ", "
    + raw_df["State"]
    + " "
    + raw_df["Zip Code"].astype(str)
)
# # Save the updated DataFrame back to CSV
# raw_df.to_csv("3_raw_updated.csv", index=False, mode='w')
# print(raw_df.info())

In [35]:
# Specify the path to your JSON configuration file
config_file_path = 'config.json'

with open(config_file_path, 'r') as config_file:
    config = json.load(config_file)

# Azure connection to container
CONNECTION_STRING_AZURE_STORAGE = config["connection_string"]
CONTAINER_AZURE = config["container_name"]

# Save DataFrame to CSV file and upload to Azure Blob Storage
csv_data = raw_df.to_csv(index=False)  # Convert DataFrame to CSV data (string)

blob_service_client = BlobServiceClient.from_connection_string(CONNECTION_STRING_AZURE_STORAGE)
container_client = blob_service_client.get_container_client(CONTAINER_AZURE)

blob_name = 'snap_retailer_data.csv'

# Upload CSV data directly to Azure Blob Storage
upload_azure = blob_service_client.get_blob_client(container=CONTAINER_AZURE, blob=blob_name)
upload_azure.upload_blob(csv_data, overwrite=True)  # Upload CSV data directly

# List all blobs in the specified container
blob_list = container_client.list_blobs()
for blob in blob_list:
    print(blob.name)
    blob_client = container_client.get_blob_client(blob=blob.name)
    blob_data = blob_client.download_blob()
    blob_content = blob_data.readall().decode('utf-8')
    df = pd.read_csv(StringIO(blob_content))
    
print(df.shape)


snap_retailer_data.csv


  df = pd.read_csv(StringIO(blob_content))


(952524, 17)


In [36]:
new_df = df.copy()
# new_df.head()

Unnamed: 0,Record ID,Store Name,Store Type,Street Number,Street Name,Additional Address,City,State,Zip Code,Zip4,County,Latitude,Longitude,Authorization Date,End Date,Authorization Year,Address
0,785740,ADAK GENERAL STORE,Convenience Store,42311.0,FINGER BAY RD,,ADAK,AK,99546,2036.0,ALEUTIANS EAST,51.84434,-176.6284,2006-05-08,2016-03-01,2006,"42311.0 FINGER BAY RD , ADAK, AK 99546"
1,512036,Alaska Commercial Company 325,Combination Grocery/Other,1.0,BAYVIEW DR,,KING COVE,AK,99612,,ALEUTIANS EAST,55.05876,-162.3134,1999-02-02,,1999,"1.0 BAYVIEW DR , KING COVE, AK 99612"
2,758776,Alaska Commercial Company 220,Combination Grocery/Other,100.0,Main St,,Sand Point,AK,99661,9800.0,ALEUTIANS EAST,55.33546,-160.4955,2005-07-05,,2005,"100.0 Main St , Sand Point, AK 99661"
3,672376,ALEUTIAN COMMERCIAL CO,Combination Grocery/Other,100.0,MAIN,,SAND POINT,AK,99661,,ALEUTIANS EAST,0.0,0.0,1973-01-26,1993-07-29,1973,"100.0 MAIN , SAND POINT, AK 99661"
4,309052,ALEUTIAN COMMERCIAL CO INC,Large Grocery Store,100.0,MAIN ST,,SAND POINT,AK,99661,,ALEUTIANS EAST,55.30257,-160.4045,1993-07-28,2005-07-11,1993,"100.0 MAIN ST , SAND POINT, AK 99661"
