# Jupyter notebook sample

In [37]:
#!/usr/bin/env python

# make sure to install these packages before running:
# pip install pandas
# pip install sodapy
# pip install python-dotenv
# pip install kagglehub

import xml.etree.ElementTree as ET
import pandas as pd
from sodapy import Socrata
from dotenv import load_dotenv
import os
import kagglehub
import ast



In [38]:
# Read the local .env file so its entries become available as environment variables
load_dotenv()

# Pull the API credentials in one pass; each value is None when its variable is unset
app_token, username, password = (
    os.environ.get(variable_name)
    for variable_name in ("APP_TOKEN", "OPEN_DATA_NYC_USERNAME", "OPEN_DATA_NYC_PASSWORD")
)

In [39]:
def fetch_restaurant_data(app_token, username, password, dataset_id="pitm-atqc", limit=1000):
    """
    Fetch restaurant data from the NYC Open Data API.

    Parameters:
        app_token (str): Your application token for the API.
        username (str): Your username for the API (email).
        password (str): Your password for the API.
        dataset_id (str): The dataset identifier in Socrata.
        limit (int): The maximum number of results to fetch (default is 1000).

    Returns:
        pd.DataFrame: A pandas DataFrame containing the restaurant data.
    """
    # Initialize the Socrata client
    client = Socrata("data.cityofnewyork.us", app_token, username=username, password=password)

    try:
        # Fetch data
        results = client.get(dataset_id, limit=limit)
    finally:
        # Always close the client, even when the request raises, so the
        # connection it holds is not left open for the life of the kernel.
        client.close()

    # Convert results to a pandas DataFrame
    return pd.DataFrame.from_records(results)

# Example usage:
# app_token = "your_app_token"
# username = "your_username"
# password = "your_password"
# df_restaurants = fetch_restaurant_data(app_token, username, password)
# print(df_restaurants.head())


In [40]:
def _spreadsheet_row_values(row, ns, index_attr):
    """Return the text of each cell in `row`, placed at its real column position."""
    values = []
    for cell in row.findall(".//ss:Cell", ns):
        explicit_index = cell.get(index_attr)
        if explicit_index is not None:
            # SpreadsheetML omits empty cells and marks the next one with a
            # 1-based ss:Index; fill the gap so later values stay aligned.
            target = int(explicit_index) - 1
            if target > len(values):
                values.extend([None] * (target - len(values)))
        data = cell.find(".//ss:Data", ns)
        values.append(data.text if data is not None else None)
    return values


def parse_worksheet(xml_file_path, sheet_name, header_row_index=0):
    """
    Parse a specified worksheet from an XML-based Excel workbook.

    Cells that are skipped in the XML (signalled by an ``ss:Index`` attribute
    on the following cell) are returned as None in their own column, and rows
    shorter than the header row are padded with None.

    Parameters:
        xml_file_path (str): Path to the XML file.
        sheet_name (str): Name of the worksheet to parse.
        header_row_index (int): The row index of the headers (default is 0).

    Returns:
        pd.DataFrame: A pandas DataFrame containing the data from the worksheet.

    Raises:
        ValueError: If the worksheet is missing or contains no rows.
        IndexError: If header_row_index is outside the worksheet's rows.
    """
    # Parse the XML file
    tree = ET.parse(xml_file_path)
    root = tree.getroot()

    # Namespace dictionary for handling XML namespaces
    ns = {'ss': 'urn:schemas-microsoft-com:office:spreadsheet'}
    name_attr = "{%s}Name" % ns['ss']
    index_attr = "{%s}Index" % ns['ss']

    # Compare the attribute value directly rather than embedding sheet_name in
    # an XPath predicate, so names containing quotes are matched correctly.
    worksheet = next(
        (ws for ws in root.findall(".//ss:Worksheet", ns) if ws.get(name_attr) == sheet_name),
        None,
    )
    if worksheet is None:
        raise ValueError(f"Worksheet '{sheet_name}' not found in the file.")

    # Find all rows in the worksheet
    rows = worksheet.findall(".//ss:Row", ns)

    if not rows:
        raise ValueError(f"Worksheet '{sheet_name}' is empty.")

    if header_row_index >= len(rows):
        raise IndexError(
            f"header_row_index {header_row_index} is out of range; "
            f"worksheet '{sheet_name}' has {len(rows)} rows."
        )

    # Extract headers from the specified header row
    headers = _spreadsheet_row_values(rows[header_row_index], ns, index_attr)
    expected_columns = len(headers)

    # Extract data from rows after the header, forcing each row to header width
    data = []
    for row in rows[header_row_index + 1:]:
        row_data = _spreadsheet_row_values(row, ns, index_attr)[:expected_columns]
        row_data.extend([None] * (expected_columns - len(row_data)))
        data.append(row_data)

    # Create a pandas DataFrame
    return pd.DataFrame(data, columns=headers)


def fetch_fliming_locations_data(xml_file_path, sheet_name="Full Map List", header_row_index=2):
    """
    Fetch a worksheet of the filming-locations workbook as a pandas DataFrame.

    With the defaults this reads the "Full Map List" worksheet and treats its
    3rd row (index 2) as the header row.

    Parameters:
        xml_file_path (str): Path to the XML file.
        sheet_name (str): Name of the worksheet to read (default is "Full Map List").
        header_row_index (int): The row index of the headers (default is 2).

    Returns:
        pd.DataFrame: A pandas DataFrame containing data from the worksheet.
    """
    # NOTE(review): the notebook's saved output for this call shows column
    # labels that look like a movie record ("*batteries not included", "1987", ...),
    # which suggests index 2 may be one row past the real header -- confirm
    # against the workbook and pass header_row_index explicitly if so.
    return parse_worksheet(xml_file_path, sheet_name, header_row_index)

In [41]:
def fetch_movies_data(kaggle_dataset, filename="25k IMDb movie Dataset.csv"):
    """
    Download the latest version of a Kaggle dataset and return the movies DataFrame.

    Parameters:
        kaggle_dataset (str): The Kaggle dataset identifier (e.g., "utsh0dey/25k-movie-dataset").
        filename (str): The name of the CSV file to load (default is "25k IMDb movie Dataset.csv").

    Returns:
        pd.DataFrame: A pandas DataFrame containing the movies data.

    Raises:
        FileNotFoundError: If `filename` is not present in the downloaded dataset folder.
    """
    # Download the latest version of the dataset
    path = kagglehub.dataset_download(kaggle_dataset)
    print("Path to dataset files:", path)

    # Build the CSV path with the platform's own separator
    csv_path = os.path.join(path, filename)

    # Fail early with the folder contents so a renamed file is easy to spot
    if not os.path.isfile(csv_path):
        available = sorted(os.listdir(path)) if os.path.isdir(path) else []
        raise FileNotFoundError(
            f"'{filename}' not found in '{path}'. Available files: {available}"
        )

    # Load the dataset into a pandas DataFrame
    return pd.read_csv(csv_path)

In [45]:
# Fetch the restaurant records with the credentials loaded above and preview them
df_restaurants = fetch_restaurant_data(
    app_token=app_token,
    username=username,
    password=password,
)
df_restaurants.head()

Unnamed: 0,objectid,globalid,seating_interest_sidewalk,restaurant_name,legal_business_name,doing_business_as_dba,bulding_number,street,borough,zip,...,community_board,council_district,census_tract,bin,bbl,nta,roadway_dimensions_length,roadway_dimensions_width,roadway_dimensions_area,landmarkdistrict_terms
0,100,c4b3155b-31a0-4e95-846f-fce09f245437,sidewalk,Pomp and Circumstance Hospitality,Pomp and Circumstance Hospitality LLC,Pomp and Circumstance Hospitality LLC,577,Lorimer Street,Brooklyn,11211,...,1,34,501,3068653.0,3027560028.0,East Williamsburg,,,,
1,1000,753495d8-4429-43e5-85a3-dcf6230ef749,both,Charm Kao,193 Schemerhorn INC,Charm Kao,193,Schermerhorn St.,Brooklyn,11201,...,2,33,37,3000493.0,3001640041.0,DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill,24.0,8.0,192.0,
2,10000,{3842B5C5-EF04-41A4-8216-D6EA627DCE5E},openstreets,SAKE BAR HAGI 46,"HAMA NEW YORK, INC.",SAKE BAR HAGI 46,358,W. 46TH STREET,Manhattan,10036,...,4,3,121,1025025.0,1010360057.0,Clinton,,,,
3,10001,{C212A0FC-C115-4425-8F95-931B12C5F86A},openstreets,Yum yum too,Boythaicorp,Boythaicorp,662,9ave,Manhattan,10036,...,4,3,127,1025038.0,1010370001.0,Clinton,,,,
4,10002,{DA48265D-7730-416F-8E1C-EBC8C8ACE2C2},openstreets,Xochil Pizza Corp,Xochil Pizza Corp,Xochil Pizza Corp,4632,5th Avenue,Brooklyn,11220,...,7,38,80,,,Sunset Park West,,,,


In [46]:
# Example usage
xml_file_path = "./datasets/Interactive_Map_Data.xml"
df_fliming_locations = fetch_fliming_locations_data(xml_file_path)
df_fliming_locations.head()

Unnamed: 0,*batteries not included,1987,%2Abatteries%20not%20included,batteriesnotincluded_pf,Courtesy of Photofest,Directed by,Matthew Robbins,http://imdb.com/name/nm0730422/,E. 5th St.<br>East Village<br>Manhattan,40.722445296182798,...,East Village,N/A,Film,http://imdb.com/title/tt0092494/,N/A.1,None,Y,190,None.1,Y.1
0,12 Angry Men,1957,12%20Angry%20Men,12AngryMen_pf,Courtesy of Photofest,Directed by,Sidney Lumet,http://imdb.com/name/nm0001486/,New York County Courthouse<br>40 Foley Square<...,40.7137,...,Lower Manhattan,,Film,http://imdb.com/title/tt0050083/,New York County\nCourthouse on Foley Square.,,Y,40,,Y
1,13 Going on 30,2004,13%20Going%20on%2030,13Goingon30_ec,"Courtesy of Everett Collection, Inc.",Directed by,Gary Winick,http://imdb.com/name/nm0935095/,W. 47th St. and Seventh Ave.<br>Times\n ...,40.759220487652094,...,Times Square,,Film,http://www.imdb.com/title/tt0337563/,47th St. and 7th Ave. Times Square\nManhattan,,Y,239,,Y
2,15 Minutes,2001,15%20Minutes,,,Directed by,John Herzfeld,http://imdb.com/name/nm0381273/,E. 60-66th St. and Madison Ave.<br>Upper East\...,40.7661,...,Upper East Side,Chases,Film,http://www.imdb.com/title/tt0179626/,60-66th and Madison,,Y,82,,N
3,25th Hour,2002,25th%20Hour,25thHour1_pf,Courtesy of Photofest,Directed by,Spike Lee,http://www.imdb.com/name/nm0000490/,World Trade Center<br>Lower Manhattan,40.7117926273691,...,Lower Manhattan,,Film,http://www.imdb.com/title/tt0307901/,World Trade Center,,Y,93,,Y
4,25th Hour,2002,25th%20Hour,25thHour2_pf,Courtesy of Photofest,Directed by,Spike Lee,http://www.imdb.com/name/nm0000490/,Carl Schurz Park<br>Upper East Side<br>Manhattan,40.775065592265,...,Upper East Side,,Film,http://www.imdb.com/title/tt0307901/,Carl Schurz Park btwn 84th and 85th and promenade,,Y,209,,Y


In [100]:
# Example usage
kaggle_dataset = "utsh0dey/25k-movie-dataset"
df = fetch_movies_data(kaggle_dataset)

Path to dataset files: C:\Users\huniv\.cache\kagglehub\datasets\utsh0dey\25k-movie-dataset\versions\1


In [122]:
# Bind a second name to the loaded movies frame. This is an alias, not a copy:
# `df_movies` and `df` refer to the same DataFrame object after this line.
df_movies = df

In [123]:
def extract_movie_id(df, path_column='path', new_column='imdb_id'):
    """
    Extract the unique movie ID from the 'path' field and add it as a new column.

    The input DataFrame is left untouched; the result is a copy with the extra
    column. Rows whose path does not contain '/title/tt<digits>/' get a missing
    value in the new column.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing the 'path' column.
        path_column (str): The name of the column containing the path (default: 'path').
        new_column (str): The name of the new column for the extracted movie ID (default: 'imdb_id').

    Returns:
        pd.DataFrame: A copy of the DataFrame with the extracted movie ID column.

    Raises:
        ValueError: If path_column is not found in the DataFrame.
    """
    # Check if the path column exists
    if path_column not in df.columns:
        raise ValueError(f"Column '{path_column}' not found in the DataFrame.")

    # expand=False returns a Series for the single capture group
    movie_ids = df[path_column].str.extract(r'/title/(tt\d+)/', expand=False)

    # Work on a copy so re-running a cell never alters the caller's frame
    result = df.copy()
    result[new_column] = movie_ids

    return result

In [124]:
def _parse_genre_value(value):
    """Turn one raw genres cell into a Python list."""
    if isinstance(value, str):
        # literal_eval only evaluates Python literals, never arbitrary code
        return ast.literal_eval(value)
    if isinstance(value, list):
        # Already parsed (e.g. the frame went through this step before)
        return value
    if pd.api.types.is_scalar(value) and pd.isna(value):
        # Missing genres become an empty list instead of crashing literal_eval
        return []
    # Any other type keeps the original failure mode
    return ast.literal_eval(value)


def prepare_genres_column(df, old_genres_column='Generes', genres_column='genres'):
    """
    Rename the genres column and convert its values from strings to Python lists.

    String cells are parsed with ast.literal_eval, cells that are already lists
    are kept as they are, and missing cells (None/NaN) become empty lists. The
    input DataFrame is not modified.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the genres column.
        old_genres_column (str): The current name of the genres column.
        genres_column (str): The new name for the genres column.

    Returns:
        pd.DataFrame: The updated DataFrame with the renamed and properly formatted genres column.

    Raises:
        ValueError: If the old_genres_column is not found in the DataFrame.
    """
    # Check if the column exists
    if old_genres_column not in df.columns:
        raise ValueError(f"Column '{old_genres_column}' not found in the DataFrame.")

    # rename returns a new frame, so the assignment below cannot touch `df`
    renamed = df.rename(columns={old_genres_column: genres_column})

    # Convert genres column from string to list
    renamed[genres_column] = renamed[genres_column].apply(_parse_genre_value)

    return renamed


def create_genres_table(df, genres_column='genres'):
    """
    Create a Genres DataFrame with unique genres and their IDs.

    Movies with an empty genre list or a missing genres value contribute no
    genres.

    Parameters:
        df (pd.DataFrame): The movies DataFrame with a genres column holding lists.
        genres_column (str): The name of the genres column in the movies DataFrame.

    Returns:
        pd.DataFrame: A DataFrame with a 'genre' column (unique names, sorted)
        and a 'genre_id' column (IDs starting from 1).
    """
    # One row per (movie, genre); empty lists and missing values turn into
    # NaN here and are dropped rather than breaking the iteration.
    genre_values = df[genres_column].explode().dropna()

    # Extract unique genres and assign IDs
    df_genres = pd.DataFrame({'genre': sorted(genre_values.unique())})
    df_genres['genre_id'] = df_genres.index + 1  # Assign unique IDs starting from 1

    return df_genres


def create_movies_genres_table(df, df_genres, genres_column='genres', movie_id_column='imdb_id'):
    """
    Create a Movies_Genres DataFrame linking movies to genres by their IDs.

    Parameters:
        df (pd.DataFrame): The movies DataFrame with a genres column.
        df_genres (pd.DataFrame): The genres DataFrame with 'genre' and 'genre_id' columns.
        genres_column (str): The name of the genres column in the movies DataFrame.
        movie_id_column (str): The name of the unique movie identifier column.

    Returns:
        pd.DataFrame: A DataFrame with two columns, the movie identifier and
        'genre_id', one row per (movie, genre) pair. Genres that have no entry
        in df_genres are dropped by the inner join.
    """
    # Explode genres into separate rows (only the two columns that are needed)
    movie_genre_pairs = df[[movie_id_column, genres_column]].explode(genres_column)

    # Map genres to their IDs using the Genres table
    linked = movie_genre_pairs.merge(
        df_genres[['genre', 'genre_id']], left_on=genres_column, right_on='genre'
    )

    # Keep the ID pair: returning the genre name here would discard the
    # genre_id that the merge was performed to obtain.
    return linked[[movie_id_column, 'genre_id']]


In [125]:
# Derive the prepared frame from the raw `df` rather than from `df_movies`
# itself: prepare_genres_column renames the 'Generes' column, so feeding its own
# output back in on a second run of this cell would raise ValueError.
df_movies = extract_movie_id(prepare_genres_column(df))
df_genres = create_genres_table(df_movies)
df_movies_genres = create_movies_genres_table(df_movies, df_genres, genres_column='genres', movie_id_column='imdb_id')
