In [2]:
import spacy
import pandas as pd
from geopy.geocoders import Nominatim

# Load spaCy's pre-trained model for NER
nlp = spacy.load("en_core_web_sm")

# Initialize geopy Nominatim for place detection
geolocator = Nominatim(user_agent="geoapiExercises")

# Function to extract institution, program, and place with custom logic
def extract_info(text):
    doc = nlp(text)
    institution = None
    place = None
    program = None

    # Extract named entities using spaCy
    entities = {ent.label_: ent.text for ent in doc.ents}

    # Detect institution (ORG) and place (GPE)
    institution = entities.get("ORG", None)  # ORG -> Institution
    place = entities.get("GPE", None)  # GPE -> Place

    # Use geopy to enhance place detection if spaCy doesn't detect it
    if not place:
        location = geolocator.geocode(text)
        if location:
            place = location.address.split(",")[0]

    # Extract program name by excluding institution and place from text
    program = text
    if institution:
        program = program.replace(institution, "").strip(" -")
    if place:
        program = program.replace(place, "").strip(" -")

    # After removing institution and place, the remainder is considered the program
    return {
        "Institution": institution,
        "Program": program,
        "Place": place
    }

# Load CSV data (replace with your actual CSV file path)
input_csv = 'unmapped3.csv'
df = pd.read_csv(input_csv)

# List to store processed results
extracted_data = []

# Process each entry in the CSV
for index, row in df.iterrows():
    text = row['NAME']
    extracted_info = extract_info(text)
    extracted_data.append(extracted_info)

# Convert extracted data to a DataFrame
extracted_df = pd.DataFrame(extracted_data)

# Combine the original DataFrame with the new extracted columns
output_df = pd.concat([df, extracted_df], axis=1)

# Save the processed data to a new CSV file
output_csv = 'out.csv'
output_df.to_csv(output_csv, index=False)

print(f"Data processed and saved to {output_csv}.")


GeocoderInsufficientPrivileges: Non-successful status code 403