In [1]:
import spacy
import pandas as pd

# Shared spaCy pipeline for English text; loaded once at import time and
# reused by every call that needs named-entity recognition.
SPACY_MODEL_NAME = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL_NAME)

# Function to segregate institution, place, and program using NER
def segregate_with_ner(name):
    """Split a free-text name into institution, place, and program parts.

    The text is run through the module-level spaCy pipeline ``nlp``.
    Entities labelled ``ORG`` are collected as the institution name,
    entities labelled ``GPE`` as the place, and every token that belongs to
    neither of those two entity kinds is kept as the program.

    Args:
        name: The raw name string to analyse. Values that are not strings
            (for example ``None`` or the ``NaN`` pandas produces for empty
            CSV cells) and empty strings are treated as having no content.

    Returns:
        A ``(institution, place, program)`` tuple of strings. Each part is
        joined with single spaces and is empty when nothing matched.
    """
    # Missing or empty cells cannot be parsed; report "nothing found"
    # instead of letting the pipeline raise on a non-string value.
    if not isinstance(name, str) or not name:
        return '', '', ''

    # Process the name string with SpaCy's NER
    doc = nlp(name)

    institution_name = [ent.text for ent in doc.ents if ent.label_ == 'ORG']
    place = [ent.text for ent in doc.ents if ent.label_ == 'GPE']

    # Anything not captured by ORG or GPE is treated as part of the program.
    # Tokens inside other entity types (PERSON, DATE, ...) are kept here too;
    # filtering on "has any entity type" would silently drop them from all
    # three outputs.
    program_tokens = [
        token.text for token in doc
        if token.ent_type_ not in ('ORG', 'GPE')
    ]

    return ' '.join(institution_name), ' '.join(place), ' '.join(program_tokens)

# Load your dataset
file_path = 'unmapped3.csv'  # Replace with the actual path to your CSV file
data = pd.read_csv(file_path)

# Apply the NER-based segregation function.
# The result tuples are collected into one DataFrame with explicit column
# names and the source index. This avoids building a Series per row and
# still yields three (empty) columns when the CSV has no data rows.
segregated_columns = ['Institution Name', 'Place', 'Program']
segregated = pd.DataFrame(
    [segregate_with_ner(name) for name in data['NAME']],
    columns=segregated_columns,
    index=data.index,
)
data[segregated_columns] = segregated

# Save the segregated data into a new CSV file
output_file_path = 'segregated_institution_data_with_ner.csv'
data[['NAME'] + segregated_columns].to_csv(output_file_path, index=False)

print(f"Data has been successfully segregated and saved to {output_file_path}")


Data has been successfully segregated and saved to segregated_institution_data_with_ner.csv
