In [17]:
# Mount Google Drive at /content/drive so that later cells can read from and
# write to paths under it. Requires the Google Colab runtime (google.colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
import re
import os
import glob
import pandas as pd

In [19]:
# Detect abbreviated names (initials such as "J.")
def is_abbreviated(name):
    """Tell whether a name is abbreviated.

    A name counts as abbreviated exactly when its last character is a
    period; the empty string is therefore not abbreviated.
    """
    period = '.'
    ends_with_period = name.endswith(period)
    return ends_with_period

# Extract first names from the authors' info
def extract_first_names(file_path):
    """Extract unique first names from a text file, skipping abbreviated names.

    Every occurrence of ``AUTHOR FULL NAMES: `` starts a semicolon-separated
    list of authors that runs to the end of that line. For an entry that
    contains a comma, the first whitespace-delimited token between the first
    and second comma is taken as the first name; an entry without a comma is
    used whole. Names for which ``is_abbreviated`` returns True are dropped,
    as are entries that have nothing after the comma (e.g. ``"Smith,"``).

    Args:
        file_path: Path of the text file to read. It is decoded as UTF-8.

    Returns:
        A sorted list of the distinct first names found (empty if none).
    """
    # Explicit encoding: author names are frequently non-ASCII and the
    # default encoding of open() depends on the platform.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # '.' never matches a newline, so the greedy group captures the rest of
    # the line. Unlike a `(?=\n)` lookahead, this also matches a final line
    # that has no trailing newline.
    full_name_pattern = r'AUTHOR FULL NAMES: (.*)'
    unique_first_names = set()

    for full_name_line in re.findall(full_name_pattern, content):
        for raw_name in full_name_line.split(';'):
            name = raw_name.strip()
            if not name:
                continue

            parts = name.split(',')
            if len(parts) > 1:
                given_tokens = parts[1].split()
                if not given_tokens:
                    # "Surname," with no given name: nothing to extract.
                    continue
                first_name = given_tokens[0]
            else:
                first_name = name

            if not is_abbreviated(first_name):
                unique_first_names.add(first_name)

    # Sorted so the result does not depend on set iteration order.
    return sorted(unique_first_names)


def extract_names_from_folder(folder_path):
    """Collect the first names found in every ``*.txt`` file of a folder.

    Only files directly inside ``folder_path`` are scanned. Nothing is written
    to disk; saving the result is left to the caller.

    Args:
        folder_path: Directory to scan for ``*.txt`` files.

    Returns:
        A DataFrame with one row per (file, first name) pair and the columns
        ``file_name``, ``first_name``, ``year`` and ``journal``. ``year`` and
        ``journal`` are derived from the file name by ``extract_year`` and
        ``extract_journal``. Rows are ordered by file path. The four columns
        are present even when no file or no name is found.
    """
    # Escape the folder so glob metacharacters in its name ('[', '*', '?')
    # are matched literally instead of being read as a pattern.
    pattern = os.path.join(glob.escape(os.fspath(folder_path)), '*.txt')
    # glob returns matches in arbitrary order; sort for reproducible output.
    file_paths = sorted(glob.glob(pattern))
    rows = []

    for path in file_paths:
        file_name = os.path.basename(path)
        for name in extract_first_names(path):
            rows.append({'file_name': file_name, 'first_name': name})

    # Explicit columns keep the lookups below valid when `rows` is empty;
    # without them an empty frame has no 'file_name' column and raises KeyError.
    df = pd.DataFrame(rows, columns=['file_name', 'first_name'])
    df['year'] = df['file_name'].apply(extract_year)
    df['journal'] = df['file_name'].apply(extract_journal)
    return df

In [20]:
# Extract year and journal from file_name
def extract_year(name):
    """Return the first 19xx or 20xx four-digit run found in `name`.

    The result is a string; 'unknown' is returned when nothing matches.
    """
    found = re.search(r'(19|20)\d{2}', name)
    if found is None:
        return 'unknown'
    return found.group(0)

def extract_journal(name):
    """Return the text before the first comma in `name`, whitespace-stripped.

    When `name` contains no comma, the whole stripped name is returned.
    """
    before_comma, _, _ = name.partition(',')
    return before_comma.strip()

# Usage: extract the first names from every .txt file in the source folder on
# the mounted Drive and save the resulting table as a single CSV file.
folder_path = '/content/drive/My Drive/Thesis/TXT file'  # folder scanned for *.txt files
output_csv_path = '/content/drive/My Drive/Thesis/processed_names.csv'  # destination CSV

df = extract_names_from_folder(folder_path)
df.to_csv(output_csv_path, index=False)  # index=False: do not write the row index
print(f"Saved extracted names to: {output_csv_path}")


Saved extracted names to: /content/drive/My Drive/Thesis/processed_names.csv
