In [2]:
import os
import spacy
import pandas as pd
from collections import Counter, defaultdict

# Load the spaCy model and set a higher max_length limit if necessary
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 2000000  # Adjust as needed for large texts

# Define a function to split text into smaller chunks
def split_text(text, chunk_size=100000):
    """Splits text into chunks of a specified size."""
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

# Define the folder containing the text files
folder_path = 'English'

# Initialize a defaultdict to track frequencies and types of named entities
location_counter = defaultdict(lambda: {'Frequency': 0, 'Type': ''})

# Process each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):  # Only process text files
        print(f"Processing file: {filename}")  # Print progress
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            
            # Check if the text length exceeds the max_length and split if necessary
            if len(text) > nlp.max_length:
                text_chunks = split_text(text)
            else:
                text_chunks = [text]
            
            # Process each chunk individually
            for chunk in text_chunks:
                doc = nlp(chunk)  # Process the chunk with spaCy
                
                # Extract named entities and update the counter for GPE, LOC, and NORP only
                for ent in doc.ents:
                    if ent.label_ in ["GPE", "LOC", "NORP"]:  # Check for GPE, LOC, or NORP labels
                        location_counter[ent.text]['Frequency'] += 1
                        location_counter[ent.text]['Type'] = ent.label_

# Convert the defaultdict to a DataFrame
location_data = [{'Location': loc, 'Frequency': data['Frequency'], 'Type': data['Type']} 
                 for loc, data in location_counter.items()]
location_df = pd.DataFrame(location_data)

# Sort the DataFrame by frequency in descending order
location_df.sort_values(by='Frequency', ascending=False, inplace=True)

# Output the results to an Excel file
output_file = 'location_frequencies.xlsx'
location_df.to_excel(output_file, index=False)

print(f"Location frequencies with entity types have been written to {output_file}")


Processing file: A Floating City.txt
Processing file: A Sub And A Submarine The Story Of H.M. Submarine R19 In The Great War.txt
Processing file: Adrift In The Wilds; Or, The Adventures Of Two Shipwrecked Boys.txt
Processing file: Allan Quatermain.txt
Processing file: An Aerial Bivouac.txt
Processing file: Ayesha, The Return Of She.txt
Processing file: A_Sailor_s_Bride.txt
Processing file: Benita.txt
Processing file: Black Heart and White Heart and Other Stories.txt
Processing file: By Pike And Dyke A Tale Of The Rise Of The Dutch Republic.txt
Processing file: By The Gods Beloved.txt
Processing file: Cleopatra.txt
Processing file: Eighty_Days.txt
Processing file: Eric Brighteyes.txt
Processing file: Fair Margaret.txt
Processing file: Gulliver's Travels into Several Remote Nations of the World.txt
Processing file: His Tramps and Troubles Told by Himself.txt
Processing file: Jack Manly.txt
Processing file: Jimmy_Brown_Trying_to_Find_Europe.txt
Processing file: King Solomon'S Mines.txt
Pr

In [5]:
import os
import spacy
import pandas as pd
from collections import Counter, defaultdict

# Load the French spaCy model
nlp = spacy.load("fr_core_news_sm")
nlp.max_length = 2000000  # Adjust as needed for large texts

# Define a function to split text into smaller chunks
def split_text(text, chunk_size=100000):
    """Splits text into chunks of a specified size."""
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

# Define the folder containing the text files
folder_path = 'French_cleaned'

# Initialize a defaultdict to track frequencies and types of named entities
location_counter = defaultdict(lambda: {'Frequency': 0, 'Type': ''})

# Process each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):  # Only process text files
        print(f"Processing file: {filename}")  # Print progress
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            
            # Check if the text length exceeds the max_length and split if necessary
            if len(text) > nlp.max_length:
                text_chunks = split_text(text)
            else:
                text_chunks = [text]
            
            # Process each chunk individually
            for chunk in text_chunks:
                doc = nlp(chunk)  # Process the chunk with spaCy
                
                # Extract named entities and update the counter for GPE, LOC, and NORP only
                for ent in doc.ents:
                    if ent.label_ in ["GPE", "LOC", "NORP"]:  # Check for GPE, LOC, or NORP labels
                        location_counter[ent.text]['Frequency'] += 1
                        location_counter[ent.text]['Type'] = ent.label_

# Convert the defaultdict to a DataFrame
location_data = [{'Location': loc, 'Frequency': data['Frequency'], 'Type': data['Type']} 
                 for loc, data in location_counter.items()]
location_df = pd.DataFrame(location_data)

# Sort the DataFrame by frequency in descending order
location_df.sort_values(by='Frequency', ascending=False, inplace=True)

# Output the results to an Excel file
output_file = 'location_frequencies_French.xlsx'
location_df.to_excel(output_file, index=False)

print(f"Location frequencies with entity types have been written to {output_file}")

Processing file: Autour de la Lune.txt
Processing file: Cinq Semaines En Ballon.txt
Processing file: De la Terre à la Lune.txt
Processing file: Famille-sans-nom.txt
Processing file: L'Lle Mysterieuse.txt
Processing file: La maison à vapeur.txt
Processing file: La_Vénus_noire.txt
Processing file: Le Tour Du Monde En Quatre-Vingts Jours.txt
Processing file: LES TROIS MOUSQUETAIRES.txt
Processing file: Michel Strogoff de Moscou à Irkoutsk .txt
Processing file: Un Billet de loterie.txt
Processing file: Un-dirigeable-au-pôle-nord-Émile-Driant-2015-BnF-Partenariats-9782346010363-55bc7c3be6791153da0a827f.txt
Processing file: Une Affaire Mystérieuse.txt
Processing file: Une Ville Flottante.txt
Processing file: Verne-vacances.txt
Processing file: Vingt Ans Après.txt
Processing file: Vingt Mille Lieues Sous Les Mers.txt
Processing file: Voyage Au Centre De La Terre.txt
Location frequencies with entity types have been written to location_frequencies_French.xlsx


In [11]:
import pandas as pd
from googletrans import Translator


# Specify the input file path
input_file_path = 'all_freq_country.xlsx'

# Load the Excel file into a DataFrame
df = pd.read_excel(input_file_path)

# Initialize the GoogleTranslator for French to English
translator = Translator()

# Translate the content of the 'phrase' column and store it in the 'translation' column
df['translation'] = df['phrase'].apply(lambda x: translator.translate(x,src='auto', dest='en') if pd.notnull(x) else x)

# Print the updated DataFrame (or use display in Jupyter Notebook)
print(df.head())

# Specify the output file path
output_file_path = 'all_freq_country_translated.xlsx'

# Save the updated DataFrame to a new Excel file
df.to_excel(output_file_path, index=False)

print(f"Updated DataFrame with translations has been saved to {output_file_path}.")


            phrase  frequency  \
0  中國/中華/支那/中/中華民國        298   
1         日本/日本國/日        105   
2               古巴         57   
3      英國/英/英倫/英吉利         74   
4               希臘         39   

                                         translation  
0  Translated(src=zh-TW, dest=en, text=China/Chin...  
1  Translated(src=ja, dest=en, text=Japan/Japan/d...  
2  Translated(src=zh-CN, dest=en, text=Cuba, pron...  
3  Translated(src=zh-TW, dest=en, text=British/Br...  
4  Translated(src=zh-TW, dest=en, text=Greece, pr...  
Updated DataFrame with translations has been saved to all_freq_country_translated.xlsx.


In [5]:
import pandas as pd
from googletrans import Translator
from time import sleep

# Specify the input file path
input_file_path = 'all_freq_trans_cities.xlsx'

# Load the Excel file into a DataFrame
df = pd.read_excel(input_file_path)

# Initialize the GoogleTranslator for French to English
translator = Translator()

# Function to safely translate the first word before a "/" and handle errors
def safe_translate_first_word(text):
    try:
        if pd.notnull(text):
            # Extract the first word before any "/"
            first_word = text.split('/')[0].split()[0]
            # Translate the first word
            translation = translator.translate(first_word, src='auto', dest='en')
            return translation.text.split()[0]  # Ensure only one English word is returned
    except Exception as e:
        print(f"Error translating '{text}': {e}")
        return text

# Apply the function to the 'phrase' column and store it in the 'translation' column
df['translation'] = df['Cities'].apply(safe_translate_first_word)

# Print the updated DataFrame (or use display in Jupyter Notebook)
print(df.head())

# Specify the output file path
output_file_path = 'all_freq_trans_cities_translated.xlsx'

# Save the updated DataFrame to a new Excel file
df.to_excel(output_file_path, index=False)

print(f"Updated DataFrame with translations has been saved to {output_file_path}.")


      Cities  frequency  Translation translation
0         倫敦        665          NaN      London
1  巴黎/巴黎市/勃黎        620          NaN       Paris
2     紐約/紐約克        186          NaN         New
3         開羅        167          NaN       Cairo
4     羅馬/羅馬城        162          NaN        Rome
Updated DataFrame with translations has been saved to all_freq_trans_cities_translated.xlsx.
