In [12]:
import os
import re

# Define a function to clean text
def clean_text(text):
    # Remove numbers and 'n' characters when standalone
    cleaned_text = re.sub(r'\d+', '', text)  # Remove numbers
    cleaned_text = re.sub(r'\bn\b', ' ', cleaned_text)  # Remove standalone 'n' characters
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)  # Remove punctuation
    return cleaned_text

# Specify the path to the "postprocessed" folder
folder_path = "postprocessed"

# Loop through all text files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r") as file:
            text = file.read()
            cleaned_text = clean_text(text)
        
        # Write the cleaned text back to the file
        with open(file_path, "w") as file:
            file.write(cleaned_text)

print("Text files cleaned successfully.")


Text files cleaned successfully.


In [13]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer

# Specify the path to the "postprocessed" folder
folder_path = "postprocessed"

# List to store the cleaned text from all files
documents = []

# Loop through all text files in the folder and collect the cleaned text
for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r") as file:
            text = file.read()
            documents.append(text)

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the documents to calculate TF-IDF
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Get the feature names (words)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Create a dictionary to store TF-IDF scores for each word
word_tfidf_scores = {}
for i, feature in enumerate(feature_names):
    word_tfidf_scores[feature] = tfidf_matrix[0, i]

# Print TF-IDF scores for each word in the first document
for word, score in sorted(word_tfidf_scores.items(), key=lambda x: x[1], reverse=True):
    print(f"{word}: {score:.4f}")


figure: 0.3537
plan: 0.3537
table: 0.2535
geology: 0.2298
exploration: 0.2199
project: 0.1977
status: 0.1950
bulong: 0.1753
knp: 0.1753
physiography: 0.1753
scoping: 0.1753
study: 0.1753
client: 0.1596
multi: 0.1596
rom: 0.1596
data: 0.1587
containing: 0.1485
cd: 0.1399
bibliography: 0.1329
tmi: 0.1329
previous: 0.1296
interpreted: 0.1269
conclusion: 0.1217
tenement: 0.1155
surface: 0.1131
on: 0.1095
interpretation: 0.1061
digital: 0.1030
review: 0.1002
summary: 0.0950
appendix: 0.0927
aeromagnetic: 0.0905
activities: 0.0864
index: 0.0827
recommendations: 0.0778
references: 0.0626
regional: 0.0542
and: 0.0495
location: 0.0440
abd: 0.0000
acacia: 0.0000
access: 0.0000
acquisitions: 0.0000
across: 0.0000
activity: 0.0000
activties: 0.0000
adjacent: 0.0000
adjoins: 0.0000
aeromagnetics: 0.0000
after: 0.0000
airborne: 0.0000
al: 0.0000
all: 0.0000
amoco: 0.0000
an: 0.0000
analyses: 0.0000
analysis: 0.0000
annual: 0.0000
anomalies: 0.0000
anomalism: 0.0000
anomalous: 0.0000
anomaly: 0.0000
