In [1]:
import os
data_dir = '/home/schizoid-man/Documents/AWS_RAW_DATA'
txt_files = [f for f in os.listdir(data_dir) if f.endswith('.txt')]
texts = []
for file in txt_files:
    with open(os.path.join(data_dir, file), 'r', encoding='utf-8') as f:
        texts.append(f.read())

In [2]:
import re

def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = text.lower()
    return text

preprocessed_texts = [preprocess_text(text) for text in texts]

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
combined_text = ' '.join(preprocessed_texts)

# Apply TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=20)
X = vectorizer.fit_transform([combined_text])

# Get the keywords
keywords = vectorizer.get_feature_names_out()


In [4]:
from tqdm import tqdm
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

keywords_list = []
for text in preprocessed_texts:
    vectorizer = TfidfVectorizer(stop_words='english', max_features=20)
    tfidf_matrix = vectorizer.fit_transform([text])
    keywords = vectorizer.get_feature_names_out()
    keywords_list.append(keywords)

# Extract named entities using NLTK with a progress bar
named_entities_list = []
for text in tqdm(texts, desc="Extracting named entities"):
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    named_entities = ne_chunk(pos_tags)
    named_entities_list.append(named_entities)

Extracting named entities: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 347/347 [02:58<00:00,  1.95it/s]


In [5]:
for i, file in enumerate(txt_files):
    print(f"File: {file}")
    print(f"Keywords: {keywords_list[i]}")
    #print(f"Named Entities: {named_entities_list[i]}")
    print("\n")

File: Oportun Increases the Accuracy of Sensitive-Data Discovery by 95 Using Amazon Macie _ Oportun Case Study _ AWS.txt
Keywords: ['accuracy' 'amazon' 'aws' 'buckets' 'carlos' 'data' 'discovery'
 'identify' 'macie' 'oportun' 'pii' 'risk' 'says' 'scan' 'security'
 'service' 'services' 'solution' 'team' 'using']


File: Run Jobs at Scale While Optimizing for Cost Using Amazon EC2 Spot Instances with ActionIQ _ ActionIQ Case Study _ AWS.txt
Keywords: ['actioniq' 'amazon' 'aws' 'business' 'capacity' 'compute' 'costs'
 'customer' 'customers' 'data' 'ec' 'instances' 'jobs' 'ondemand'
 'reserved' 'run' 'scale' 'spot' 'using' 'workloads']


File: Scaling Text to Image to 100 Million Users Quickly Using Amazon SageMaker _ Canva Case Study _ AWS.txt
Keywords: ['amazon' 'aws' 'canva' 'company' 'feature' 'image' 'images' 'ml' 'models'
 'new' 'pink' 'quickly' 'rekognition' 'sagemaker' 'says' 'scaling'
 'texttoimage' 'user' 'users' 'using']


File: Showpad Accelerates Data Maturity to Unlock Innova

In [None]:
import matplotlib.pyplot as plt
from nltk.tree import Tree

# Convert named entities to tree structure
tree = Tree.fromstring(str(named_entities))

# Visualize the tree
tree.draw()
