# NLP Clustering Prototype

## Step 1: Test Installation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
from sentence_transformers import SentenceTransformer

# Smoke test: greet from the kernel and report the core library versions.
for _fields in (
    ("Hello World from the Conda environment!",),
    ("Pandas version:", pd.__version__),
    ("NumPy version:", np.__version__),
):
    print(*_fields)

# Load the en_core_web_sm spaCy pipeline to verify that it is available.
nlp = spacy.load("en_core_web_sm")
print("spaCy model loaded successfully!")

# Encode a single sentence to check that the sentence-transformer model loads
# and runs; only the first five components of the embedding are printed.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
test_embedding = embedder.encode("Hello, world!")
print("SentenceTransformer output:", test_embedding[:5])


Hello World from the Conda environment!
Pandas version: 2.2.3
NumPy version: 2.0.2
spaCy model loaded successfully!
SentenceTransformer output: [-0.03817711  0.03291108 -0.0054594   0.01436987 -0.04029102]


## Step 2: Perform Initial Cleaning of Raw Data

In [2]:
# run the cleaning script:
%run ../scripts/preprocess_cleaning.py

# metrics on the output file:
df = pd.read_csv('../data/processed/cleaned_data_example.csv')

# Calculate total cases processed
total_cases = len(df)

# Count cases with one or more missing free-text fields
cases_with_missing = df[['text 1 missing', 'text 2 missing', 'text 3 missing']] \
    .apply(lambda row: 'Y' in row.values, axis=1).sum()

print(f"Total Cases Processed: {total_cases}")
print(f"Cases with one or more missing text fields: {cases_with_missing}")

Raw data has been cleaned
Total Cases Processed: 30
Cases with one or more missing text fields: 16


## Step 3: Perform Language Detection on Cleaned Data

In [4]:
# run the langdetect script:
%run ../scripts/preprocess_langdetect.py

# metrics on the output file:
df = pd.read_csv('../data/processed/langdetect_data_example.csv')

# Calculate detected languages
language_counts = df['language'].value_counts()
print("Detected Languages and Case Counts:")
for lang, count in language_counts.items():
    print(f"{lang}: {count}")


Language detection complete.
Detected Languages and Case Counts:
en: 14
ko: 14
unknown: 2


In [5]:
# Run the Helsinki-NLP preprocessing script.
# NOTE(review): the output recorded for this cell reports that the model
# 'Helsinki-NLP/opus-mt-ko-en' failed to load for language 'ko' because
# `sentencepiece` is not installed, and no translation pipelines are listed as
# available. Install `sentencepiece` in the environment and re-run this cell
# before relying on any later step that needs translated text.
%run ../scripts/preprocess_helsinki_nlp.py

read the output from the previous step
Detected non-English languages: ['ko']


config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Failed to load translation model for 'ko' using 'Helsinki-NLP/opus-mt-ko-en': This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed in order to use this tokenizer.
No official model found for language 'ko'. Consider looking for an unofficial package.

Translation pipelines available for languages:


model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]