# NLP Clustering Prototype

### Step 1: Test Installation:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
from spacy.cli import download
from sentence_transformers import SentenceTransformer

# Smoke test: confirm the environment and the core data libraries import.
print("Hello World from the Conda environment!")
print("Pandas version:", pd.__version__)
print("NumPy version:", np.__version__)

# Make sure the small English spaCy pipeline is installed, fetching it if not.
if not spacy.util.is_package("en_core_web_sm"):
    print("downloading en_core_web_sm")
    download("en_core_web_sm")
else:
    print("English SpaCY model already available")

# Both paths end by loading the pipeline, so it is done once here.
nlp = spacy.load("en_core_web_sm")
print("spaCy model loaded successfully!")

# Encode a single sentence to check that the sentence-transformer model works.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
test_embedding = embedder.encode("Hello, world!")
print("SentenceTransformer output:", test_embedding[:5])  # first 5 values only


Hello World from the Conda environment!
Pandas version: 2.2.3
NumPy version: 2.2.2
English SpaCY model already available
spaCy model loaded successfully!
SentenceTransformer output: [-0.03817714  0.03291109 -0.00545938  0.01436994 -0.04029103]


### Step 2: Perform Initial Cleaning of Raw Data:

In [None]:
# run the cleaning script:
%run ../scripts/preprocess_cleaning.py

# metrics on the output file:
df = pd.read_csv('../data/processed/cleaned_data_example.csv')

# Calculate total cases processed
total_cases = len(df)

# Count cases with one or more missing free-text fields
cases_with_missing = df[['text 1 missing', 'text 2 missing', 'text 3 missing']] \
    .apply(lambda row: 'Y' in row.values, axis=1).sum()

print(f"Total Cases Processed: {total_cases}")
print(f"Cases with one or more missing text fields: {cases_with_missing}")

### Step 3: Perform Language Detection on Cleaned Data:

In [None]:
# run the langdetect script:
%run ../scripts/preprocess_langdetect.py

# metrics on the output file:
df = pd.read_csv('../data/processed/langdetect_data_example.csv')

# Calculate detected languages
language_counts = df['language'].value_counts()
print("Detected Languages and Case Counts:")
for lang, count in language_counts.items():
    print(f"{lang}: {count}")

### Step 4: Download Helsinki-NLP models for the detected languages

In [None]:
# Run the Helsinki-NLP preprocessing script.
# NOTE(review): going by this step's heading, the script presumably downloads
# translation models for the detected languages — confirm in the script.
%run ../scripts/preprocess_helsinki_nlp.py
# Marker printed once the %run line above has returned.
print("ran the helsinki script")

### Step 5: Now that all text is in English, tokenize with spaCy

In [None]:
# Run the spaCy tokenizing script.
# NOTE(review): the step heading implies the text is already in English by
# this point — confirm which input file the script reads.
%run ../scripts/nlp_tokenizing.py