In [1]:
# Configuration to automatically reload modified modules
%load_ext autoreload
%autoreload 2

# This allows changes in imported modules to be reflected automatically
# without needing to restart the kernel

# Data and Preprocessing

## Build Data Frame with Corpus

In [2]:
from utils import build_corpus_dataframe

In [3]:
# Directory holding the raw corpus, relative to the notebook's working directory.
corpus_path = "data/Corpus-representacion"
# Build the raw corpus frame; later cells read its "category", "document_id"
# and "content" columns.
corpus_raw = build_corpus_dataframe(corpus_path)

In [4]:
# corpus_raw.to_csv("data/data_raw.csv", index=False)

In [5]:
# Preview one document per category: `groupby(...).first()` yields one row per
# category, indexed by the category name.
first_per_category = corpus_raw.groupby("category").first()
for idx, row in first_per_category.iterrows():
    print(
        f"Category: {idx}",
        f"Document ID: {row['document_id']}",
        f"Content: {row['content']}",
        "-" * 40,
        sep="\n",
    )

Category: comp.sys.ibm.pc.hardware
Document ID: 58980
Content: Newsgroups: comp.sys.ibm.pc.hardware
Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!fs7.ece.cmu.edu!europa.eng.gtefsd.com!howland.reston.ans.net!agate!boulder!ucsu!rintintin.Colorado.EDU!studner
From: studner@rintintin.Colorado.EDU (STUDNER  ROGER ALAN)
Subject: Modem for Sale
Message-ID: <1993Apr6.174054.5832@ucsu.Colorado.EDU>
Sender: news@ucsu.Colorado.EDU (USENET News System)
Nntp-Posting-Host: rintintin.colorado.edu
Organization: University of Colorado, Boulder
Date: Tue, 6 Apr 1993 17:40:54 GMT
Lines: 4

I am selling a USR HST 14.4k baud modem with v42bis compression upgrades.
THere is no manual, as it was lost going from one side of the U.S. to the other at some point.  THe modem is setup for max throughput, and it has built in help, but a quick reference guide on the bottom of it, so its use it not difficult by any means.
Any offers?


----------------------------------------
Category: comp.sys.mac.hardwar

## Initial Corpus Cleaning

In [6]:
from text_preprocessing import clean_header, preprocessing_pipeline, remove_writes_lines


# Work on a copy so `corpus_raw` keeps the untouched documents.
corpus_clean_df = corpus_raw.copy()

# Two passes over the raw text: first `clean_header`, then `remove_writes_lines`
# on the result of the first pass.
content_without_header = corpus_clean_df["content"].apply(clean_header)
corpus_clean_df["cleaned_content"] = content_without_header.apply(remove_writes_lines)

In [7]:
# corpus_clean_df["cleaned_content"].to_csv("data/corpus_cleaned.csv", index=False)

In [8]:
# Print 10 randomly chosen cleaned documents as a quick sanity check.
# The shuffle is seeded so the same documents are shown after every
# "Restart Kernel -> Run All"; change the seed to look at a different sample.
PREVIEW_SEED = 42
preview_rows = corpus_clean_df.sample(frac=1, random_state=PREVIEW_SEED).iloc[:10]
for _idx, row in preview_rows.iterrows():
    print(f"Category: {row['category']}\n")
    print(f"Document ID: {row['document_id']}\n")
    print(f"Cleaned Content: {row['cleaned_content']}\n")
    print("-" * 40)

Category: sci.electronics

Document ID: 53549

Cleaned Content: >That's scary -- if there's a way to set an arbitrary mileage figure
>into the odometer, you can't trust *any* odometer reading, even
>if you can prove that the odometer itself is the same one that
>came on the car originally.

  I was wondering if anyone can shed any light on just how it is that these
electronic odometers remember the total elapsed mileage?  What kind of
memory is stable/reliable enough, non-volatile enough and independent enough
(of outside battery power) to last say, 10 years or more, in the life of a
vehicle?  I'm amazed that anything like this could be expected to work for
this length of time (especially in light of all the gizmos I work with that
are doing good to work for 2 months without breaking down somehow).

Side question:  how about the legal ramifications of selling a used car with
a replaced odometer that starts over at 0 miles, after say 100/200/300K
actual miles.  Looks like fraud would be

## Preprocessing

In [9]:
# Derive both preprocessed variants from the same cleaned text, on a fresh copy
# so `corpus_clean_df` is left as it was.
corpus_clean_and_preprocessed_df = corpus_clean_df.copy()
cleaned_texts = corpus_clean_and_preprocessed_df["cleaned_content"]

# Variant for the vector space model: pipeline with its default settings.
vsm_ready_texts = preprocessing_pipeline(cleaned_texts)
corpus_clean_and_preprocessed_df["preprocessed_content_for_vsm"] = vsm_ready_texts

# Variant for embeddings: same pipeline with lemmatization switched off.
embedding_ready_texts = preprocessing_pipeline(cleaned_texts, lemmatize=False)
corpus_clean_and_preprocessed_df["preprocessed_content_for_embedding"] = embedding_ready_texts

Processing documents...: 100%|██████████| 805/805 [00:56<00:00, 14.15it/s] 
Processing documents...: 100%|██████████| 805/805 [00:42<00:00, 18.96it/s]


In [10]:
# Show the frame to compare the raw, cleaned and both preprocessed text columns.
corpus_clean_and_preprocessed_df

Unnamed: 0,category,document_id,content,cleaned_content,preprocessed_content_for_vsm,preprocessed_content_for_embedding
0,talk.politics.mideast,75406,Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv....,[ stuff deleted ]\n |> I wrote:\n |> Are y...,stuff delete i write be you call name or give ...,stuff deleted i wrote are you calling names or...
1,talk.politics.mideast,75401,Newsgroups: talk.politics.mideast\nPath: canta...,|> In article <C50wJJ.J4r@newsflash.concordia....,in article c50wjj.j4r@newsflash.concordia.ca i...,in article c50wjj.j4r@newsflash.concordia.ca i...
2,talk.politics.mideast,75408,Xref: cantaloupe.srv.cs.cmu.edu soc.culture.tu...,DEPOSITION of VITALY NIKOLAYEVICH DANIELIAN [1...,deposition of vitaly nikolayevich danielian 1 ...,deposition of vitaly nikolayevich danielian 1 ...
3,talk.politics.mideast,75409,Xref: cantaloupe.srv.cs.cmu.edu alt.conspiracy...,>> It is getting ridiculous. You are breaking ...,it be get ridiculous you be break the article ...,it is getting ridiculous you are breaking the ...
4,talk.politics.mideast,75400,Xref: cantaloupe.srv.cs.cmu.edu talk.politics....,>>> Historically even the most uncivilized o...,historically even the most uncivilized of peop...,historically even the most uncivilized of peop...
...,...,...,...,...,...,...
800,talk.politics.guns,54305,Xref: cantaloupe.srv.cs.cmu.edu talk.politics....,> Correct. You'd have to be very unfamiliar ...,correct you would have to be very unfamiliar w...,correct you 'd have to be very unfamiliar with...
801,talk.politics.guns,54302,Xref: cantaloupe.srv.cs.cmu.edu talk.politics....,">What, exactly, do you think the Jewish zelots...",what exactly do you think the jewish zelot at ...,what exactly do you think the jewish zelots at...
802,talk.politics.guns,54333,Newsgroups: talk.politics.guns\nPath: cantalou...,I will add my voice to the (hopefully) growing...,i will add my voice to the hopefully grow mult...,i will add my voice to the hopefully growing m...
803,talk.politics.guns,54199,Newsgroups: talk.politics.guns\nPath: cantalou...,"Hi. I've just finished reading S414, and have ...",hi i have just finish read s414 and have sever...,hi i 've just finished reading s414 and have s...


# Vector Space Model

In [11]:
from vectorizing import vectorize_text

In [13]:
# Both representations are built from the same VSM-preprocessed text.
vsm_texts = corpus_clean_and_preprocessed_df["preprocessed_content_for_vsm"]

tf_vectors, tf_vocab = vectorize_text(vsm_texts, method="bow")
tfidf_vectors, tfidf_vocab = vectorize_text(vsm_texts, method="tfidf")

# Sanity check: the two weighting schemes must share one vocabulary.
assert tf_vocab == tfidf_vocab, "Vocabulary mismatch between BOW and TF-IDF methods"

## Save Vectors

The BOW and TF-IDF vectors are saved to disk as follows.

### SciPy (.npz) (Recommended for large sparse matrices)
- **Advantages**: Maintains sparse format, very memory efficient
- **Disadvantages**: Requires saving vocabulary separately

In [15]:
from vectorizing import save_vectors_scipy  # , load_vectors_scipy

In [16]:
# Persist each matrix together with its vocabulary under data/VSM/.
for vsm_matrix, vsm_vocab, vsm_target in (
    (tf_vectors, tf_vocab, "data/VSM/tf_vectors"),
    (tfidf_vectors, tfidf_vocab, "data/VSM/tfidf_vectors"),
):
    save_vectors_scipy(vsm_matrix, vsm_vocab, vsm_target)

# To reload the saved vectors instead (requires importing load_vectors_scipy):
# tf_vectors, tf_vocab = load_vectors_scipy("data/VSM/tf_vectors")
# tfidf_vectors, tfidf_vocab = load_vectors_scipy("data/VSM/tfidf_vectors")

Sparse matrices saved to data/VSM/tf_vectors.npz
Sparse matrices saved to data/VSM/tfidf_vectors.npz


# Embeddings

In [18]:
# FastText Pre-trained Models
# There are several ways to download FastText models:

# Option 1: Facebook's official pre-trained FastText models
# Download from: https://dl.fbaipublicfiles.com/fasttext/vectors-english/
# Available models:
# - wiki-news-300d-1M.vec.zip (1M vocabulary, 300 dimensions)
# - crawl-300d-2M.vec.zip (2M vocabulary, 300 dimensions)

# Option 2: Using gensim.downloader (easiest method)
import gensim.downloader as api


# Print every model in the gensim-data catalogue whose name mentions FastText.
print("Available FastText models:")
gensim_catalogue = api.info()["models"]
for model_name in gensim_catalogue:
    if "fasttext" not in model_name.lower():
        continue
    print(f"- {model_name}")

Available FastText models:
- fasttext-wiki-news-subwords-300


In [19]:
# Fetch the model through the gensim downloader and load it.
fasttext_model_name = "fasttext-wiki-news-subwords-300"

print("\nLoading model...")
model = api.load(fasttext_model_name)
print("Model loaded successfully!")


Loading model...
Model loaded successfully!


In [20]:
print(model.most_similar("teacher"))
# NOTE(review): the reference output below does NOT match what this cell prints with the
# model loaded above (saved run: top hit 'educator', not 'headteacher'). It was presumably
# copied from a run with a different FastText model — confirm or remove.
# Output = [('headteacher', 0.8075869083404541), ('schoolteacher', 0.7955552339553833), ('teachers', 0.733420729637146), ('teaches', 0.6839243173599243), ('meacher', 0.6825737357139587), ('teach', 0.6285147070884705), ('taught', 0.6244685649871826), ('teaching', 0.6199781894683838), ('schoolmaster', 0.6037642955780029), ('lessons', 0.5812176465988159)]

[('educator', 0.7812017798423767), ('schoolteacher', 0.7797068953514099), ('teachers', 0.763569176197052), ('student', 0.7560623288154602), ('teacher-', 0.7527834177017212), ('school-teacher', 0.7409082651138306), ('paraeducator', 0.7359001040458679), ('teacher-librarian', 0.7357273697853088), ('teacher-researcher', 0.7300713658332825), ('teacher-parent', 0.7287502288818359)]


In [22]:
print(model.similarity("teacher-", "teached"))
# NOTE(review): the reference value below is not what this cell prints (0.5791656 in the
# saved run). 0.68392... agrees to seven digits with the 'teaches' score in the reference
# most_similar list, so it presumably came from similarity("teacher", "teaches") — confirm
# whether the tokens "teacher-" / "teached" are intentional.
# Output = 0.683924396754

0.5791656


## FastText Model Download Options

### Option 1: Facebook's Official Pre-trained Models
Download directly from Facebook's FastText repository:

**English Models:**
- **Small**: [wiki-news-300d-1M.vec.zip](https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip) (1M words, 300 dimensions, ~650MB)
- **Large**: [crawl-300d-2M.vec.zip](https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip) (2M words, 300 dimensions, ~1.2GB)

**Spanish Models:**
- [cc.es.300.vec.gz](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.vec.gz) (Spanish, 300 dimensions)

### Option 2: Using Terminal Commands
```bash
# Download English FastText model
wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
unzip wiki-news-300d-1M.vec.zip

# Or for Spanish
wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.vec.gz
gunzip cc.es.300.vec.gz
```

### Option 3: Using Python to Download

In [None]:
# Python script to download FastText models automatically
import gzip
import urllib.request
from pathlib import Path


def download_fasttext_model(language="en", size="small"):
    """
    Download and unpack a pre-trained FastText vector archive into ``models/``.

    The archive is fetched only if it is not already present in the ``models``
    directory (created relative to the current working directory). The transfer
    is written to a temporary ``<archive>.part`` file and renamed once it has
    finished, so an interrupted download is never mistaken for a complete
    archive on the next call. Extraction happens right after a fresh download.

    Args:
        language: 'en' for English; any other code (e.g. 'es') is interpolated
            into the Common Crawl file name ``cc.<language>.300.vec.gz``.
        size: Only used for English. 'small' selects wiki-news-300d-1M
            (1M words); any other value selects crawl-300d-2M (2M words).

    Returns:
        pathlib.Path: Path of the downloaded archive (``.zip`` / ``.gz``),
        not of the extracted vector file.

    Raises:
        Any exception raised by ``urllib.request.urlretrieve`` or by the
        extraction step is propagated unchanged.
    """
    # Local imports, matching the original function-scope use of zipfile.
    import shutil
    import zipfile

    # Create models directory
    models_dir = Path("models")
    models_dir.mkdir(exist_ok=True)

    if language == "en":
        if size == "small":
            filename = "wiki-news-300d-1M.vec.zip"
        else:
            filename = "crawl-300d-2M.vec.zip"
        url = f"https://dl.fbaipublicfiles.com/fasttext/vectors-english/{filename}"
    else:
        # Spanish and every other language share the Common Crawl naming scheme.
        filename = f"cc.{language}.300.vec.gz"
        url = f"https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/{filename}"

    filepath = models_dir / filename

    if filepath.exists():
        print(f"{filename} already exists")
        return filepath

    print(f"Downloading {filename}...")
    # Download next to the final location and rename only on success; the
    # `finally` also removes the partial file when the kernel is interrupted.
    partial_path = filepath.with_name(filename + ".part")
    try:
        urllib.request.urlretrieve(url, partial_path)
        partial_path.replace(filepath)
    finally:
        if partial_path.exists():
            partial_path.unlink()
    print(f"Downloaded to {filepath}")

    # Extract if needed
    if filename.endswith(".zip"):
        with zipfile.ZipFile(filepath, "r") as zip_ref:
            zip_ref.extractall(models_dir)
        print(f"Extracted {filename}")
    elif filename.endswith(".gz"):
        # Stream in chunks: the decompressed vectors are far too large to hold
        # in memory with a single read().
        with gzip.open(filepath, "rb") as f_in, open(filepath.with_suffix(""), "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
        print(f"Extracted {filename}")

    return filepath


# Example usage:
# download_fasttext_model('en', 'small')  # English small model
# download_fasttext_model('es')           # Spanish model