In [1]:
# Enable IPython's autoreload extension so that edits to imported modules
# are reflected automatically, without needing to restart the kernel.
%load_ext autoreload
%autoreload 2

# Data and Preprocessing

## Build Data Frame with Corpus

In [2]:
from utils import build_corpus_dataframe

In [3]:
# Relative path of the corpus directory handed to the project's loader.
corpus_path = "data/Corpus-representacion"
# Raw corpus frame; later cells read its "category", "document_id" and
# "content" columns.
corpus_raw = build_corpus_dataframe(corpus_path)

In [4]:
# Optional: persist the raw corpus so it does not have to be rebuilt.
# corpus_raw.to_csv("data/data_raw.csv", index=False)

In [5]:
# Show one example document for every category present in the raw corpus.
first_per_category = corpus_raw.groupby("category").first()
for category, doc in first_per_category.iterrows():
    print(
        f"Category: {category}",
        f"Document ID: {doc['document_id']}",
        f"Content: {doc['content']}",
        "-" * 40,
        sep="\n",
    )

Category: comp.sys.ibm.pc.hardware
Document ID: 58980
Content: Newsgroups: comp.sys.ibm.pc.hardware
Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!fs7.ece.cmu.edu!europa.eng.gtefsd.com!howland.reston.ans.net!agate!boulder!ucsu!rintintin.Colorado.EDU!studner
From: studner@rintintin.Colorado.EDU (STUDNER  ROGER ALAN)
Subject: Modem for Sale
Message-ID: <1993Apr6.174054.5832@ucsu.Colorado.EDU>
Sender: news@ucsu.Colorado.EDU (USENET News System)
Nntp-Posting-Host: rintintin.colorado.edu
Organization: University of Colorado, Boulder
Date: Tue, 6 Apr 1993 17:40:54 GMT
Lines: 4

I am selling a USR HST 14.4k baud modem with v42bis compression upgrades.
THere is no manual, as it was lost going from one side of the U.S. to the other at some point.  THe modem is setup for max throughput, and it has built in help, but a quick reference guide on the bottom of it, so its use it not difficult by any means.
Any offers?


----------------------------------------
Category: comp.sys.mac.hardwar

## Initial Corpus Cleaning

In [6]:
from text_preprocessing import clean_header, preprocessing_pipeline, remove_writes_lines


# Derive the cleaned frame from the raw one without touching `corpus_raw`:
# each document first goes through `clean_header`, then `remove_writes_lines`.
corpus_clean_df = corpus_raw.assign(
    cleaned_content=lambda frame: (
        frame["content"].apply(clean_header).apply(remove_writes_lines)
    )
)

In [7]:
# Preview up to ten randomly chosen cleaned documents.
# Drawing the rows directly avoids shuffling the whole corpus just to keep ten,
# and the fixed seed makes the preview identical on every "Restart & Run All".
preview_docs = corpus_clean_df.sample(
    n=min(10, len(corpus_clean_df)),  # guard against corpora with < 10 rows
    random_state=42,
)

for _idx, row in preview_docs.iterrows():
    print(f"Category: {row['category']}\n")
    print(f"Document ID: {row['document_id']}\n")
    print(f"Cleaned Content: {row['cleaned_content']}\n")
    print("-" * 40)

Category: sci.electronics

Document ID: 53603

Cleaned Content: -s87271077-s.walker-man-50- (swalker@uts.EDU.AU) wrote:
: 
: 
: I really don't know where to post this question so I figured that
: this board would be most appropriate.
: I was wondering about those massive concrete cylinders that
: are ever present at nuclear poer sites. They look like cylinders
: that have been pinched in the middle. Does anybody know what the
: actual purpose of those things are?. I hear that they're called
: 'Cooling Towers' but what the heck do they cool?
: I hope someone can help 
:-----------------------
During the nuclear fission reaction the uranium fuel can get hot enough
to melt. When this happens the liquid uranium is pumped to the cooling
tower where it is sprayed into the air. Contact with the cool outside air
will condense the mist and it will fall back to the cooling tower floor.
There it is collected by a cleaning crew using shop vacs and is then
reformed into pellets for reactor use the 

## Preprocessing

In [8]:
# English sample containing contractions, punctuation and currency amounts.
en_text = """
Hi, I wanted to ask you something, if I may. I'm thinking of buying a television for $32.50, although it's €33.00 on Amazon.
"""
# Run the pipeline with its default settings and show the resulting tokens.
en_tokens = preprocessing_pipeline(en_text)
print(en_tokens)

['hi', 'i', 'want', 'to', 'ask', 'you', 'something', 'if', 'i', 'may', 'i', 'be', 'think', 'of', 'buy', 'a', 'television', 'for', '32.50', 'although', 'it', 'be', '33.00', 'on', 'amazon']


In [9]:
# Spanish counterpart of the English sample, processed with the
# "es_core_news_sm" model instead of the pipeline's default.
es_text = """
Hola, queria consultarte algo, puedo? Estoy pensando en comprar una television por 32.50$ aunque en amazon está por 33,00€
"""
es_tokens = preprocessing_pipeline(es_text, model="es_core_news_sm")
print(es_tokens)

['hola', 'queria', 'consultarte', 'algo', 'poder', 'estar', 'pensar', 'en', 'comprar', 'uno', 'television', 'por', '32.50', 'aunque', 'en', 'amazon', 'estar', 'por', '33.00']


In [10]:
# Same English sample with lemmatization switched off, for comparison with
# the lemmatized tokens printed earlier.
en_tokens_unlemmatized = preprocessing_pipeline(en_text, lemmatize=False)
print(en_tokens_unlemmatized)

['hi', 'i', 'wanted', 'to', 'ask', 'you', 'something', 'if', 'i', 'may', 'i', "'m", 'thinking', 'of', 'buying', 'a', 'television', 'for', '32.50', 'although', 'it', "'s", '33.00', 'on', 'amazon']


In [11]:
# Tokenize the first cleaned corpus document (no lemmatization) as a smoke test
# of the pipeline on real data.
first_cleaned_doc = corpus_clean_df["cleaned_content"].iloc[0]
first_doc_tokens = preprocessing_pipeline(first_cleaned_doc, lemmatize=False)
print(first_doc_tokens)

['stuff', 'deleted', 'i', 'wrote', 'are', 'you', 'calling', 'names', 'or', 'giving', 'me', 'a', 'title', 'if', 'the', 'first', 'read', 'your', 'paragraph', 'above', 'if', 'not', 'i', 'accept', 'the', 'title', 'in', 'order', 'to', 'let', 'you', 'get', 'into', 'the', 'um', 'well', 'debate', 'again', 'hasan', 'replies', 'i', 'didnot', 'know', 'that', 'master', 'of', 'wisdom', 'can', 'be', 'name', 'clling', 'too', 'unless', 'you', 'consider', 'yourself', 'deserve', 'less', 'unless', 'you', 'are', 'referring', 'to', 'someone', 'else', 'you', 'have', 'in', 'fact', 'given', 'me', 'a', 'name', 'i', 'did', 'not', 'ask', 'for', 'hence', 'the', 'term', 'name', 'calling', 'so', 'what', 'do', 'you', 'expect', 'me', 'to', 'tell', 'you', 'to', 'tell', 'you', 'master', 'of', 'wsidom', 'i', 'replied', 'if', 'you', 'insist', 'on', 'giving', 'me', 'names', 'titles', 'i', 'did', 'not', 'ask', 'for', 'you', 'could', 'at', 'least', 'spell', 'them', 'correctly', '/sigh', 'hasan', 'gloats', 'that', 'was', 'on

# Vector Space Model

In [12]:
from vectorizing import vectorize_text

In [13]:
# Vectorize the same cleaned documents twice: once as bag-of-words counts and
# once as TF-IDF weights.
cleaned_docs = corpus_clean_df["cleaned_content"]
tf_vectors, tf_vocab = vectorize_text(cleaned_docs, method="bow")
tfidf_vectors, tfidf_vocab = vectorize_text(cleaned_docs, method="tfidf")

# Both representations must share one vocabulary so their columns line up.
vocabularies_match = tf_vocab == tfidf_vocab
assert vocabularies_match, "Vocabulary mismatch between BOW and TF-IDF methods"

## Save Vectors

The BOW and TF-IDF vectors can be saved to disk so they do not have to be recomputed in later sessions.

### SciPy (.npz) (Recommended for large sparse matrices)
- **Advantages**: Maintains sparse format, very memory efficient
- **Disadvantages**: Requires saving vocabulary separately

In [14]:
from vectorizing import save_vectors_scipy  # , load_vectors_scipy

In [15]:
# Persist each matrix together with its vocabulary under data/VSM/.
vsm_artifacts = (
    (tf_vectors, tf_vocab, "data/VSM/tf_vectors"),
    (tfidf_vectors, tfidf_vocab, "data/VSM/tfidf_vectors"),
)
for vectors, vocab, target in vsm_artifacts:
    save_vectors_scipy(vectors, vocab, target)

# To reload the saved artifacts instead of recomputing them:
# tf_vectors, tf_vocab = load_vectors_scipy("data/VSM/tf_vectors")
# tfidf_vectors, tfidf_vocab = load_vectors_scipy("data/VSM/tfidf_vectors")

Matrices dispersas guardadas en data/VSM/tf_vectors.npz
Matrices dispersas guardadas en data/VSM/tfidf_vectors.npz


# Embeddings

In [None]:
# FastText Pre-trained Models
# There are several ways to download FastText models:

# Option 1: Facebook's official pre-trained FastText models
# Download from: https://dl.fbaipublicfiles.com/fasttext/vectors-english/
# Available models:
# - wiki-news-300d-1M.vec.zip (1M vocabulary, 300 dimensions)
# - crawl-300d-2M.vec.zip (2M vocabulary, 300 dimensions)

# Option 2: Using gensim.downloader (easiest method)
import gensim.downloader as api


# Print every model in gensim's catalogue whose name mentions "fasttext"
print("Available FastText models:")
fasttext_model_names = [
    name for name in api.info()["models"] if "fasttext" in name.lower()
]
for name in fasttext_model_names:
    print(f"- {name}")

Available FastText models:
- fasttext-wiki-news-subwords-300
- fasttext-wiki-news-subwords-300


In [18]:
# Load the pre-trained FastText vectors through gensim's downloader.
# The first call downloads the model (a large file); the model is loaded
# exactly once here, because loading it a second time only repeats the
# expensive load without changing `model`.
print("\nLoading model...")
model = api.load("fasttext-wiki-news-subwords-300")
print("Model loaded successfully!")

# Nearest neighbours of a word in the embedding space.
print(model.most_similar("teacher"))
# Output = [('headteacher', 0.8075869083404541), ('schoolteacher', 0.7955552339553833), ('teachers', 0.733420729637146), ('teaches', 0.6839243173599243), ('meacher', 0.6825737357139587), ('teach', 0.6285147070884705), ('taught', 0.6244685649871826), ('teaching', 0.6199781894683838), ('schoolmaster', 0.6037642955780029), ('lessons', 0.5812176465988159)]

# Similarity score between two individual words.
print(model.similarity("teacher", "teaches"))
# Output = 0.683924396754


Loading model...



KeyboardInterrupt: 

## FastText Model Download Options

### Option 1: Facebook's Official Pre-trained Models
Download directly from Facebook's FastText repository:

**English Models:**
- **Small**: [wiki-news-300d-1M.vec.zip](https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip) (1M words, 300 dimensions, ~650MB)
- **Large**: [crawl-300d-2M.vec.zip](https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip) (2M words, 300 dimensions, ~1.2GB)

**Spanish Models:**
- [cc.es.300.vec.gz](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.vec.gz) (Spanish, 300 dimensions)

**Other Languages:**
- Visit: https://fasttext.cc/docs/en/crawl-vectors.html for 157 languages

### Option 2: Using Terminal Commands
```bash
# Download English FastText model
wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
unzip wiki-news-300d-1M.vec.zip

# Or for Spanish
wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.vec.gz
gunzip cc.es.300.vec.gz
```

### Option 3: Using Python to Download

In [None]:
# Python script to download FastText models automatically
import gzip
import urllib.request
from pathlib import Path


def download_fasttext_model(language="en", size="small"):
    """
    Download and unpack a pre-trained FastText vector file into ``models/``.

    The archive is fetched only when it is not already present. Extraction is
    performed whenever the unpacked output is missing, so a run that was
    interrupted after the download (or during unpacking) is repaired by simply
    calling the function again.

    Args:
        language: 'en' for English, 'es' for Spanish, or any other language
            code, which is mapped to ``cc.<language>.300.vec.gz``.
        size: 'small' (1M words) or 'large' (2M words) for English. Any value
            other than 'small' selects the large model. Ignored for other
            languages.

    Returns:
        pathlib.Path of the downloaded archive inside ``models/``. The
        extracted vectors are written next to it.
    """
    # Local imports keep the function self-contained within the notebook cell.
    import shutil
    import zipfile

    # Create models directory
    models_dir = Path("models")
    models_dir.mkdir(exist_ok=True)

    base_url = "https://dl.fbaipublicfiles.com/fasttext"
    if language == "en":
        if size == "small":
            filename = "wiki-news-300d-1M.vec.zip"
        else:
            filename = "crawl-300d-2M.vec.zip"
        url = f"{base_url}/vectors-english/{filename}"
    else:
        # Spanish and every other language share the common crawl naming scheme
        filename = f"cc.{language}.300.vec.gz"
        url = f"{base_url}/vectors-crawl/{filename}"

    filepath = models_dir / filename

    if not filepath.exists():
        print(f"Downloading {filename}...")
        # Download to a temporary name and rename on success, so an interrupted
        # transfer never leaves a truncated file at the final path (which later
        # runs would mistake for a complete archive).
        partial_download = filepath.with_name(filename + ".part")
        try:
            urllib.request.urlretrieve(url, partial_download)
            partial_download.replace(filepath)
        finally:
            if partial_download.exists():
                partial_download.unlink()
        print(f"Downloaded to {filepath}")
    else:
        print(f"{filename} already exists")

    # Extract if needed (also when the archive came from an earlier run)
    if filename.endswith(".zip"):
        with zipfile.ZipFile(filepath, "r") as zip_ref:
            missing_members = [
                member
                for member in zip_ref.namelist()
                if not (models_dir / member).exists()
            ]
            if missing_members:
                zip_ref.extractall(models_dir, members=missing_members)
                print(f"Extracted {filename}")
    elif filename.endswith(".gz"):
        extracted_path = filepath.with_suffix("")
        if not extracted_path.exists():
            partial_extract = extracted_path.with_name(extracted_path.name + ".part")
            try:
                with gzip.open(filepath, "rb") as f_in, open(partial_extract, "wb") as f_out:
                    # Stream in chunks: the decompressed vectors are several GB
                    # and must not be read into memory in one piece.
                    shutil.copyfileobj(f_in, f_out)
                partial_extract.replace(extracted_path)
            finally:
                if partial_extract.exists():
                    partial_extract.unlink()
            print(f"Extracted {filename}")

    return filepath


# Example usage:
# download_fasttext_model('en', 'small')  # English small model
# download_fasttext_model('es')           # Spanish model

In [None]:
# Example: Download and load a FastText model

# Uncomment one of these to download:
# download_fasttext_model('en', 'small')  # Downloads wiki-news-300d-1M.vec
# download_fasttext_model('es')           # Downloads Spanish model

# Load the downloaded model (requires: from gensim.models import KeyedVectors)

# For English model (adjust path as needed):
# model = KeyedVectors.load_word2vec_format('models/wiki-news-300d-1M.vec', binary=False)

# For Spanish model:
# model = KeyedVectors.load_word2vec_format('models/cc.es.300.vec', binary=False)

# Alternative: Use gensim's built-in downloader (easier but limited options)
import gensim.downloader as api


# Show which entries of gensim's model catalogue mention "fasttext".
print("Available models with 'fasttext' in name:")
fasttext_candidates = [
    name for name in api.info()["models"] if "fasttext" in name.lower()
]
for name in fasttext_candidates:
    print(f"  - {name}")

# Fetch the model through gensim's downloader (downloads automatically).
print("\nLoading model...")
model = api.load("fasttext-wiki-news-subwords-300")
print("Model loaded successfully!")