### Task 0: 
Using the NLTK corpus how-to (https://www.nltk.org/howto/corpus.html#overview), implement a TF-IDF vectorizer for a corpus such as the Treebank corpus.

In [2]:
import nltk
from nltk.corpus import treebank
import math
from collections import defaultdict

nltk.download('treebank')

class TFIDFVectorizer:
    """Sparse TF-IDF vectorizer over pre-tokenized documents.

    A document is an iterable of hashable terms. Each transformed document
    is a dict mapping the terms that occur in it to the weight::

        (1 + ln(count)) * ln((N + 1) / (df + 1))

    where ``count`` is the term's count in that document, ``N`` is the
    number of documents seen by the last ``fit`` and ``df`` is the number
    of those documents that contain the term.
    """

    def __init__(self):
        self.doc_freq = defaultdict(int)  # term -> number of fitted docs containing it
        self.total_docs = 0               # number of docs seen by the last fit()
        self.vocab = set()                # every term seen by the last fit()
        self.idf = {}                     # term -> inverse document frequency

    def fit(self, documents):
        """Compute IDF values from ``documents``.

        Any state left by a previous ``fit`` is discarded first, so calling
        ``fit`` again replaces the model instead of adding to it.

        Args:
            documents: iterable of documents (each an iterable of terms).
                One-shot iterables are accepted; the documents are counted
                while they are consumed.
        """
        # Clear in place so a refit neither double-counts document
        # frequencies nor keeps terms that are absent from the new corpus.
        self.doc_freq.clear()
        self.vocab.clear()
        self.idf.clear()
        self.total_docs = 0

        for doc in documents:
            self.total_docs += 1
            # A term contributes at most once per document.
            for term in set(doc):
                self.doc_freq[term] += 1
        self.vocab.update(self.doc_freq)

        # A term present in every document gets ln(1) == 0.
        numerator = self.total_docs + 1
        for term, freq in self.doc_freq.items():
            self.idf[term] = math.log(numerator / (freq + 1))

    def transform(self, documents):
        """Return one ``{term: tf-idf weight}`` dict per document.

        Terms that were not seen during ``fit`` are kept in the result with
        a weight of 0.

        Args:
            documents: iterable of documents (each an iterable of terms).

        Returns:
            A list of dicts, in the same order as ``documents``.
        """
        tfidf_vectors = []
        for doc in documents:
            term_counts = defaultdict(int)
            for term in doc:
                term_counts[term] += 1

            # Every counted term has count >= 1, so the logarithm is
            # always defined and no zero-count branch is needed.
            tfidf_vectors.append({
                term: (1 + math.log(count)) * self.idf.get(term, 0)
                for term, count in term_counts.items()
            })

        return tfidf_vectors

    def fit_transform(self, documents):
        """Fit on ``documents`` and return their TF-IDF vectors.

        The collection is traversed twice (once to fit, once to transform),
        so a one-shot iterable is materialized into a list first. The
        individual documents must themselves be re-iterable.
        """
        if not isinstance(documents, (list, tuple)):
            documents = list(documents)
        self.fit(documents)
        return self.transform(documents)

def preprocess_document(doc):
    """Return the lowercased tokens of ``doc`` that are purely alphabetic
    and longer than two characters, in their original order."""
    kept = []
    for token in doc:
        # Skip anything containing non-letters as well as very short tokens.
        if not token.isalpha() or len(token) <= 2:
            continue
        kept.append(token.lower())
    return kept

# Tokenize and clean the first 100 Treebank files, one token list per file.
documents = list(
    map(preprocess_document, map(treebank.words, treebank.fileids()[:100]))
)

# Learn IDF weights from the corpus and vectorize it in the same pass.
vectorizer = TFIDFVectorizer()
tfidf_vectors = vectorizer.fit_transform(documents)

# Corpus summary.
print(f"Number of documents: {len(documents)}")
print(f"Vocabulary size: {len(vectorizer.vocab)}")

# Rank the first document's terms from highest to lowest TF-IDF score.
# The sort is stable, so equal scores keep the vector's own term order.
first_doc_vector = tfidf_vectors[0]
sorted_terms = list(first_doc_vector.items())
sorted_terms.sort(key=lambda pair: pair[1], reverse=True)

print("\nTop 10 terms in first document by TF-IDF score:")
for term, score in sorted_terms[:10]:
    print(f"{term}: {score:.4f}")

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\член\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


Number of documents: 100
Vocabulary size: 6344

Top 10 terms in first document by TF-IDF score:
vinken: 6.6405
pierre: 3.9220
elsevier: 3.9220
dutch: 3.9220
join: 3.2288
nonexecutive: 2.8234
publishing: 2.8234
old: 1.9761
director: 1.6707
board: 1.5241
