In [129]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import re
import math
from collections import Counter

In [130]:

# Preprocessing function to clean and normalize text
def preprocess_and_normalize(text):
    # Remove HTML tags and CSS styles
    soup = BeautifulSoup(text, 'html.parser')
    clean_content = soup.get_text()
    clean_content = re.sub(r'<style.*?</style>', '', clean_content, flags=re.DOTALL)
    clean_content = re.sub(r'<link.*?>', '', clean_content, flags=re.DOTALL)

    # Convert text to lowercase
    text = clean_content.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize the text into words
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    # Return preprocessed text as a single string
    return ' '.join(lemmatized_tokens)


In [131]:
# Preprocess and extract unique words from documents
documents = [
    "Lionel Andrés Messi, born on June 24, 1987, is an Argentine professional footballer. He plays as a forward and captains both Barcelona and the Argentina national team. Messi is widely regarded as one of the greatest football players of all time.",
    "Neymar da Silva Santos Júnior, commonly known as Neymar Jr., was born on February 5, 1992. He is a Brazilian professional footballer who plays as a forward for Paris Saint-Germain and the Brazil national team. Neymar is known for his dribbling, agility, and goal-scoring ability.",
    "Cristiano Ronaldo dos Santos Aveiro, born on February 5, 1985, is a Portuguese professional footballer. He is widely considered one of the greatest football players of all time. Ronaldo has won numerous awards and accolades throughout his career, including multiple FIFA Ballon d'Or awards.",
    "Sports encompass a wide range of physical activities or games that involve skill, competition, and physical exertion. Some popular sports include football, basketball, tennis, cricket, and athletics. Sports play a significant role in promoting physical fitness, teamwork, and personal development."
]

In [132]:
# Preprocess the documents
preprocessed_documents = [preprocess_and_normalize(doc) for doc in documents]


In [133]:
# Extract unique words from all documents
unique_words = set()
for doc in preprocessed_documents:
    unique_words.update(doc.split())

# Print unique words extracted from the documents
print("Unique Words:")
print(unique_words)
print('Number Of Unique Words : ', len(unique_words))

Unique Words:

{'basketball', 'play', 'promoting', 'football', 'argentina', 'paris', 'wide', 'game', 'ability', 'silva', 'argentine', 'numerous', 'widely', 'cristiano', '5', 'ballon', 'exertion', 'february', 'role', 'teamwork', 'jr', 'considered', 'commonly', 'dor', 'career', '24', 'include', 'captain', 'skill', 'sport', 'saintgermain', '1992', 'one', 'time', 'footballer', 'including', 'activity', 'accolade', 'known', 'june', 'dribbling', 'significant', 'multiple', '1987', 'agility', 'national', 'brazil', 'personal', 'fifa', 'player', 'tennis', '1985', 'do', 'athletics', 'encompass', 'involve', 'range', 'goalscoring', 'júnior', 'fitness', 'award', 'competition', 'da', 'brazilian', 'physical', 'andrés', 'forward', 'team', 'throughout', 'popular', 'greatest', 'neymar', 'messi', 'portuguese', 'santos', 'professional', 'born', 'ronaldo', 'regarded', 'lionel', 'barcelona', 'development', 'cricket', 'aveiro'}

Number Of Unique Words :  84


In [134]:
# Calculate TF-IDF using built-in functions (TfidfVectorizer from scikit-learn)
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the preprocessed documents to obtain the TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_documents)

# Get the feature names (unique words) from the TfidfVectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()

# Print the shape of the feature names array and the TF-IDF matrix
print('Shape of Feature Names:', feature_names.shape)
print('Shape of TF_IDF Matrix:', tfidf_matrix.shape)

# Print TF-IDF using built-in functions
print("\nTF-IDF using built-in functions:")
print("Feature Names:", feature_names)
print("TF-IDF Matrix:\n", tfidf_matrix.toarray())


Shape of Feature Names: (83,)

Shape of TF_IDF Matrix: (4, 83)



TF-IDF using built-in functions:

Feature Names: ['1985' '1987' '1992' '24' 'ability' 'accolade' 'activity' 'agility'

 'andrés' 'argentina' 'argentine' 'athletics' 'aveiro' 'award' 'ballon'

 'barcelona' 'basketball' 'born' 'brazil' 'brazilian' 'captain' 'career'

 'commonly' 'competition' 'considered' 'cricket' 'cristiano' 'da'

 'development' 'do' 'dor' 'dribbling' 'encompass' 'exertion' 'february'

 'fifa' 'fitness' 'football' 'footballer' 'forward' 'game' 'goalscoring'

 'greatest' 'include' 'including' 'involve' 'jr' 'june' 'júnior' 'known'

 'lionel' 'messi' 'multiple' 'national' 'neymar' 'numerous' 'one' 'paris'

 'personal' 'physical' 'play' 'player' 'popular' 'portuguese'

 'professional' 'promoting' 'range' 'regarded' 'role' 'ronaldo'

 'saintgermain' 'santos' 'significant' 'silva' 'skill' 'sport' 'team'

 'teamwork' 'tennis' 'throughout' 'time' 'wide' 'widely']

TF-IDF Matrix:

 [[0.         0.21816703 0.    

In [135]:
# Function to calculate term frequency (TF) for a term in a document
def term_frequency(term, document):
    # Count the occurrences of the term in the document
    word_counts = Counter(document.split())
    # Calculate the term frequency (TF) as the ratio of term occurrences to total words in the document
    return word_counts[term] / sum(word_counts.values())

In [136]:
# Function to calculate inverse document frequency (IDF) for a term across all documents
def inverse_document_frequency(term, documents):
    # Count the number of documents that contain the term
    num_documents_with_term = sum(1 for document in documents if term in document)
    # Calculate the IDF using the logarithm of the ratio of total documents to the number of documents containing the term
    return math.log(len(documents) / (1 + num_documents_with_term))

In [137]:
# Function to calculate TF-IDF for a term in a document
def tfidf(term, document, documents):
    # Calculate TF using the term_frequency function
    tf = term_frequency(term, document)
    # Calculate IDF using the inverse_document_frequency function
    idf = inverse_document_frequency(term, documents)
    # Return the product of TF and IDF as the TF-IDF score
    return tf * idf

In [138]:
# Calculate TF-IDF vectors for each document from scratch

# Initialize an empty list to store TF-IDF vectors for each document
tfidf_vectors = []

# Iterate through each preprocessed document
for doc in preprocessed_documents:
    # Initialize an empty dictionary to store TF-IDF scores for terms in the document
    document_vector = {}
    # Split the document into terms and iterate through each term
    for term in doc.split():
        # Calculate the TF-IDF score for the term in the current document
        score = tfidf(term, doc, preprocessed_documents)
        # Add the term and its TF-IDF score to the document vector dictionary
        document_vector[term] = score
    # Append the document vector dictionary to the list of TF-IDF vectors
    tfidf_vectors.append(document_vector)

# Print TF-IDF vectors calculated from scratch
print("\nTF-IDF from scratch:")
for i, vector in enumerate(tfidf_vectors, start=1):
    print(f"TF-IDF Vector for Document {i}: {vector}")




TF-IDF from scratch:

TF-IDF Vector for Document 1: {'lionel': 0.027725887222397813, 'andrés': 0.027725887222397813, 'messi': 0.055451774444795626, 'born': 0.0, 'june': 0.027725887222397813, '24': 0.027725887222397813, '1987': 0.027725887222397813, 'argentine': 0.027725887222397813, 'professional': 0.0, 'footballer': 0.0, 'play': -0.00892574205256839, 'forward': 0.011507282898071234, 'captain': 0.027725887222397813, 'barcelona': 0.027725887222397813, 'argentina': 0.027725887222397813, 'national': 0.011507282898071234, 'team': 0.0, 'widely': 0.011507282898071234, 'regarded': 0.027725887222397813, 'one': 0.011507282898071234, 'greatest': 0.011507282898071234, 'football': -0.00892574205256839, 'player': 0.011507282898071234, 'time': 0.011507282898071234}

TF-IDF Vector for Document 2: {'neymar': 0.07170488074758055, 'da': 0.02390162691586018, 'silva': 0.02390162691586018, 'santos': 0.009920071463854511, 'júnior': 0.02390162691586018, 'commonly': 0.02390162691586018, 'known': 0.047803253