In [191]:
!pip install wikipedia










In [192]:
import wikipedia
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import re
import math
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

In [193]:
# Function to fetch content from Wikipedia
def fetch_wikipedia_content(title):
    try:
        # Try to retrieve the Wikipedia page content for the given title
        page = wikipedia.page(title)
        # If successful, return the content of the page
        return page.content
    except wikipedia.exceptions.DisambiguationError as e:
        # Handle DisambiguationError if the title is ambiguous
        print(f"DisambiguationError: {e}")
    except wikipedia.exceptions.PageError as e:
        # Handle PageError if the title does not match any pages
        print(f"PageError: {e}")
    except Exception as e:
        # Handle any other unexpected exceptions
        print(f"Error: {e}")
    # Return None if content retrieval fails
    return None

In [194]:
# Preprocessing function to clean and normalize text
def preprocess_and_normalize(text):
    # Remove HTML tags and CSS styles
    soup = BeautifulSoup(text, 'html.parser')
    clean_content = soup.get_text()
    clean_content = re.sub(r'<style.*?</style>', '', clean_content, flags=re.DOTALL)
    clean_content = re.sub(r'<link.*?>', '', clean_content, flags=re.DOTALL)

    # Convert text to lowercase
    text = clean_content.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize the text into words
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    # Return preprocessed text as a single string
    return ' '.join(lemmatized_tokens)

In [195]:
# List of Wikipedia page titles
page_titles = [
    "Neymar Da Silva",
    "Tourism",
    "Cristiano Ronaldo",
    "Sport"
]

# Fetch content from Wikipedia and preprocess documents
preprocessed_documents = []
for title in page_titles:
    # Fetch content from Wikipedia for the current title
    content = fetch_wikipedia_content(title)
    # If content is retrieved successfully, preprocess and normalize it
    if content:
        preprocessed_documents.append(preprocess_and_normalize(content))


In [196]:
# Extract unique words from all documents
unique_words = set()
for doc in preprocessed_documents:
    unique_words.update(doc.split())

# Print unique words extracted from the documents
print("Unique Words:")
print(unique_words)
print('Number Of Unique Words : ', len(unique_words))

Unique Words:

{'raising', 'levante', 'selecting', 'monshausen', 'decadeglobal', 'tyrant', 'exposure', 'barbarense', '2010s', 'mohamed', 'annual', 'explains', 'felt', '120', 'package', 'effective', 'expectation', 'working', 'ensuring', 'premium', 'asymptomatic', '2022present', 'super', 'confirmed', 'provided', 'shulgi', 'allows', 'aged', 'apologiseon', 'borels', 'voluntourism', 'feather', 'luxury', 'abort', 'quarantined', 'statesman', 'brasileirão', 'rory', 'ablebodied', 'thomas', 'modrić', 'card', 'personality', 'analytics', 'martial', 'budapest', 'esports', 'croke', 'renan', 'country', '166', 'fifth', '196768', 'previous', 'discovered', 'parish', 'notably', '33', 'mendes', 'event', 'scheduled', 'englishman', 'emotional', 'leandro', 'bestestablished', 'marc', '14', 'namely', 'hunger', '2018on', 'fostering', 'decade', 'guide', 'booking', 'peter', 'either', 'hattrick', 'cristianos', 'meeting', 'stranded', 'firstteam', 'geographical', 'selfish', '90minute', 'anglais', 'cheat', '1220', 'f

In [197]:
# Calculate TF-IDF using built-in functions (TfidfVectorizer from scikit-learn)
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the preprocessed documents to obtain the TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_documents)

# Get the feature names (unique words) from the TfidfVectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()

# Print the shape of the feature names array and the TF-IDF matrix
print('Shape of Feature Names:', feature_names.shape)
print('Shape of TF_IDF Matrix:', tfidf_matrix.shape)

# Print TF-IDF using built-in functions
print("\nTF-IDF using built-in functions:")
print("Feature Names:", feature_names)
print("TF-IDF Matrix:\n", tfidf_matrix.toarray())


Shape of Feature Names: (5755,)

Shape of TF_IDF Matrix: (4, 5755)



TF-IDF using built-in functions:

Feature Names: ['00' '01' '01612387' ... 'ʁɔˈnaldu' 'τόρνος' '遊記文學']

TF-IDF Matrix:

 [[0.00559003 0.00559003 0.         ... 0.         0.         0.        ]

 [0.         0.         0.00366658 ... 0.         0.00366658 0.00366658]

 [0.         0.         0.         ... 0.00228398 0.         0.        ]

 [0.         0.         0.         ... 0.         0.         0.        ]]


In [198]:
# Create a set to store each document's TF-IDF vectors
document_tfidf_sets = []

# Iterate over each document and its corresponding TF-IDF vector
print("\nTF-IDF using built-in functions:")
for i, doc in enumerate(preprocessed_documents, start=1):
    # Print the TF-IDF Vector for the current document
    print(f"\nTF-IDF Vector for Document {i}:")

    # Create a set to store the TF-IDF vectors for the current document
    tfidf_set = set()

    # Iterate over each word and its corresponding TF-IDF value
    for word_index, word in enumerate(feature_names):
        # Access the TF-IDF value from the TF-IDF matrix
        tfidf_value = tfidf_matrix[i-1, word_index]

        # Check if the TF-IDF value is non-zero
        if tfidf_value > 0:
            # Add the word and its TF-IDF value to the set
            tfidf_set.add((word, tfidf_value))

    # Print the TF-IDF vector for the current document
    print(tfidf_set)

    # Add the TF-IDF set for the current document to the list of document sets
    document_tfidf_sets.append(tfidf_set)





TF-IDF using built-in functions:



TF-IDF Vector for Document 1:

{('touch', 0.006610861572952647), ('june', 0.030328352428554074), ('played', 0.01427216584873133), ('landing', 0.002203620524317549), ('playacting', 0.002795013217204638), ('silva', 0.013221723145905294), ('july', 0.02497629023527983), ('club', 0.13882809303200558), ('logically', 0.002795013217204638), ('international', 0.029171076921830977), ('winner', 0.007136082924365665), ('ballneymars', 0.002795013217204638), ('beber', 0.002795013217204638), ('series', 0.006610861572952647), ('star', 0.024239825767493037), ('equalised', 0.004407241048635098), ('henrique', 0.002795013217204638), ('mirim', 0.002795013217204638), ('million', 0.029171076921830977), ('reported', 0.010209876922640842), ('significant', 0.002203620524317549), ('distraught', 0.002795013217204638), ('cantona', 0.002203620524317549), ('montevideo', 0.002795013217204638), ('tipped', 0.002795013217204638), ('65', 0.0017840207310914162), ('securing', 0.0027950

In [199]:
# Function to calculate term frequency (TF) for a term in a document
def term_frequency(term, document):
    # Count the occurrences of the term in the document
    word_counts = Counter(document.split())
    # Calculate the term frequency (TF) as the ratio of term occurrences to total words in the document
    return word_counts[term] / sum(word_counts.values())

In [200]:
# Function to calculate inverse document frequency (IDF) for a term across all documents
def inverse_document_frequency(term, documents):
    # Count the number of documents that contain the term
    num_documents_with_term = sum(1 for document in documents if term in document)
    # Calculate the IDF using the logarithm of the ratio of total documents to the number of documents containing the term
    return math.log(len(documents) / (1 + num_documents_with_term))

In [201]:
# Function to calculate TF-IDF for a term in a document
def tfidf(term, document, documents):
    # Calculate TF using the term_frequency function
    tf = term_frequency(term, document)
    # Calculate IDF using the inverse_document_frequency function
    idf = inverse_document_frequency(term, documents)
    # Return the product of TF and IDF as the TF-IDF score
    return tf * idf

In [202]:
# Calculate TF-IDF vectors for each document from scratch
tfidf_vectors = []

# Iterate over preprocessed documents to calculate TF-IDF vectors
for doc in preprocessed_documents:
    # Create a dictionary to store TF-IDF values for each term in the document
    doc_tfidf = {}

    # Iterate over each term in the document
    for term in doc.split():
        # Calculate TF-IDF for the current term in the document
        tfidf_value = tfidf(term, doc, preprocessed_documents)
        # Add the term and its corresponding TF-IDF value to the dictionary
        doc_tfidf[term] = tfidf_value

    # Append the TF-IDF vector for the current document to the list
    tfidf_vectors.append(doc_tfidf)

# Print TF-IDF vectors calculated from scratch
print("\nTF-IDF from scratch:")
for i, vector in enumerate(tfidf_vectors, start=1):
    print(f"TF-IDF Vector for Document {i}: {vector}")




TF-IDF from scratch:

TF-IDF Vector for Document 1: {'neymar': 0.011343078335500184, 'da': -0.0001117253980794641, 'silva': 0.00021605863496190826, 'santos': 0.0016924593072016145, 'júnior': 0.00043381348138687276, 'born': 7.201954498730275e-05, '5': -0.0002234507961589282, 'february': 0.0004321172699238165, '1992': 3.6009772493651376e-05, 'also': -0.001061391281754909, 'known': -5.586269903973205e-05, 'brazilian': 0.002429355495766487, 'professional': -0.0001955194466390622, 'footballer': 0.0004681270424174679, 'play': -0.0004469015923178564, 'forward': 0.00021605863496190826, 'saudi': 0.00021605863496190826, 'pro': -0.0001955194466390622, 'league': -0.0018714004178310237, 'club': 0.0022686156671000365, 'al': -0.0001955194466390622, 'hilal': 0.0002520684074555596, 'brazil': -0.0021507139130296843, 'national': -0.0002793134951986603, 'team': -0.000977597233195311, 'regarded': 0.0, 'one': -0.0008379404855959809, 'best': 0.0, 'player': 0.0, 'generation': 0.0, 'renowned': 3.600977249365