### IMPORTING LIBRARIES

In [1]:
import re
import nltk
import sys
import pandas as pd
import spacy
import gensim
from gensim.models import Word2Vec,FastText
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, ne_chunk
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [2]:
# Tokenization (splitting text into words/sentences)
nltk.data.path.append("C:/Users/DELL8/AppData/Roaming/nltk_data")
nltk.download('punkt', download_dir="C:/Users/DELL8/AppData/Roaming/nltk_data")


[nltk_data] Downloading package punkt to
[nltk_data]     C:/Users/DELL8/AppData/Roaming/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
nltk.download('stopwords')  # Removing common words (e.g., "the", "is", "and")
nltk.download('averaged_perceptron_tagger')  # POS tagging (identifying nouns, verbs, adjectives)
nltk.download('wordnet')  # WordNet database for synonyms and lemmatization
nltk.download('maxent_ne_chunker')  # Named Entity Recognition (NER) to identify names/locations
nltk.download('words')  # Dictionary of valid English words (used for spell checking, NER)
nltk.download('omw-1.4')  # Open Multilingual WordNet (support for multiple languages)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL8\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL8\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL8\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\DELL8\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\DELL8\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\DELL8\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already

True

In [4]:
stop_words = set(stopwords.words('english'))

#### READ DATA

In [5]:
# Define file path
file_path = "D:\wiki file\enwiki-latest-abstract3.xml"

text_data = []
with open(file_path, "r", encoding="utf-8") as file:
    for _ in range(10000):  
        text_data.append(file.readline().strip())


#### Cleaning

In [6]:
import re
raw_text = " ".join(text_data)  
clean_text = re.sub(r"<.*?>", " ", raw_text)  
clean_text = re.sub(r"https?://\S+", " ", clean_text) 
clean_text = re.sub(r"[^a-zA-Z\s]", " ", clean_text)  




In [7]:


print("Raw Text (First 500 chars):", clean_text[:500])


Raw Text (First 500 chars):      Wikipedia  Diego Maradona stadium       Diego Maradona stadium can refer to       All article disambiguation pages        All disambiguation pages        Place name disambiguation pages        Short description is different from Wikidata             Wikipedia  White Stone       White Stone may refer to       See also             Wikipedia  Yes Tor         grid ref UK   SX            In popular culture        References        External links             Wikipedia  Watermelon Man  composition


#### Stopword Removal

#### Definition 
Stopword removal eliminates common words (e.g., "the," "and," "is") that add little meaning to text analysis, improving model efficiency. This helps focus on informative words, enhancing the accuracy of NLP tasks like vectorization and classification.


In [12]:
original_texts = []
stopword_removed_texts = []

with open(file_path, "r", encoding="utf-8") as file:
    for _ in range(10000):  
        line = file.readline().strip()

        clean_line = re.sub(r"<.*?>", " ", line)
        clean_line = re.sub(r"https?://\S+", " ", clean_line)
        clean_line = re.sub(r"[^a-zA-Z\s]", " ", clean_line)

        words = clean_line.lower().split()

        if not words:
            continue

        # Stopword Removal
        filtered_words = [word for word in words if word not in stop_words]
        
        original_texts.append(clean_line)
        stopword_removed_texts.append(" ".join(filtered_words))

In [13]:
# Create a DataFrame
df_stopwords = pd.DataFrame({"Original Text": original_texts, "Without Stopwords": stopword_removed_texts})
df_stopwords.head()


Unnamed: 0,Original Text,Without Stopwords
0,Wikipedia Diego Maradona stadium,wikipedia diego maradona stadium
1,Diego Maradona stadium can refer to,diego maradona stadium refer
2,All article disambiguation pages,article disambiguation pages
3,All disambiguation pages,disambiguation pages
4,Place name disambiguation pages,place name disambiguation pages


In [14]:
all_cleaned_text = " ".join(stopword_removed_texts)
word_counts = Counter(all_cleaned_text.split())
df_word_freq = pd.DataFrame(word_counts.items(), columns=["Word", "Score"]).sort_values(by="Score", ascending=False)
df_word_freq.head(10)

Unnamed: 0,Word,Score
0,wikipedia,732
27,references,616
28,external,404
29,links,404
18,also,237
17,see,213
240,history,150
195,career,122
8,place,115
425,birth,107


### Observation
Stopword removal filters out common words like **"the", "is", "in"**, making text **more meaningful and concise**. For example,  
 **Before:** *"The Diego Maradona stadium is used for international football matches."*  
 **After:** *"diego maradona stadium used international football"*  

This process **removes noise**, keeps **important words**, and improves **text analysis for NLP and machine learning**.

####  WordNet

#### **WordNet Definition:**  
WordNet is a lexical database for the English language that groups words into sets of synonyms called synsets, providing short definitions and usage examples. It captures relationships between words, such as synonyms, antonyms, hypernyms, and hyponyms, making it useful for natural language processing (NLP) tasks like text analysis, word similarity, and language modeling.

In [15]:
original_texts = []
wordnet_transformed = []

with open(file_path, "r", encoding="utf-8") as file:
    for _ in range(10000):  
        line = file.readline().strip()

        clean_line = re.sub(r"<.*?>", " ", line)
        clean_line = re.sub(r"https?://\S+", " ", clean_line)
        clean_line = re.sub(r"[^a-zA-Z\s]", " ", clean_line)

        words = clean_line.lower().split()

        if not words:
            continue

        synonyms = []
        for word in words:
            syns = wordnet.synsets(word)
            if syns:
                synonyms.append(syns[0].lemmas()[0].name()) 
            else:
                synonyms.append(word)


        original_texts.append(clean_line)
        wordnet_transformed.append(" ".join(synonyms))




In [16]:
df_wordnet = pd.DataFrame({"Original Text": original_texts, "WordNet Synonyms": wordnet_transformed})
df_wordnet.head(10)

Unnamed: 0,Original Text,WordNet Synonyms
0,Wikipedia Diego Maradona stadium,wikipedia diego maradona stadium
1,Diego Maradona stadium can refer to,diego maradona stadium can mention to
2,All article disambiguation pages,all article disambiguation page
3,All disambiguation pages,all disambiguation page
4,Place name disambiguation pages,topographic_point name disambiguation page
5,Short description is different from Wikidata...,short description be different from wikidata
6,Wikipedia White Stone,wikipedia White rock
7,White Stone may refer to,White rock May mention to
8,See also,see besides
9,Wikipedia Yes Tor,wikipedia yes tor


In [17]:
all_wordnet_text = " ".join(wordnet_transformed)
word_counts = Counter(all_wordnet_text.split())

df_wordnet_freq = pd.DataFrame(word_counts.items(), columns=["Word", "Score"]).sort_values(by="Score", ascending=False)
df_wordnet_freq.head(10)

Unnamed: 0,Word,Score
64,the,811
0,wikipedia,732
5,mention,667
72,of,603
105,and,433
34,links,404
33,external,404
30,inch,373
44,angstrom,321
15,be,272


#### Observation
The WordNet-based synonym replacement process in the code refines text preprocessing by standardizing word representations. It first cleans raw text by removing XML tags, URLs, and non-alphabetic characters, ensuring only meaningful words are processed. Each word is then mapped to its most relevant synonym using WordNet, replacing it when a synonym exists while retaining the original word if no match is found. This transformation reduces vocabulary complexity, enhances semantic consistency, and improves the performance of NLP and machine learning models in tasks like sentiment analysis, text classification, and search optimization by ensuring similar words are treated uniformly.

#### lemmatizer

##### **Lemmatizer Definition:**  
A **lemmatizer** is a tool in natural language processing (NLP) that reduces words to their base or dictionary form (lemma) by considering the word’s meaning and context. Unlike stemming, which simply removes suffixes, lemmatization ensures that the root word is a valid English word. It is commonly used in text preprocessing for machine learning and NLP applications.

In [21]:
lemmatizer = WordNetLemmatizer()
original_texts = []
lemmatized_texts = []

for line in text_data:
    clean_line = re.sub(r"<.*?>", " ", line)
    clean_line = re.sub(r"https?://\S+", " ", clean_line)
    clean_line = re.sub(r"[^a-zA-Z\s]", " ", clean_line)

    words = clean_line.lower().split()

    if not words:
        continue

    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    original_texts.append(clean_line)
    lemmatized_texts.append(" ".join(lemmatized_words))




In [22]:
df_lemmatized = pd.DataFrame({"Original Text": original_texts, "Lemmatized Text": lemmatized_texts})
df_lemmatized.head()

Unnamed: 0,Original Text,Lemmatized Text
0,Wikipedia Diego Maradona stadium,wikipedia diego maradona stadium
1,Diego Maradona stadium can refer to,diego maradona stadium can refer to
2,All article disambiguation pages,all article disambiguation page
3,All disambiguation pages,all disambiguation page
4,Place name disambiguation pages,place name disambiguation page


In [23]:
all_lemmatized_text = " ".join(lemmatized_texts)
word_counts = Counter(all_lemmatized_text.split())

df_lemmatized_freq = pd.DataFrame(word_counts.items(), columns=["Word", "Score"]).sort_values(by="Score", ascending=False)
df_lemmatized_freq.head(10)

Unnamed: 0,Word,Score
65,the,811
0,wikipedia,732
33,reference,618
73,of,603
106,and,433
35,link,409
34,external,404
45,a,392
30,in,372
15,is,250


In [24]:
lemmatized_full_text = " ".join(lemmatized_texts)
print("Lemmatized Text (First 500 chars):", lemmatized_full_text[:500])

Lemmatized Text (First 500 chars): wikipedia diego maradona stadium diego maradona stadium can refer to all article disambiguation page all disambiguation page place name disambiguation page short description is different from wikidata wikipedia white stone white stone may refer to see also wikipedia yes tor grid ref uk sx in popular culture reference external link wikipedia watermelon man composition length herbie hancock version mongo santamar a version chart performance herbie hancock version other version sample personnel ref


#### observation
Lemmatization significantly enhances text processing by converting words to their base or dictionary form while preserving their meanings. Unlike stemming, which often trims words without considering context, lemmatization ensures grammatical correctness. In the processed Wikipedia dataset, words like *running* become *run* and *better* becomes *good*, improving consistency for NLP tasks. The frequency distribution also changes, as different word variations merge into a single root form, reducing redundancy. This is crucial in machine learning applications like text classification and sentiment analysis, where standardized vocabulary enhances model accuracy and efficiency.

#### count_vectorizer

##### **Count Vectorizer Definition:**  
A **Count Vectorizer** is a technique in Natural Language Processing (NLP) that converts text data into a matrix of token counts. It represents each document as a vector, where each feature corresponds to a unique word, and the value represents the word's frequency in the document. This method is widely used in text classification, sentiment analysis, and other machine learning applications involving textual data.

In [25]:
cleaned_text = []
for line in text_data:
    clean_line = re.sub(r"<.*?>", " ", line)
    clean_line = re.sub(r"https?://\S+", " ", clean_line)
    clean_line = re.sub(r"[^a-zA-Z\s]", " ", clean_line)
    cleaned_text.append(clean_line)

final_clean_text = " ".join(cleaned_text)
count_vectorizer = CountVectorizer(max_features=10000)
count_vectors = count_vectorizer.fit_transform([final_clean_text])

feature_names = count_vectorizer.get_feature_names_out()
word_frequencies = count_vectors.toarray().flatten()
vectorized_df = pd.DataFrame(count_vectors.toarray(), columns=feature_names)


In [26]:
feature_names[:10]

array(['ab', 'abandoned', 'abandonment', 'abapeba', 'abba', 'abbreviated',
       'abc', 'abd', 'abduction', 'aberdeen'], dtype=object)

In [27]:
vectorized_df[:10]

Unnamed: 0,ab,abandoned,abandonment,abapeba,abba,abbreviated,abc,abd,abduction,aberdeen,...,zeus,ziffer,zip,zone,zong,zophorame,zophoryctes,zoua,zubayrids,zuleta
0,1,1,1,1,2,2,1,7,1,1,...,2,2,1,1,1,1,1,1,1,1


In [28]:
score_df = pd.DataFrame({"Word": feature_names, "score": word_frequencies})
score_df = score_df.sort_values(by="score", ascending=False)
score_df.head(10)

Unnamed: 0,Word,score
5200,the,811
5681,wikipedia,732
4329,references,616
3641,of,603
238,and,433
2973,links,404
1877,external,404
2540,in,372
2658,is,250
182,also,237


#### Observation
The Count Vectorizer converts cleaned text into a numerical format by counting word occurrences. It extracts the **top 10,000 most frequent words**, creating a matrix where each column represents a word and its frequency. The most common words are selected based on occurrence, making it useful for **text classification, topic modeling, and sentiment analysis** in machine learning.

#### Tf-Idf_vectorizer

**TF-IDF Vectorizer:** It transforms text data into numerical representation by computing Term Frequency-Inverse Document Frequency (TF-IDF) scores, highlighting important words while reducing the weight of common words, making it useful for text mining and information retrieval.

In [49]:
cleaned_text = []
for line in text_data:
    clean_line = re.sub(r"<.*?>", " ", line) 
    clean_line = re.sub(r"https?://\S+", " ", clean_line)  
    clean_line = re.sub(r"[^a-zA-Z\s]", " ", clean_line)  
    cleaned_text.append(clean_line.lower())  

final_clean_text = " ".join(cleaned_text)

tfidf_vectorizer = TfidfVectorizer(max_features=10000)
tfidf_vectors = tfidf_vectorizer.fit_transform([final_clean_text])

feature_names1 = tfidf_vectorizer.get_feature_names_out()
word_frequencies = tfidf_vectors.toarray().flatten()
vectorized_df = pd.DataFrame(tfidf_vectors.toarray(), columns=feature_names)

In [50]:
feature_names1[:10]

array(['ab', 'abandoned', 'abandonment', 'abapeba', 'abba', 'abbreviated',
       'abc', 'abd', 'abduction', 'aberdeen'], dtype=object)

In [51]:
vectorized_df

Unnamed: 0,ab,abandoned,abandonment,abapeba,abba,abbreviated,abc,abd,abduction,aberdeen,...,zeus,ziffer,zip,zone,zong,zophorame,zophoryctes,zoua,zubayrids,zuleta
0,0.000569,0.000569,0.000569,0.000569,0.001138,0.001138,0.000569,0.003984,0.000569,0.000569,...,0.001138,0.001138,0.000569,0.000569,0.000569,0.000569,0.000569,0.000569,0.000569,0.000569


In [52]:
value_df = pd.DataFrame({"Word": feature_names1, "score": word_frequencies})
value_df= score_df.sort_values(by="score", ascending=False)
value_df.head(10)

Unnamed: 0,Word,score
5200,the,811
5681,wikipedia,732
4329,references,616
3641,of,603
238,and,433
2973,links,404
1877,external,404
2540,in,372
2658,is,250
182,also,237


#### Observation 
The TF-IDF vectorization process transforms text data into numerical representations by weighting word importance based on term frequency (TF) and inverse document frequency (IDF). Unlike Count Vectorization, which represents raw word counts as integers, TF-IDF normalizes these counts, resulting in decimal values that highlight informative words while down-weighting common ones. If many words have similar TF-IDF scores, it indicates uniform distribution across the dataset. To improve variation, multiple documents should be used instead of a single merged text, and `max_df` and `min_df` parameters can help filter overly common or rare terms, enhancing feature significance for machine learning models.

#### Word2Vec CBOW Model

##### **Define**:
A neural network-based word embedding technique that predicts a target word from its surrounding context, generating dense vectors that capture semantic relationships in text data.

In [30]:
cleaned_text = []
for line in text_data:
    clean_line = re.sub(r"<.*?>", " ", line)  
    clean_line = re.sub(r"https?://\S+", " ", clean_line)  
    clean_line = re.sub(r"[^a-zA-Z\s]", " ", clean_line) 
    cleaned_text.append(clean_line.lower())  

tokenized_sentences = [sentence.split() for sentence in cleaned_text if sentence]

word2vec_cbow = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=2, workers=4, sg=0)

word2vec_cbow.save("word2vec_cbow.model")

vocabulary_size = len(word2vec_cbow.wv)
vocabulary_words = list(word2vec_cbow.wv.index_to_key)[:10]


In [31]:
vocabulary_size 

2284

In [32]:
vocabulary_words[:10]

['the',
 'wikipedia',
 'references',
 'of',
 'and',
 'links',
 'external',
 'in',
 'a',
 'is']

In [33]:

word_to_check = "wikipedia"  # Choose a word from vocabulary
if word_to_check in word2vec_cbow.wv:
     similar_words = word2vec_cbow.wv.most_similar(word_to_check, topn=5)
     similar_words
else:
     word_to_check




In [34]:
similar_words

[('of', 0.9993048906326294),
 ('the', 0.9992998838424683),
 ('and', 0.9992989897727966),
 ('a', 0.9992642402648926),
 ('in', 0.999263346195221)]

In [35]:
word_vectors = {word: word2vec_cbow.wv[word] for word in word2vec_cbow.wv.index_to_key}
df_word2vec = pd.DataFrame.from_dict(word_vectors, orient='index')

In [36]:
df_word2vec.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
the,-0.38539,0.285547,0.266091,0.495479,-0.089024,-1.021536,0.430143,1.147418,-0.467211,-0.273426,...,0.188782,-0.107095,0.078021,0.025839,0.796384,0.397781,-0.081653,-0.330557,0.025315,0.156305
wikipedia,-0.175684,0.127861,0.119228,0.215319,-0.027707,-0.448489,0.184707,0.501727,-0.20397,-0.126046,...,0.084776,-0.04857,0.033271,0.00413,0.339736,0.174038,-0.026366,-0.14183,-0.000153,0.068599
references,-0.000344,0.003415,-0.006516,-0.000882,0.007603,0.00632,-0.003209,0.003781,-0.008775,0.005956,...,-0.004288,0.005623,0.009276,-0.004102,0.008772,0.005767,0.005803,0.000201,0.008236,-0.00684
of,-0.300644,0.226226,0.199073,0.366939,-0.055515,-0.77659,0.325622,0.87358,-0.344989,-0.213739,...,0.135479,-0.082839,0.057388,0.01609,0.60898,0.297687,-0.048295,-0.251821,0.017238,0.110806
and,-0.300356,0.219499,0.191082,0.370471,-0.056502,-0.772723,0.326256,0.875517,-0.358209,-0.200261,...,0.150876,-0.088963,0.062072,0.016601,0.606711,0.293165,-0.048528,-0.241582,0.017149,0.118267


#### Observation
The **Word2Vec CBOW model** efficiently learns word representations by predicting target words from surrounding context, mapping words to numerical vectors while preserving semantic meaning. When trained on Wikipedia data, it captures relationships between words, making it useful for **synonym detection, sentiment analysis, topic modeling, and recommendation systems**. Frequent words like "the" and "of" may have similar vectors due to shared contexts, revealing linguistic patterns that enhance **document similarity analysis and other NLP applications**.

#### FastText Model

**Define:** An extension of Word2Vec that represents words as subword n-grams, allowing it to handle rare and misspelled words better. It is useful for tasks like text classification, word embeddings, and handling out-of-vocabulary words in NLP.

In [37]:
cleaned_text = []
for line in text_data:
    clean_line = re.sub(r"<.*?>", " ", line)  
    clean_line = re.sub(r"https?://\S+", " ", clean_line)  
    clean_line = re.sub(r"[^a-zA-Z\s]", " ", clean_line) 
    cleaned_text.append(clean_line.lower())  

tokenized_sentences = [sentence.split() for sentence in cleaned_text if sentence]
fasttext_model = FastText(sentences=tokenized_sentences, vector_size=100, window=5, min_count=2, workers=4)
fasttext_model.save("fasttext.model")

vocabulary_size = len(fasttext_model.wv)
vocabulary_words = list(fasttext_model.wv.index_to_key)[:10] 

In [38]:
vocabulary_size

2284

In [39]:
vocabulary_words

['the',
 'wikipedia',
 'references',
 'of',
 'and',
 'links',
 'external',
 'in',
 'a',
 'is']

In [40]:
word_to_check = "wikipedia"
if word_to_check in fasttext_model.wv:
     similar_words = fasttext_model.wv.most_similar(word_to_check, topn=5)
     similar_words
else:
     word_to_check

In [41]:
similar_words

[('international', 0.9999849796295166),
 ('station', 0.9999843835830688),
 ('national', 0.9999843239784241),
 ('publication', 0.999984085559845),
 ('expedition', 0.9999829530715942)]

In [42]:
word_vectors = {word: fasttext_model.wv[word] for word in fasttext_model.wv.index_to_key}
df_fasttext = pd.DataFrame.from_dict(word_vectors, orient='index')
df_fasttext.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
the,-0.535637,1.147564,-0.591173,0.477393,0.052769,0.085065,0.990745,0.172422,0.720564,-1.208794,...,-0.95011,-0.023041,0.112896,0.405702,-0.778697,0.887925,-0.171397,-0.663371,0.209901,0.448828
wikipedia,-0.21107,0.454074,-0.232049,0.187324,0.02059,0.033715,0.389426,0.066042,0.284584,-0.479298,...,-0.377023,-0.006668,0.043672,0.160568,-0.307041,0.351139,-0.069604,-0.259559,0.082919,0.177189
references,-0.13461,0.285926,-0.147685,0.120408,0.014089,0.022382,0.248421,0.045221,0.17911,-0.302662,...,-0.23971,-0.003927,0.028201,0.102699,-0.196141,0.224706,-0.042293,-0.165619,0.052345,0.112619
of,-0.37752,0.814007,-0.418979,0.333845,0.045157,0.061924,0.695783,0.123879,0.51343,-0.85387,...,-0.674944,-0.012907,0.081634,0.287607,-0.547476,0.629127,-0.122063,-0.469286,0.147936,0.313336
and,-0.428046,0.911175,-0.464959,0.379916,0.042159,0.072153,0.78229,0.132726,0.571708,-0.951361,...,-0.753387,-0.018835,0.092998,0.324872,-0.612457,0.706031,-0.137475,-0.519714,0.1666,0.356908


#### observation
FastText is trained on tokenized sentences from Wikipedia data, creating dense word embeddings with **100-dimensional vectors**. The model considers **subword units (character n-grams)**, allowing it to handle out-of-vocabulary (OOV) words better than Word2Vec. The trained model is saved as `"fasttext.model"`, and key outputs include the **vocabulary size**, **sample words**, and **most similar words** to `"wikipedia"`. Since FastText captures morphological similarities, it helps in NLP tasks like **text classification, information retrieval, and named entity recognition (NER)** by generating meaningful word representations even for unseen words.