### IMPORTING LIBRARIES

In [158]:
import re
import nltk
import sys
import pandas as pd
import spacy
import gensim
from gensim.models import Word2Vec,FastText
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, ne_chunk
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [159]:
# Tokenization (splitting text into words/sentences)
# The same directory is registered on NLTK's search path and used as the
# download target for the 'punkt' package.
nltk_data_dir = "C:/Users/DELL8/AppData/Roaming/nltk_data"
nltk.data.path.append(nltk_data_dir)
nltk.download('punkt', download_dir=nltk_data_dir)


[nltk_data] Downloading package punkt to
[nltk_data]     C:/Users/DELL8/AppData/Roaming/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [160]:
# NLTK packages to fetch, in download order, with a note on what each provides.
nltk_resources = {
    'stopwords': 'Removing common words (e.g., "the", "is", "and")',
    'averaged_perceptron_tagger': 'POS tagging (identifying nouns, verbs, adjectives)',
    'wordnet': 'WordNet database for synonyms and lemmatization',
    'maxent_ne_chunker': 'Named Entity Recognition (NER) to identify names/locations',
    'words': 'Dictionary of valid English words (used for spell checking, NER)',
    'omw-1.4': 'Open Multilingual WordNet (support for multiple languages)',
}

for resource_name in nltk_resources:
    download_status = nltk.download(resource_name)

# Status of the final download, shown as the cell output
download_status


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL8\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL8\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL8\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\DELL8\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\DELL8\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\DELL8\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already

True

In [161]:
# English stopword list from NLTK, held as a set for fast membership checks
stop_words = {*stopwords.words('english')}

#### READ DATA

In [162]:
# Define file path.
# A raw string keeps the Windows backslashes literal; the plain literal only
# worked because "\w" and "\e" happen not to be recognised escape sequences.
file_path = r"D:\wiki file\enwiki-latest-abstract3.xml"

# Maximum number of leading lines to load from the file
max_lines = 10000

# Collect the stripped lines; iterating the file stops cleanly at end-of-file
# instead of appending empty strings when the file has fewer than max_lines lines.
text_data = []
with open(file_path, "r", encoding="utf-8") as file:
    for line_number, line in enumerate(file):
        if line_number >= max_lines:
            break
        text_data.append(line.strip())


#### Cleaning

In [163]:
# Build one cleaned corpus string from the loaded lines.
# (`re` is already imported in the imports cell at the top of the notebook,
# so it is not re-imported here.)
raw_text = " ".join(text_data)

# Substitutions are applied in order: markup tags, then URLs, then anything
# that is neither an ASCII letter nor whitespace. Each match becomes a space.
cleaning_patterns = (
    r"<.*?>",
    r"https?://\S+",
    r"[^a-zA-Z\s]",
)

clean_text = raw_text
for cleaning_pattern in cleaning_patterns:
    clean_text = re.sub(cleaning_pattern, " ", clean_text)




In [164]:


# Preview the start of the cleaned corpus; the label now names what is printed
# (`clean_text`, not the raw text).
print("Cleaned Text (First 500 chars):", clean_text[:500])


#### Stopword Removal

In [165]:
# Simple whitespace-based tokenization of the cleaned corpus.
# `clean_text` comes from the cleaning cell; the previous name `text` is not
# defined anywhere in the visible notebook and fails on a fresh kernel.
words = clean_text.lower().split()
stop_words = set(stopwords.words('english'))  # Load stopwords
# Keep alphanumeric tokens that are not in the stopword set
filtered_words = [word for word in words if word.isalnum() and word not in stop_words]

In [166]:
# Per-line stopword removal.
# Works on the lines already loaded into `text_data` (the same first lines of
# the file, stripped) rather than opening and reading the file a second time.
original_texts = []
stopword_removed_texts = []

for line in text_data:
    # Replace markup tags, URLs, and non-letter characters with spaces
    clean_line = re.sub(r"<.*?>", " ", line)
    clean_line = re.sub(r"https?://\S+", " ", clean_line)
    clean_line = re.sub(r"[^a-zA-Z\s]", " ", clean_line)

    words = clean_line.lower().split()

    # Lines with no tokens left after cleaning are skipped
    if not words:
        continue

    # Stopword Removal
    filtered_words = [word for word in words if word not in stop_words]

    original_texts.append(clean_line)
    stopword_removed_texts.append(" ".join(filtered_words))

In [167]:
# Create a DataFrame
df_stopwords = pd.DataFrame({"Original Text": original_texts, "Without Stopwords": stopword_removed_texts})
df_stopwords.head()


Unnamed: 0,Original Text,Without Stopwords
0,Wikipedia Diego Maradona stadium,wikipedia diego maradona stadium
1,Diego Maradona stadium can refer to,diego maradona stadium refer
2,All article disambiguation pages,article disambiguation pages
3,All disambiguation pages,disambiguation pages
4,Place name disambiguation pages,place name disambiguation pages


In [168]:
# Count every token in the stopword-free text and rank words by frequency
all_cleaned_text = " ".join(stopword_removed_texts)
word_counts = Counter(all_cleaned_text.split())

word_score_pairs = word_counts.items()
df_word_freq = pd.DataFrame(word_score_pairs, columns=["Word", "Score"])
df_word_freq = df_word_freq.sort_values(by="Score", ascending=False)
df_word_freq.head(10)

Unnamed: 0,Word,Score
0,wikipedia,732
27,references,616
28,external,404
29,links,404
18,also,237
17,see,213
240,history,150
195,career,122
8,place,115
425,birth,107


### Observation
Stopword removal filters out common words like **"the", "is", "in"**, making text **more meaningful and concise**. For example,  
 **Before:** *"The Diego Maradona stadium is used for international football matches."*  
 **After:** *"diego maradona stadium used international football matches"*  

This process **removes noise**, keeps **important words**, and improves **text analysis for NLP and machine learning**.

####  WordNet

In [169]:
# WordNet substitution: each token is replaced by the first lemma name of its
# first synset, or kept unchanged when WordNet returns no synset for it.
# Works on the lines already loaded into `text_data` rather than re-reading the file.
original_texts = []
wordnet_transformed = []

# Replacement already computed for each distinct token, so repeated words
# do not trigger another WordNet lookup.
first_lemma_by_word = {}

for line in text_data:
    # Replace markup tags, URLs, and non-letter characters with spaces
    clean_line = re.sub(r"<.*?>", " ", line)
    clean_line = re.sub(r"https?://\S+", " ", clean_line)
    clean_line = re.sub(r"[^a-zA-Z\s]", " ", clean_line)

    words = clean_line.lower().split()

    # Lines with no tokens left after cleaning are skipped
    if not words:
        continue

    synonyms = []
    for word in words:
        if word not in first_lemma_by_word:
            syns = wordnet.synsets(word)
            first_lemma_by_word[word] = syns[0].lemmas()[0].name() if syns else word
        synonyms.append(first_lemma_by_word[word])

    original_texts.append(clean_line)
    wordnet_transformed.append(" ".join(synonyms))




In [170]:
# Pair each cleaned line with its WordNet-substituted version
wordnet_columns = {
    "Original Text": original_texts,
    "WordNet Synonyms": wordnet_transformed,
}
df_wordnet = pd.DataFrame(wordnet_columns)
df_wordnet.head(10)

Unnamed: 0,Original Text,WordNet Synonyms
0,Wikipedia Diego Maradona stadium,wikipedia diego maradona stadium
1,Diego Maradona stadium can refer to,diego maradona stadium can mention to
2,All article disambiguation pages,all article disambiguation page
3,All disambiguation pages,all disambiguation page
4,Place name disambiguation pages,topographic_point name disambiguation page
5,Short description is different from Wikidata...,short description be different from wikidata
6,Wikipedia White Stone,wikipedia White rock
7,White Stone may refer to,White rock May mention to
8,See also,see besides
9,Wikipedia Yes Tor,wikipedia yes tor


In [171]:
# Count every token in the WordNet-substituted text and rank words by frequency
all_wordnet_text = " ".join(wordnet_transformed)
word_counts = Counter(all_wordnet_text.split())

wordnet_score_pairs = word_counts.items()
df_wordnet_freq = pd.DataFrame(wordnet_score_pairs, columns=["Word", "Score"])
df_wordnet_freq = df_wordnet_freq.sort_values(by="Score", ascending=False)
df_wordnet_freq.head(10)

Unnamed: 0,Word,Score
64,the,811
0,wikipedia,732
5,mention,667
72,of,603
105,and,433
34,links,404
33,external,404
30,inch,373
44,angstrom,321
15,be,272


#### Observation
The WordNet-based synonym replacement process in the code attempts to standardize word representations. It first cleans raw text by removing XML tags, URLs, and non-alphabetic characters, so that only alphabetic tokens are processed. Each word is then replaced by the first lemma of its first WordNet synset, and the original word is retained if WordNet has no entry for it. Because this lookup ignores part of speech and context, it does merge some variants (*pages* → *page*, *refer* → *mention*), but it also mis-maps common function words: in the frequency table above, *in* has become *inch* and *a* has become *angstrom*. With proper word-sense disambiguation, this kind of normalization can reduce vocabulary complexity, enhance semantic consistency, and help NLP and machine learning models in tasks like sentiment analysis, text classification, and search optimization by ensuring similar words are treated uniformly; as implemented here, the output should be inspected before it is used downstream.

#### lemmatizer

In [172]:
# Per-line lemmatization of the loaded text.
original_texts = []
lemmatized_texts = []

# `lemmatizer` is not created in any earlier visible cell, so it is instantiated
# here from the WordNetLemmatizer class imported at the top of the notebook.
lemmatizer = WordNetLemmatizer()

for line in text_data:
    # Replace markup tags, URLs, and non-letter characters with spaces
    clean_line = re.sub(r"<.*?>", " ", line)
    clean_line = re.sub(r"https?://\S+", " ", clean_line)
    clean_line = re.sub(r"[^a-zA-Z\s]", " ", clean_line)

    words = clean_line.lower().split()

    # Lines with no tokens left after cleaning are skipped
    if not words:
        continue

    # lemmatize() is called without a part-of-speech argument
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    original_texts.append(clean_line)
    lemmatized_texts.append(" ".join(lemmatized_words))




In [173]:
# Pair each cleaned line with its lemmatized version
lemmatized_columns = {
    "Original Text": original_texts,
    "Lemmatized Text": lemmatized_texts,
}
df_lemmatized = pd.DataFrame(lemmatized_columns)
df_lemmatized.head()

Unnamed: 0,Original Text,Lemmatized Text
0,Wikipedia Diego Maradona stadium,wikipedia diego maradona stadium
1,Diego Maradona stadium can refer to,diego maradona stadium can refer to
2,All article disambiguation pages,all article disambiguation page
3,All disambiguation pages,all disambiguation page
4,Place name disambiguation pages,place name disambiguation page


In [174]:
# Count every token in the lemmatized text and rank words by frequency
all_lemmatized_text = " ".join(lemmatized_texts)
word_counts = Counter(all_lemmatized_text.split())

lemma_score_pairs = word_counts.items()
df_lemmatized_freq = pd.DataFrame(lemma_score_pairs, columns=["Word", "Score"])
df_lemmatized_freq = df_lemmatized_freq.sort_values(by="Score", ascending=False)
df_lemmatized_freq.head(10)

Unnamed: 0,Word,Score
65,the,811
0,wikipedia,732
33,reference,618
73,of,603
106,and,433
35,link,409
34,external,404
45,a,392
30,in,372
15,is,250


In [175]:
# Join the lemmatized lines into one string and print its first 500 characters
lemmatized_full_text = " ".join(lemmatized_texts)
lemmatized_preview = lemmatized_full_text[:500]
print("Lemmatized Text (First 500 chars):", lemmatized_preview)

#### observation
Lemmatization enhances text processing by converting words to their base or dictionary form while preserving their meanings. Unlike stemming, which often trims words without considering context, lemmatization returns valid dictionary words. Note that the lemmatizer is called here without a part-of-speech tag, so every token is treated as a noun: in the processed Wikipedia dataset, plural nouns are reduced (*pages* → *page*, *references* → *reference*, *links* → *link*), while verbs and adjectives are left unchanged (*is* stays *is*). Mapping *running* to *run* or *better* to *good* would require passing the matching part of speech (`pos="v"` or `pos="a"`). The frequency distribution also changes, as different word variations merge into a single root form, reducing redundancy. This is crucial in machine learning applications like text classification and sentiment analysis, where standardized vocabulary enhances model accuracy and efficiency.

#### count_vectorizer

In [176]:
# Patterns replaced by a space, applied innermost-first below:
# markup tags, then URLs, then anything that is not an ASCII letter or whitespace.
tag_pattern = re.compile(r"<.*?>")
url_pattern = re.compile(r"https?://\S+")
non_letter_pattern = re.compile(r"[^a-zA-Z\s]")

cleaned_text = [
    non_letter_pattern.sub(" ", url_pattern.sub(" ", tag_pattern.sub(" ", line)))
    for line in text_data
]

# All cleaned lines are joined and vectorized as a single document
final_clean_text = " ".join(cleaned_text)
count_vectorizer = CountVectorizer(max_features=10000)
count_vectors = count_vectorizer.fit_transform([final_clean_text])

feature_names = count_vectorizer.get_feature_names_out()

# Dense copy of the count matrix: flattened for per-word scores, and kept
# two-dimensional for the DataFrame view.
dense_counts = count_vectors.toarray()
word_frequencies = dense_counts.flatten()
vectorized_df = pd.DataFrame(dense_counts, columns=feature_names)


In [177]:
# Show the first ten entries of the CountVectorizer feature names
feature_names[:10]

array(['ab', 'abandoned', 'abandonment', 'abapeba', 'abba', 'abbreviated',
       'abc', 'abd', 'abduction', 'aberdeen'], dtype=object)

In [178]:
# Show up to the first ten rows of the count matrix
vectorized_df.head(10)

Unnamed: 0,ab,abandoned,abandonment,abapeba,abba,abbreviated,abc,abd,abduction,aberdeen,...,zeus,ziffer,zip,zone,zong,zophorame,zophoryctes,zoua,zubayrids,zuleta
0,1,1,1,1,2,2,1,7,1,1,...,2,2,1,1,1,1,1,1,1,1


In [179]:
# Rank vocabulary words by their count in the vectorized document
score_columns = {"Word": feature_names, "score": word_frequencies}
score_df = pd.DataFrame(score_columns).sort_values(by="score", ascending=False)
score_df.head(10)

Unnamed: 0,Word,score
5200,the,811
5681,wikipedia,732
4329,references,616
3641,of,603
238,and,433
2973,links,404
1877,external,404
2540,in,372
2658,is,250
182,also,237


#### Observation
The Count Vectorizer converts cleaned text into a numerical format by counting word occurrences. It keeps **up to the 10,000 most frequent words** (`max_features=10000`), creating a matrix where each column represents a word and each value is that word's frequency. Because all lines are joined into a single document here, the matrix has just one row holding corpus-wide counts; fitting on one document per line would give one row per line instead. This representation is useful for **text classification, topic modeling, and sentiment analysis** in machine learning.

In [180]:
# NOTE(review): in the visible notebook `word2vec_cbow` is first assigned in the
# "Word2Vec CBOW Model" cell that follows, so on a fresh kernel (Restart & Run All)
# this cell raises NameError. It should be moved after the training cell.
# Prints the first 20 keys of the model's vocabulary.
print("Sample words in vocabulary:", list(word2vec_cbow.wv.index_to_key)[:20])


#### Word2Vec CBOW Model

In [181]:
# Patterns replaced by a space, applied innermost-first below:
# markup tags, then URLs, then anything that is not an ASCII letter or whitespace.
tag_pattern = re.compile(r"<.*?>")
url_pattern = re.compile(r"https?://\S+")
non_letter_pattern = re.compile(r"[^a-zA-Z\s]")

# Cleaned, lower-cased copy of every loaded line
cleaned_text = [
    non_letter_pattern.sub(" ", url_pattern.sub(" ", tag_pattern.sub(" ", line))).lower()
    for line in text_data
]

# One token list per non-empty cleaned line
tokenized_sentences = [sentence.split() for sentence in cleaned_text if sentence]

# Train the model (sg=0) and persist it next to the notebook
word2vec_cbow = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    sg=0,
)
word2vec_cbow.save("word2vec_cbow.model")

# Summary values displayed by the following cells
keyed_vectors = word2vec_cbow.wv
vocabulary_size = len(keyed_vectors)
vocabulary_words = list(keyed_vectors.index_to_key)[:10]


In [182]:
# Display the vocabulary size computed in the preceding training cell
vocabulary_size 

2284

In [183]:
# `vocabulary_words` was already cut to ten entries in the training cell,
# so this slice shows the whole list.
vocabulary_words[:10]

['the',
 'wikipedia',
 'references',
 'of',
 'and',
 'links',
 'external',
 'in',
 'a',
 'is']

In [184]:

word_to_check = "wikipedia"  # Choose a word from vocabulary

# Always bind `similar_words`: the cell that follows displays it, and it would
# raise NameError if the word were missing and nothing had been assigned.
# (The bare expressions that used to sit inside the if/else produced no output.)
if word_to_check in word2vec_cbow.wv:
    similar_words = word2vec_cbow.wv.most_similar(word_to_check, topn=5)
else:
    similar_words = []




In [185]:
# Display the (word, similarity) pairs found for `word_to_check` by the Word2Vec model
similar_words

[('of', 0.9993590116500854),
 ('the', 0.9993279576301575),
 ('a', 0.999306857585907),
 ('was', 0.999255359172821),
 ('and', 0.9992365837097168)]

In [186]:
# Map every vocabulary word to its vector, then lay the vectors out one row per word
w2v_vectors = word2vec_cbow.wv
word_vectors = {word: w2v_vectors[word] for word in w2v_vectors.index_to_key}
df_word2vec = pd.DataFrame.from_dict(word_vectors, orient='index')

In [187]:
# Show the first rows of the Word2Vec vector table (one row per word)
df_word2vec.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
the,-0.505877,0.374658,0.112767,0.489854,-0.061394,-1.002207,0.488773,1.199319,-0.506801,-0.381399,...,0.326439,-0.005702,0.250999,0.170419,0.904084,0.334856,-0.086891,-0.24946,0.066714,-0.098359
wikipedia,-0.225289,0.165498,0.051778,0.211769,-0.015507,-0.4348,0.207854,0.519172,-0.21864,-0.169131,...,0.141831,-0.00386,0.106328,0.065268,0.381819,0.143996,-0.027592,-0.106726,0.017837,-0.039243
references,-2.4e-05,0.003191,-0.006743,-0.001256,0.007688,0.007161,-0.003605,0.002889,-0.008432,0.006089,...,-0.00445,0.005693,0.009195,-0.004097,0.008153,0.005453,0.005871,0.000409,0.008194,-0.007019
of,-0.47954,0.357905,0.100746,0.447843,-0.042915,-0.933549,0.450844,1.118255,-0.459176,-0.360654,...,0.295823,-0.008338,0.230432,0.15681,0.844464,0.306429,-0.06404,-0.235905,0.054583,-0.101981
and,-0.384057,0.280466,0.071057,0.356747,-0.034922,-0.738154,0.361695,0.89187,-0.37683,-0.273822,...,0.247538,-0.01039,0.191689,0.124068,0.671807,0.237048,-0.050297,-0.173161,0.04927,-0.075054


#### Observation
The **Word2Vec CBOW model** efficiently learns word representations by predicting target words from surrounding context, mapping words to numerical vectors while preserving semantic meaning. When trained on Wikipedia data, it captures relationships between words, making it useful for **synonym detection, sentiment analysis, topic modeling, and recommendation systems**. Frequent words like "the" and "of" may have similar vectors due to shared contexts, revealing linguistic patterns that enhance **document similarity analysis and other NLP applications**.

#### FastText Model

In [191]:
# Patterns replaced by a space, applied innermost-first below:
# markup tags, then URLs, then anything that is not an ASCII letter or whitespace.
tag_pattern = re.compile(r"<.*?>")
url_pattern = re.compile(r"https?://\S+")
non_letter_pattern = re.compile(r"[^a-zA-Z\s]")

# Cleaned, lower-cased copy of every loaded line
cleaned_text = [
    non_letter_pattern.sub(" ", url_pattern.sub(" ", tag_pattern.sub(" ", line))).lower()
    for line in text_data
]

# One token list per non-empty cleaned line
tokenized_sentences = [sentence.split() for sentence in cleaned_text if sentence]

# Train the FastText model and persist it next to the notebook
fasttext_model = FastText(
    sentences=tokenized_sentences,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)
fasttext_model.save("fasttext.model")

# Summary values displayed by the following cells
fasttext_vectors = fasttext_model.wv
vocabulary_size = len(fasttext_vectors)
vocabulary_words = list(fasttext_vectors.index_to_key)[:10]

In [192]:
# Display the vocabulary size computed in the preceding FastText training cell
vocabulary_size

2284

In [193]:
# Display the first ten vocabulary words stored by the FastText training cell
vocabulary_words

['the',
 'wikipedia',
 'references',
 'of',
 'and',
 'links',
 'external',
 'in',
 'a',
 'is']

In [206]:
word_to_check = "wikipedia"

# Always bind `similar_words`: the cell that follows displays it, and without an
# assignment on the else path it would show a stale value from the Word2Vec
# section or raise NameError.
# (The bare expressions that used to sit inside the if/else produced no output.)
if word_to_check in fasttext_model.wv:
    similar_words = fasttext_model.wv.most_similar(word_to_check, topn=5)
else:
    similar_words = []

In [207]:
# Display the (word, similarity) pairs found for `word_to_check` by the FastText model
similar_words

[('international', 0.9999842047691345),
 ('national', 0.999984085559845),
 ('station', 0.9999839663505554),
 ('expedition', 0.9999829530715942),
 ('publication', 0.9999828934669495)]

In [208]:
# Map every vocabulary word to its FastText vector, then lay the vectors out one row per word
ft_vectors = fasttext_model.wv
word_vectors = {word: ft_vectors[word] for word in ft_vectors.index_to_key}
df_fasttext = pd.DataFrame.from_dict(word_vectors, orient='index')
df_fasttext.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
the,-0.549158,1.143553,-0.596456,0.486956,0.060507,0.095754,1.004807,0.168418,0.7275,-1.229129,...,-0.956318,-0.025237,0.109406,0.396868,-0.791044,0.891996,-0.192547,-0.655607,0.206663,0.42899
wikipedia,-0.204623,0.42792,-0.221578,0.18083,0.022302,0.035797,0.373685,0.061046,0.271519,-0.460898,...,-0.359003,-0.007091,0.040103,0.14896,-0.295369,0.333775,-0.073865,-0.242573,0.077421,0.160182
references,-0.130237,0.268477,-0.140577,0.11567,0.015091,0.023505,0.237676,0.041854,0.170432,-0.290017,...,-0.227372,-0.00418,0.025941,0.094831,-0.187895,0.212854,-0.044905,-0.15446,0.048603,0.101314
of,-0.429241,0.898569,-0.468397,0.378014,0.055372,0.077033,0.782779,0.13399,0.574239,-0.962228,...,-0.752277,-0.016287,0.088254,0.311832,-0.61709,0.700275,-0.15161,-0.513414,0.160595,0.332671
and,-0.458993,0.949453,-0.490593,0.405358,0.050956,0.084037,0.829974,0.135896,0.603638,-1.012258,...,-0.792857,-0.021676,0.09444,0.332525,-0.651415,0.742039,-0.161193,-0.537828,0.171364,0.356782


#### observation
FastText is trained on tokenized sentences from Wikipedia data, creating dense word embeddings with **100-dimensional vectors**. The model considers **subword units (character n-grams)**, allowing it to handle out-of-vocabulary (OOV) words better than Word2Vec. The trained model is saved as `"fasttext.model"`, and key outputs include the **vocabulary size**, **sample words**, and **most similar words** to `"wikipedia"`. Since FastText captures morphological similarities, it helps in NLP tasks like **text classification, information retrieval, and named entity recognition (NER)** by generating meaningful word representations even for unseen words.