Tokenization

In [7]:
import os
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the 'punkt' tokenizer data through nltk's downloader
nltk.download('punkt')

# Folder (relative to the working directory) that holds the corpus text files
folder_path = 'Data'

# Corpus-wide accumulators: every token in reading order, plus the set of
# distinct tokens seen across all documents
corpus_tokens, corpus_unique_tokens = [], set()

# Function to read and tokenize a single document
def tokenize_document(file_path):
    """Return the word tokens of a single UTF-8 text file.

    The whole file is read into memory and handed to ``word_tokenize``; the
    resulting token list is returned unchanged (no lower-casing, no filtering).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        text = handle.read()
    return word_tokenize(text)

# Process each document in the folder.
# os.listdir() returns names in arbitrary order, so sort them to keep the
# per-document report in the same order on every run and platform.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.txt'):
        continue  # only plain-text documents belong to the corpus

    file_path = os.path.join(folder_path, filename)
    tokens = tokenize_document(file_path)

    # Count tokens and unique tokens for each document
    num_tokens = len(tokens)
    num_unique_tokens = len(set(tokens))
    print(f"Document: {filename}")
    print(f"  Number of tokens: {num_tokens}")
    print(f"  Number of unique tokens: {num_unique_tokens}\n")

    # Add tokens to the corpus-wide accumulators
    corpus_tokens.extend(tokens)
    corpus_unique_tokens.update(tokens)

# Summary for the entire corpus
corpus_num_tokens = len(corpus_tokens)
corpus_num_unique_tokens = len(corpus_unique_tokens)

print("Corpus Summary:")
print(f"  Total tokens in corpus: {corpus_num_tokens}")
print(f"  Total unique tokens in corpus: {corpus_num_unique_tokens}")


Document: a1.txt
  Number of tokens: 1945
  Number of unique tokens: 673

Document: a10.txt
  Number of tokens: 1448
  Number of unique tokens: 567

Document: a2.txt
  Number of tokens: 4888
  Number of unique tokens: 1343

Document: a3.txt
  Number of tokens: 2191
  Number of unique tokens: 747

Document: a4.txt
  Number of tokens: 3364
  Number of unique tokens: 1037

Document: a5.txt
  Number of tokens: 3466
  Number of unique tokens: 1109

Document: a6.txt
  Number of tokens: 1782
  Number of unique tokens: 643

Document: a7.txt
  Number of tokens: 1643
  Number of unique tokens: 630

Document: a8.txt
  Number of tokens: 1836
  Number of unique tokens: 703

Document: a9.txt
  Number of tokens: 1602
  Number of unique tokens: 607

Corpus Summary:
  Total tokens in corpus: 24165
  Total unique tokens in corpus: 4105


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shalritvik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Stopword Removal

In [9]:
# Fetch the tokenizer data and the stop-word lists through nltk's downloader
nltk.download('punkt')
nltk.download('stopwords')

# English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Folder (relative to the working directory) that holds the corpus text files
folder_path = 'Data'

# Corpus-wide accumulators for the tokens that survive stop-word removal
corpus_tokens_no_stopwords, corpus_unique_tokens_no_stopwords = [], set()

# Function to read, tokenize, and remove stop words from a single document
def tokenize_and_remove_stopwords(file_path):
    """Tokenize one UTF-8 text file and drop its stop words.

    Each token produced by ``word_tokenize`` is lower-cased; it is kept only
    when the lower-cased form is not in the module-level ``stop_words`` set.
    The returned list therefore holds lower-cased tokens in document order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        text = handle.read()

    kept = []
    for raw_token in word_tokenize(text):
        lowered = raw_token.lower()
        if lowered not in stop_words:
            kept.append(lowered)
    return kept

# Process each document in the folder.
# os.listdir() returns names in arbitrary order, so sort them to keep the
# per-document report in the same order on every run and platform.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.txt'):
        continue  # only plain-text documents belong to the corpus

    file_path = os.path.join(folder_path, filename)
    filtered_tokens = tokenize_and_remove_stopwords(file_path)

    # Count tokens and unique tokens for each document after removing stop words
    num_tokens_no_stopwords = len(filtered_tokens)
    num_unique_tokens_no_stopwords = len(set(filtered_tokens))
    print(f"Document: {filename}")
    print(f"  Number of tokens (after stop word removal): {num_tokens_no_stopwords}")
    print(f"  Number of unique tokens (after stop word removal): {num_unique_tokens_no_stopwords}\n")

    # Add filtered tokens to the corpus-wide accumulators
    corpus_tokens_no_stopwords.extend(filtered_tokens)
    corpus_unique_tokens_no_stopwords.update(filtered_tokens)

# Summary for the entire corpus
corpus_num_tokens_no_stopwords = len(corpus_tokens_no_stopwords)
corpus_num_unique_tokens_no_stopwords = len(corpus_unique_tokens_no_stopwords)

print("Corpus Summary After Removing Stop Words:")
print(f"  Total tokens in corpus (after stop word removal): {corpus_num_tokens_no_stopwords}")
print(f"  Total unique tokens in corpus (after stop word removal): {corpus_num_unique_tokens_no_stopwords}")


Document: a1.txt
  Number of tokens (after stop word removal): 1244
  Number of unique tokens (after stop word removal): 545

Document: a10.txt
  Number of tokens (after stop word removal): 927
  Number of unique tokens (after stop word removal): 473

Document: a2.txt
  Number of tokens (after stop word removal): 3082
  Number of unique tokens (after stop word removal): 1152

Document: a3.txt
  Number of tokens (after stop word removal): 1382
  Number of unique tokens (after stop word removal): 639

Document: a4.txt
  Number of tokens (after stop word removal): 2196
  Number of unique tokens (after stop word removal): 885

Document: a5.txt
  Number of tokens (after stop word removal): 2254
  Number of unique tokens (after stop word removal): 960

Document: a6.txt
  Number of tokens (after stop word removal): 1149
  Number of unique tokens (after stop word removal): 543

Document: a7.txt
  Number of tokens (after stop word removal): 1063
  Number of unique tokens (after stop word remova

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shalritvik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shalritvik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Stemming/Lemmatization

In [17]:
# Fetch the tokenizer data and the WordNet resources through nltk's downloader
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

# English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

# One shared Porter stemmer and one shared WordNet lemmatizer for the cell
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

# Folder (relative to the working directory) that holds the corpus text files
folder_path = 'Data'

# Functions to stem and lemmatize tokens
def stem_tokens(tokens):
    """Return a new list with ``stemmer.stem`` applied to every token, in order."""
    return list(map(stemmer.stem, tokens))

def lemmatize_tokens(tokens):
    """Return a new list with ``lemmatizer.lemmatize`` applied to every token.

    The lemmatizer is called with the token only (no part-of-speech argument),
    so whatever default it uses applies to every token.
    """
    return list(map(lemmatizer.lemmatize, tokens))

# Function to read, tokenize, remove stop words, and apply stemming or lemmatization
def process_document(file_path, mode='stem'):
    """Read one document and return its normalised terms.

    The text is tokenized with ``word_tokenize``, every token is lower-cased,
    tokens present in the module-level ``stop_words`` set are dropped, and the
    remainder is passed through ``stem_tokens`` or ``lemmatize_tokens``.

    Args:
        file_path: Path of the UTF-8 text file to process.
        mode: ``'stem'`` (default) or ``'lemmatize'``.

    Returns:
        The list of processed tokens, in document order.

    Raises:
        ValueError: If ``mode`` is neither ``'stem'`` nor ``'lemmatize'``.
    """
    # Validate before doing any work: with an unknown mode neither branch
    # below would produce a result, so fail early with a clear message.
    if mode not in ('stem', 'lemmatize'):
        raise ValueError(f"mode must be 'stem' or 'lemmatize', got {mode!r}")

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Tokenize the content
    tokens = word_tokenize(content)
    # Remove stop words (comparison and output both use the lower-cased token)
    filtered_tokens = [token.lower() for token in tokens if token.lower() not in stop_words]
    # Apply stemming or lemmatization
    if mode == 'stem':
        return stem_tokens(filtered_tokens)
    return lemmatize_tokens(filtered_tokens)

# Choose mode: 'stem' for stemming, 'lemmatize' for lemmatization
mode = 'stem'  # Change to 'lemmatize' if you prefer lemmatization

# Initialize variables to store terms for the entire corpus
corpus_terms = []
corpus_unique_terms = set()

# Process each document in the folder.
# os.listdir() returns names in arbitrary order, so sort them to keep the
# per-document report in the same order on every run and platform.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.txt'):
        continue  # only plain-text documents belong to the corpus

    file_path = os.path.join(folder_path, filename)
    processed_tokens = process_document(file_path, mode=mode)

    # Count terms and unique terms for each document
    num_terms = len(processed_tokens)
    num_unique_terms = len(set(processed_tokens))
    print(f"Document: {filename} (Mode: {mode})")
    print(f"  Number of terms (after {mode}): {num_terms}")
    print(f"  Number of unique terms (after {mode}): {num_unique_terms}\n")

    # Add processed tokens to the corpus-wide accumulators
    corpus_terms.extend(processed_tokens)
    corpus_unique_terms.update(processed_tokens)

# Summary for the entire corpus
corpus_num_terms = len(corpus_terms)
corpus_num_unique_terms = len(corpus_unique_terms)

print(f"Corpus Summary After {mode.capitalize()}:")
print(f"  Total terms in corpus (after {mode}): {corpus_num_terms}")
print(f"  Total unique terms in corpus (after {mode}): {corpus_num_unique_terms}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shalritvik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Shalritvik\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Shalritvik\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Document: a1.txt (Mode: stem)
  Number of terms (after stem): 1244
  Number of unique terms (after stem): 493

Document: a10.txt (Mode: stem)
  Number of terms (after stem): 927
  Number of unique terms (after stem): 432

Document: a2.txt (Mode: stem)
  Number of terms (after stem): 3082
  Number of unique terms (after stem): 985

Document: a3.txt (Mode: stem)
  Number of terms (after stem): 1382
  Number of unique terms (after stem): 573

Document: a4.txt (Mode: stem)
  Number of terms (after stem): 2196
  Number of unique terms (after stem): 770

Document: a5.txt (Mode: stem)
  Number of terms (after stem): 2254
  Number of unique terms (after stem): 829

Document: a6.txt (Mode: stem)
  Number of terms (after stem): 1149
  Number of unique terms (after stem): 498

Document: a7.txt (Mode: stem)
  Number of terms (after stem): 1063
  Number of unique terms (after stem): 487

Document: a8.txt (Mode: stem)
  Number of terms (after stem): 1201
  Number of unique terms (after stem): 522

D

Compute TF-IDF 

In [18]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer

# Path to the folder containing the text files
folder_path = 'Data'

# Function to read the content of all documents
def read_documents(folder_path):
    """Load every ``.txt`` file found directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (sub-directories are not searched).

    Returns:
        A ``(documents, filenames)`` pair of parallel lists: the UTF-8 decoded
        content of each file and the matching file name, ordered by file name.
    """
    documents = []
    filenames = []
    # os.listdir() gives no ordering guarantee; sorting keeps the row order of
    # anything built from these lists stable across runs and platforms.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            documents.append(file.read())
        filenames.append(filename)
    return documents, filenames

# Load the corpus: raw text of each document plus its file name
documents, filenames = read_documents(folder_path)

# Vectorizer configured with the 'english' stop-word setting
vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and build the TF-IDF matrix (one row per document)
tfidf_matrix = vectorizer.fit_transform(documents)

# Vocabulary terms, indexed by matrix column
terms = vectorizer.get_feature_names_out()

# Print every non-zero term of each document, highest score first
print("\nTF-IDF Features for Each Document:")
for i, filename in enumerate(filenames):
    print(f"\nDocument: {filename}")
    # Column indices of the terms that actually occur in document i
    feature_index = tfidf_matrix[i, :].nonzero()[1]
    tfidf_scores = [(idx, tfidf_matrix[i, idx]) for idx in feature_index]
    tfidf_scores.sort(key=lambda pair: -pair[1])
    for idx, score in tfidf_scores:
        print(f"  Term: {terms[idx]}, TF-IDF Score: {score:.4f}")



TF-IDF Features for Each Document:

Document: a1.txt
  Term: gyarados, TF-IDF Score: 0.8655
  Term: pokémon, TF-IDF Score: 0.2735
  Term: magikarp, TF-IDF Score: 0.1815
  Term: red, TF-IDF Score: 0.0929
  Term: flying, TF-IDF Score: 0.0623
  Term: blue, TF-IDF Score: 0.0619
  Term: water, TF-IDF Score: 0.0593
  Term: game, TF-IDF Score: 0.0568
  Term: chinese, TF-IDF Score: 0.0558
  Term: sea, TF-IDF Score: 0.0558
  Term: misty, TF-IDF Score: 0.0497
  Term: team, TF-IDF Score: 0.0464
  Term: player, TF-IDF Score: 0.0450
  Term: type, TF-IDF Score: 0.0450
  Term: rage, TF-IDF Score: 0.0419
  Term: crasher, TF-IDF Score: 0.0419
  Term: evolved, TF-IDF Score: 0.0408
  Term: series, TF-IDF Score: 0.0396
  Term: used, TF-IDF Score: 0.0372
  Term: wake, TF-IDF Score: 0.0356
  Term: lance, TF-IDF Score: 0.0356
  Term: battle, TF-IDF Score: 0.0340
  Term: described, TF-IDF Score: 0.0339
  Term: shiny, TF-IDF Score: 0.0311
  Term: dragon, TF-IDF Score: 0.0311
  Term: later, TF-IDF Score: 0.031

In [19]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer

# Path to the folder containing the text files
folder_path = 'Data'

# Function to read the content of all documents
def read_documents(folder_path):
    """Load every ``.txt`` file found directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (sub-directories are not searched).

    Returns:
        A ``(documents, filenames)`` pair of parallel lists: the UTF-8 decoded
        content of each file and the matching file name, ordered by file name.
    """
    # os.listdir() gives no ordering guarantee; sorting keeps the document
    # order in the exported report stable across runs and platforms.
    filenames = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.txt')
    )
    documents = []
    for name in filenames:
        with open(os.path.join(folder_path, name), 'r', encoding='utf-8') as file:
            documents.append(file.read())
    return documents, filenames

# Read the documents
documents, filenames = read_documents(folder_path)

# Create a TfidfVectorizer instance
vectorizer = TfidfVectorizer(stop_words='english')

# Fit the vectorizer and transform the documents into TF-IDF features
tfidf_matrix = vectorizer.fit_transform(documents)

# Get the feature names (terms)
terms = vectorizer.get_feature_names_out()

# Open a text file to store the output.
# The documents are read as UTF-8 and the vocabulary contains non-ASCII terms
# (e.g. "pokémon"), so write UTF-8 explicitly instead of relying on the
# platform's default encoding, which may be unable to encode some terms.
with open('TF-IDF.txt', 'w', encoding='utf-8') as output_file:
    output_file.write("TF-IDF Features for Each Document:\n")

    # Store TF-IDF features for each document, highest score first
    for i, filename in enumerate(filenames):
        output_file.write(f"\nDocument: {filename}\n")
        # Column indices of the terms that actually occur in document i
        feature_index = tfidf_matrix[i, :].nonzero()[1]
        tfidf_scores = zip(feature_index, [tfidf_matrix[i, x] for x in feature_index])
        for idx, score in sorted(tfidf_scores, key=lambda x: -x[1]):
            output_file.write(f"  Term: {terms[idx]}, TF-IDF Score: {score:.4f}\n")

print("TF-IDF features saved to 'TF-IDF.txt'.")


TF-IDF features saved to 'TF-IDF.txt'.


Compute the Cosine Similarity

In [20]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Path to the folder containing the text files
folder_path = 'Data'

# Function to read the content of all documents
def read_documents(folder_path):
    """Load every ``.txt`` file found directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (sub-directories are not searched).

    Returns:
        A ``(documents, filenames)`` pair of parallel lists: the UTF-8 decoded
        content of each file and the matching file name, ordered by file name.
    """
    documents = []
    filenames = []
    # os.listdir() gives no ordering guarantee; sorting keeps the row/column
    # order of the similarity matrix stable across runs and platforms.
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith('.txt'):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                documents.append(file.read())
            filenames.append(filename)
    return documents, filenames

# Load the corpus: raw text of each document plus its file name
documents, filenames = read_documents(folder_path)

# Vectorizer configured with the 'english' stop-word setting
vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and build the TF-IDF matrix (one row per document)
tfidf_matrix = vectorizer.fit_transform(documents)

# Pairwise cosine similarity between all document rows
similarity_matrix = cosine_similarity(tfidf_matrix)

# Write the matrix as a tab-separated table: a header row of file names,
# then one row per document with its scores against every document
with open('Cosine_Similarity_Matrix.txt', 'w') as output_file:
    output_file.write("Cosine Similarity Matrix for the Document Corpus:\n\n")

    header = "\t" + "\t".join(filenames) + "\n"
    output_file.write(header)

    for i, filename in enumerate(filenames):
        row_cells = [f"{similarity_matrix[i, j]:.4f}" for j in range(len(filenames))]
        similarity_scores = "\t".join(row_cells)
        output_file.write(f"{filename}\t{similarity_scores}\n")

print("Cosine similarity matrix saved to 'Cosine_Similarity_Matrix.txt'.")


Cosine similarity matrix saved to 'Cosine_Similarity_Matrix.txt'.


: 