<a href="https://colab.research.google.com/github/PRaliphada/ExpectedLoss/blob/main/NLP_LAB2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import os
import re
from collections import defaultdict
from nltk import ngrams
from nltk.tokenize import word_tokenize
import random
import nltk

# Ensure necessary NLTK resources are downloaded
nltk.download('punkt')

# Load a single text file (HP2.txt)
def load_corpus(file_path):
    corpus = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        corpus[file_path] = file.read().lower()
    return corpus

# Preprocessing function: remove punctuation and tokenize
def preprocess(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)         # Tokenize text
    return tokens

# Generate N-grams from tokens
def generate_ngrams(tokens, n):
    return list(ngrams(tokens, n))

# Split the book text into chunks (simulating pages)
def split_into_chunks(text, chunk_size=200):
    tokens = preprocess(text)
    chunks = [' '.join(tokens[i:i + chunk_size]) for i in range(0, len(tokens), chunk_size)]
    return chunks

# Prepare the data with labels
def prepare_data(corpus, ngram_size=2, chunk_size=200):
    labeled_data = []
    for book_title, text in corpus.items():
        chunks = split_into_chunks(text, chunk_size)
        for chunk in chunks:
            ngrams_list = generate_ngrams(preprocess(chunk), ngram_size)
            labeled_data.append((book_title, ngrams_list))
    return labeled_data

# Split data into train, validation, and test sets
def split_data(data, train_ratio=0.7, val_ratio=0.15):
    random.shuffle(data)

    train_size = int(len(data) * train_ratio)
    val_size = int(len(data) * val_ratio)

    train_data = data[:train_size]
    val_data = data[train_size:train_size + val_size]
    test_data = data[train_size + val_size:]

    return train_data, val_data, test_data

# Naive Bayes Classifier Implementation
class NaiveBayesClassifier:
    def __init__(self):
        self.class_counts = defaultdict(int)
        self.feature_counts = defaultdict(lambda: defaultdict(int))
        self.vocab = set()

    def train(self, data):
        total_documents = len(data)
        for label, ngrams in data:
            self.class_counts[label] += len(ngrams)
            for ngram in ngrams:
                self.feature_counts[ngram][label] += 1
                self.vocab.update(ngram)

    def predict(self, ngrams):
        label_scores = defaultdict(float)

        for label in self.class_counts:
            prior = self.class_counts[label] / sum(self.class_counts.values())
            log_prob = 0
            for ngram in ngrams:
                ngram_freq = self.feature_counts[ngram][label] + 1  # Laplace smoothing
                log_prob += ngram_freq / (self.class_counts[label] + len(self.vocab))

            label_scores[label] = prior * log_prob

        return max(label_scores, key=label_scores.get)

    def evaluate(self, data):
        correct = 0
        total = 0
        for label, ngrams in data:
            prediction = self.predict(ngrams)
            if prediction == label:
                correct += 1
            total += 1

        return correct / total if total > 0 else 0

# Main execution
def main():
    # Load and preprocess the HP2.txt corpus
    corpus = load_corpus('HP2.txt')

    # Prepare labeled data with N-grams
    ngram_size = 2  # For bigrams, adjust as needed
    chunk_size = 200  # Simulate pages of 200 words each
    labeled_data = prepare_data(corpus, ngram_size, chunk_size)

    # Split data into train, validation, and test sets
    train_data, val_data, test_data = split_data(labeled_data)

    # Train the model
    nb_classifier = NaiveBayesClassifier()
    nb_classifier.train(train_data)

    # Evaluate on validation set
    validation_accuracy = nb_classifier.evaluate(val_data)
    print(f'Validation Accuracy: {validation_accuracy * 100:.2f}%')

    # Evaluate on test set
    test_accuracy = nb_classifier.evaluate(test_data)
    print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

    # Print the sizes of the datasets
    print(f'Train data size: {len(train_data)}')
    print(f'Validation data size: {len(val_data)}')
    print(f'Test data size: {len(test_data)}')

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Validation Accuracy: 100.00%
Test Accuracy: 100.00%
Train data size: 299
Validation data size: 64
Test data size: 65


In [16]:
import os
import re
from collections import defaultdict
from nltk import ngrams
from nltk.tokenize import word_tokenize
import random
import nltk

#Download necessary NLTK resources.
nltk.download('punkt')

#Loading Herry Porter books.
def load_corpus(directory):
    corpus = {}
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                corpus[filename] = file.read().lower()
    return corpus

#Preprocessing the data by removing punctuation and tokenizing the text.
def preprocess(text):
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    return tokens

#Generating N-grams from tokens.
def generate_ngrams(tokens, n):
    return list(ngrams(tokens, n))

#Split the book text into chunks (simulating pages).
def split_into_chunks(text, chunk_size=200):
    tokens = preprocess(text)
    chunks = [' '.join(tokens[i:i + chunk_size]) for i in range(0, len(tokens), chunk_size)]
    return chunks

#Prepare the data with labels
def prepare_data(corpus, ngram_size=2, chunk_size=200):
    labeled_data = []
    for book_title, text in corpus.items():
        chunks = split_into_chunks(text, chunk_size)
        for chunk in chunks:
            ngrams_list = generate_ngrams(preprocess(chunk), ngram_size)
            labeled_data.append((book_title, ngrams_list))
    return labeled_data

#Split data into train, validation, and test sets
def split_data(data, train_ratio=0.7, val_ratio=0.15):
    random.shuffle(data)

    train_size = int(len(data) * train_ratio)
    val_size = int(len(data) * val_ratio)

    train_data = data[:train_size]
    val_data = data[train_size:train_size + val_size]
    test_data = data[train_size + val_size:]

    return train_data, val_data, test_data

#Implementating Naive Bayes Classifier
class NaiveBayesClassifier:
    def __init__(self):
        self.class_counts = defaultdict(int)
        self.feature_counts = defaultdict(lambda: defaultdict(int))
        self.vocab = set()

    def train(self, data):
        total_documents = len(data)
        for label, ngrams in data:
            self.class_counts[label] += len(ngrams)
            for ngram in ngrams:
                self.feature_counts[ngram][label] += 1
                self.vocab.update(ngram)

    def predict(self, ngrams):
        label_scores = defaultdict(float)

        for label in self.class_counts:
            prior = self.class_counts[label] / sum(self.class_counts.values())
            log_prob = 0
            for ngram in ngrams:
                ngram_freq = self.feature_counts[ngram][label] + 1
                log_prob += ngram_freq / (self.class_counts[label] + len(self.vocab))

            label_scores[label] = prior * log_prob

        return max(label_scores, key=label_scores.get)

    def evaluate(self, data):
        correct = 0
        total = 0
        for label, ngrams in data:
            prediction = self.predict(ngrams)
            if prediction == label:
                correct += 1
            total += 1

        return correct / total if total > 0 else 0


def main():
    corpus_directory = '/content/'  #Directory containing all the books
    corpus = load_corpus(corpus_directory)

    #Prepare labeled data with N-grams
    ngram_size = 2
    chunk_size = 200
    labeled_data = prepare_data(corpus, ngram_size, chunk_size)

    #Spliting the data into train, validation, and test datasets
    train_data, val_data, test_data = split_data(labeled_data)

    #Training the model
    nb_classifier = NaiveBayesClassifier()
    nb_classifier.train(train_data)

    #Evaluation on validation dataset
    validation_accuracy = nb_classifier.evaluate(val_data)
    print(f'Validation Accuracy: {validation_accuracy * 100:.2f}%')

    #Evaluation on test dataset
    test_accuracy = nb_classifier.evaluate(test_data)
    print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

    #Printing the sizes of the datasets that are used for Training, Validation and Testing.
    print(f'Train data size: {len(train_data)}')
    print(f'Validation data size: {len(val_data)}')
    print(f'Test data size: {len(test_data)}')

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Validation Accuracy: 31.78%
Test Accuracy: 31.10%
Train data size: 3820
Validation data size: 818
Test data size: 820
