### Task Details:
#### In this task, you are required to implement topic classification using the Naïve Bayes classifier using the following instructions:

#### 1. Loading Data: Download 5 different documents at least for training in 2 different domains (5 documents per each domain) and one document for test in each domain using Wikipedia API in python. Each document has to be at least one page (100 Words)

In [1]:
import wikipedia

# 2 Different Domains
DOMAINS = ["Machine learning", "Data science"]

def loading_data(domains:list) -> list:
    """
     Download 5 different documents at least for training in 2 different domains 
     (5 documents per each domain) and one document for test in each domain using Wikipedia API in python. 
     Each document has to be at least one page (100 Words)
        
    Args:
    - domains: list of domains of wikipedia pages
    
    Returns:
    - titles: list of documents titles
    """
    titles = []
    pages = []
    for domain in domains:
        wiki_titles = wikipedia.search(domain,results=6)
        for title in wiki_titles:
            page = wikipedia.WikipediaPage(title=title).summary
            if wiki_titles.index(title) != 5:    
                temp_doc = open(r"C:\Users\ghost\Desktop\nlp assignment\Train_{}.txt".format(title),"w")
                temp_doc.write(page)
                temp_doc.close()
                titles.append(title)
            else:
                temp_doc = open(r"C:\Users\ghost\Desktop\nlp assignment\Test_{}.txt".format(title),"w")
                temp_doc.write(page)
                temp_doc.close()

    return titles           
titles = loading_data(domains=DOMAINS)
print(titles)

['Machine learning', 'Quantum machine learning', 'Attention (machine learning)', 'Transformer (machine learning model)', 'Boosting (machine learning)', 'Data science', 'Data', 'Data (computer science)', 'Master in Data Science', 'Data analysis']


#### 2. Function class_prob (doc_label) that takes all training documents labels and calculate the probability of each class.

In [2]:
def class_prob(doc_labels:list) -> dict:
    """
    Calculates the probability of each class given a list of document labels.
    
    Args:
    - doc_labels: a list of document labels
    
    Returns:
    - A dictionary where the keys are the unique labels in doc_labels and the values are the corresponding probabilities.
    """
    class_counts = {}
    for label in doc_labels:
        class_counts[label] = class_counts.get(label, 0) + 1
    total_docs = len(doc_labels)
    class_probs = {label: count / total_docs for label, count in class_counts.items()}
    return class_probs

DOC_LABELS = []
for title in titles:
    if  'machine'.casefold() in title.casefold() :
        DOC_LABELS.append("Machine Learning")
    if  'data'.casefold() in title.casefold() :
        DOC_LABELS.append("Data Science")
    
print(DOC_LABELS)
CLASS_PROBS = class_prob(DOC_LABELS)
print(CLASS_PROBS)

['Machine Learning', 'Machine Learning', 'Machine Learning', 'Machine Learning', 'Machine Learning', 'Data Science', 'Data Science', 'Data Science', 'Data Science', 'Data Science']
{'Machine Learning': 0.5, 'Data Science': 0.5}


#### 3. Function preprocessing (doc) that takes training documents and return it after needed preprocessing.

In [4]:
import nltk
from nltk.tokenize import word_tokenize
from string import punctuation
import glob
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import os 


def tokenize(text):
    """
    Tokenizes text by splitting it into words.

    Args:
    - text: a string of text

    Returns:
    - A list of tokens.
    """
    # Define a list of characters to split the text on
    delimiters = [' ', '\n', '\t', '.', ',', ';', ':', '!', '?', '"', "'", '(', ')', '[', ']', '{', '}', '<', '>', '/', '\\', '|', '@', '#', '$', '%', '^', '&', '*', '-', '_', '+', '=', '`', '~']
    
    # Split text into words
    words = []
    word = ''
    for char in text:
        if char in delimiters:
            if word:
                words.append(word)
                word = ''
        else:
            word += char
    if word:
        words.append(word)

    return words

def preprocessing(documents:list) -> list:
    """
    Preprocesses a list of training documents by removing stop words, punctuation, lowercasing all text, 
    and applying stemming and lemmatization.

    Args:
    - documents: a list of training documents

    Returns:
    - A list of preprocessed documents.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    preprocessed_documents = []
    for file in documents:
        # Tokenize text into individual words
        with open(file,"r") as doc:
            tokens = tokenize(doc.read())
            # Remove stop words and punctuation from the text
            filtered_tokens = [token.lower() for token in tokens if (token not in stop_words) and (token not in string.punctuation)]
            # Stem and lemmatize the filtered tokens
            stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
            lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_tokens]
            # Join the lemmatized tokens back into a string
            preprocessed_doc = ' '.join(lemmatized_tokens)
            preprocessed_documents.append(preprocessed_doc)
    return preprocessed_documents


docs_path = r"C:\Users\ghost\Desktop\nlp assignment"
train_docs = glob.glob(os.path.join(docs_path,"Train_*.txt"))
train_docs =preprocessing(train_docs) 
test_docs = glob.glob(os.path.join(docs_path,"Test_*.txt"))
test_docs = preprocessing(test_docs)

train_machine = [train_docs[0],train_docs[1],train_docs[6],train_docs[8],train_docs[9]]
train_data = [train_docs[2],train_docs[3],train_docs[4],train_docs[5],train_docs[7]]

print(test_docs[0])

adversari machin learn studi attack machin learn algorithm defens attack a survey may 2020 expo fact practition report dire need better protect machin learn system industri applic to understand note machin learn techniqu mostli design work specif problem set assumpt train test data gener statist distribut iid howev assumpt often danger violat practic high stake applic user may intent suppli fabric data violat statist assumpt some common threat model adversari machin learn includ evas attack data poison attack byzantin attack model extract


In [5]:
def unique_words_count(my_string):
    words = my_string.split()
    unique_words = set(words)
    return len(unique_words)

def count_string_in_list(lst, string):
    count = 0
    words = lst.split()
    for i in words:
        if i == string:
            count += 1
    return count

def count_words(preprocessed_doc):
    """Counts the number of words in a preprocessed document.

    Args:
      - preprocessed_doc: The preprocessed document.

    Returns:
      - The number of words in the preprocessed document.
    """
    
    # Split the document into words.
    words = preprocessed_doc.split()

    # Count the number of words.
    num_words = len(words)

    return num_words




def conditional_prob(class_label, word, train_docs):
    
    word_num = []
    if(class_label == 'Machine Learning'):
        train = train_machine
        string = ' '.join(train_machine)
    elif (class_label == 'Data Science'):
        train = train_data
        string = ' '.join(train_data)
        
    word_counter=0
    for i in train:
        word_num.append(count_words(i))
        word_counter += count_string_in_list(i, word)
            
    k = unique_words_count(string)
    
    conditional_probability = (word_counter + 1) / k
    
    return conditional_probability


In [6]:
class_label = "Data Science"
word = "adversari"

conditional_probability = conditional_prob(class_label, word, train_docs)
print(conditional_probability)

0.0028328611898017


In [7]:
prob_machine = {}
class_label1 = "Machine Learning"

for i in train_machine: 
    words = i.split()
    for j in words:
        prob_machine[(class_label1,j)] = conditional_prob(class_label1, j, train_docs)
prob_machine

{('Machine Learning', 'in'): 0.014619883040935672,
 ('Machine Learning', 'artifici'): 0.008771929824561403,
 ('Machine Learning', 'neural'): 0.02046783625730994,
 ('Machine Learning', 'network'): 0.02046783625730994,
 ('Machine Learning', 'attent'): 0.02631578947368421,
 ('Machine Learning', 'techniqu'): 0.011695906432748537,
 ('Machine Learning', 'meant'): 0.005847953216374269,
 ('Machine Learning', 'mimic'): 0.008771929824561403,
 ('Machine Learning', 'cognit'): 0.005847953216374269,
 ('Machine Learning', 'the'): 0.014619883040935672,
 ('Machine Learning', 'effect'): 0.005847953216374269,
 ('Machine Learning', 'enhanc'): 0.008771929824561403,
 ('Machine Learning', 'part'): 0.017543859649122806,
 ('Machine Learning', 'input'): 0.02046783625730994,
 ('Machine Learning', 'data'): 0.05555555555555555,
 ('Machine Learning', 'diminish'): 0.005847953216374269,
 ('Machine Learning', '—'): 0.005847953216374269,
 ('Machine Learning', 'motiv'): 0.005847953216374269,
 ('Machine Learning', 'devot

In [8]:
prob_data = {}
class_label2 = "Data Science"

for i in train_data: 
    words = i.split()
    for j in words:
        prob_data[(class_label2,j)] = conditional_prob(class_label2, j, train_docs)
        
prob_data

{('Data Science', 'in'): 0.0226628895184136,
 ('Data Science', 'comput'): 0.042492917847025496,
 ('Data Science', 'scienc'): 0.059490084985835696,
 ('Data Science', 'data'): 0.2577903682719547,
 ('Data Science', 'treat'): 0.0056657223796034,
 ('Data Science', 'singular'): 0.0056657223796034,
 ('Data Science', 'plural'): 0.0056657223796034,
 ('Data Science', 'mass'): 0.0056657223796034,
 ('Data Science', 'noun'): 0.0056657223796034,
 ('Data Science', 'sequenc'): 0.0084985835694051,
 ('Data Science', 'one'): 0.0084985835694051,
 ('Data Science', 'symbol'): 0.014164305949008499,
 ('Data Science', 'datum'): 0.0084985835694051,
 ('Data Science', 'singl'): 0.0056657223796034,
 ('Data Science', 'requir'): 0.0056657223796034,
 ('Data Science', 'interpret'): 0.0084985835694051,
 ('Data Science', 'becom'): 0.0056657223796034,
 ('Data Science', 'inform'): 0.05099150141643059,
 ('Data Science', 'digit'): 0.0226628895184136,
 ('Data Science', 'repres'): 0.0226628895184136,
 ('Data Science', 'use'):

In [9]:
prob_data.update(prob_machine)


probab = prob_data
print(probab)


{('Data Science', 'in'): 0.0226628895184136, ('Data Science', 'comput'): 0.042492917847025496, ('Data Science', 'scienc'): 0.059490084985835696, ('Data Science', 'data'): 0.2577903682719547, ('Data Science', 'treat'): 0.0056657223796034, ('Data Science', 'singular'): 0.0056657223796034, ('Data Science', 'plural'): 0.0056657223796034, ('Data Science', 'mass'): 0.0056657223796034, ('Data Science', 'noun'): 0.0056657223796034, ('Data Science', 'sequenc'): 0.0084985835694051, ('Data Science', 'one'): 0.0084985835694051, ('Data Science', 'symbol'): 0.014164305949008499, ('Data Science', 'datum'): 0.0084985835694051, ('Data Science', 'singl'): 0.0056657223796034, ('Data Science', 'requir'): 0.0056657223796034, ('Data Science', 'interpret'): 0.0084985835694051, ('Data Science', 'becom'): 0.0056657223796034, ('Data Science', 'inform'): 0.05099150141643059, ('Data Science', 'digit'): 0.0226628895184136, ('Data Science', 'repres'): 0.0226628895184136, ('Data Science', 'use'): 0.05099150141643059

In [11]:
import pickle

with open('train.pickle', 'wb') as handle:
    pickle.dump(probab, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
with open('train.pickle', 'rb') as handle:
    b = pickle.load(handle)
    
b

{('Data Science', 'in'): 0.0226628895184136,
 ('Data Science', 'comput'): 0.042492917847025496,
 ('Data Science', 'scienc'): 0.059490084985835696,
 ('Data Science', 'data'): 0.2577903682719547,
 ('Data Science', 'treat'): 0.0056657223796034,
 ('Data Science', 'singular'): 0.0056657223796034,
 ('Data Science', 'plural'): 0.0056657223796034,
 ('Data Science', 'mass'): 0.0056657223796034,
 ('Data Science', 'noun'): 0.0056657223796034,
 ('Data Science', 'sequenc'): 0.0084985835694051,
 ('Data Science', 'one'): 0.0084985835694051,
 ('Data Science', 'symbol'): 0.014164305949008499,
 ('Data Science', 'datum'): 0.0084985835694051,
 ('Data Science', 'singl'): 0.0056657223796034,
 ('Data Science', 'requir'): 0.0056657223796034,
 ('Data Science', 'interpret'): 0.0084985835694051,
 ('Data Science', 'becom'): 0.0056657223796034,
 ('Data Science', 'inform'): 0.05099150141643059,
 ('Data Science', 'digit'): 0.0226628895184136,
 ('Data Science', 'repres'): 0.0226628895184136,
 ('Data Science', 'use'):

In [14]:
print(CLASS_PROBS)


def predict(test_docs):
    prob_data = 1

    prob_machine = 1
    
    words = test_docs.split()
    
    for i in words:
        
        prob_data = conditional_prob('Data Science', i, train_docs) * prob_data
        prob_machine =  conditional_prob('Machine Learning', i, train_docs) * prob_machine
        
        if prob_machine/prob_data>1:
            
            return "The document classified as a Macine Learning Topic"
        
        else:
            
            return "The document is classified as a Data Science Topic"


predict(test_docs[0])

{'Machine Learning': 0.5, 'Data Science': 0.5}


'The document classified as a Macine Learning Topic'