# CS361 Assignment 2
### Name: Lin Lin
### UPI: llin829
----------------------------------------------------------------------------------------------------------------------------------------------------------

## Task 1 Data Preprocessing

In [76]:
import pandas as pd
from collections import Counter
import math
import string

# Load the labelled training abstracts; later cells read the 'abstract' and 'class' columns.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only runs
# where the file exists at exactly this location.
data = pd.read_csv("C:/Users/heave/Desktop/CS361/A2/trg.csv")
# Last expression of the cell: displays the first five rows.
data.head()

Unnamed: 0,id,class,abstract
0,1,B,the 4 202 353 bp genome of the alkaliphilic ba...
1,2,A,the complete 1751377-bp sequence of the genome...
2,3,E,in 1992 we started assembling an ordered libra...
3,4,E,the aim of this study is to measure human mito...
4,5,B,the amino acid sequence of the spirulina maxim...


In [77]:
# Features are the abstract texts; targets are the class labels.
X, y = data['abstract'], data['class']
X.head()

0    the 4 202 353 bp genome of the alkaliphilic ba...
1    the complete 1751377-bp sequence of the genome...
2    in 1992 we started assembling an ordered libra...
3    the aim of this study is to measure human mito...
4    the amino acid sequence of the spirulina maxim...
Name: abstract, dtype: object

#### Split data into training and validation sets

In [78]:
# Split the data into a training set (first 80% of the rows) and a
# validation set (remaining 20%). The rows are taken in file order.
split_index = int(len(data) * 0.8)
X_train, y_train = X[:split_index], y[:split_index]
X_test, y_test = X[split_index:], y[split_index:]

#### Count number of abstracts for each class

In [79]:
def classify_and_count(X_train, y_train):
    """Count documents and collect whitespace-separated tokens per class.

    Args:
        X_train: iterable of document strings.
        y_train: iterable of class labels, aligned with ``X_train``.

    Returns:
        A ``(number_of_classes, class_lists)`` tuple. ``number_of_classes``
        maps each of the labels 'A', 'B', 'E', 'V' to its document count, and
        ``class_lists`` maps each of those labels to the list of all tokens
        (duplicates kept) from its documents. Documents whose label is not
        one of the four known labels are ignored.
    """
    known_labels = ('A', 'B', 'E', 'V')
    number_of_classes = dict.fromkeys(known_labels, 0)
    class_lists = {known: [] for known in known_labels}

    for text, label in zip(X_train, y_train):
        tokens = text.split()
        if label not in number_of_classes:
            continue
        number_of_classes[label] += 1
        class_lists[label] += tokens

    return number_of_classes, class_lists

# Per-class document counts and token lists for the training split.
# Both names are notebook-level globals that later cells read.
number_of_classes, class_lists = classify_and_count(X_train, y_train)
print(number_of_classes)

{'A': 100, 'B': 1296, 'E': 1703, 'V': 101}


#### Extract unique words for each class

In [80]:
def extract_unique_words(class_lists):
    """Return, for every class label, the set of distinct tokens in its token list.

    Args:
        class_lists: mapping of class label -> list of tokens (duplicates allowed).

    Returns:
        Mapping of class label -> set of unique tokens.
    """
    return {label: set(tokens) for label, tokens in class_lists.items()}

unique_words = extract_unique_words(class_lists)

# Size of each class's vocabulary (number of distinct tokens).
word_counts = {}
for label, vocabulary in unique_words.items():
    word_counts[label] = len(vocabulary)
print("Number of unique words per class:", word_counts)

Number of unique words per class: {'A': 2009, 'B': 11676, 'E': 20700, 'V': 3082}


#### Extract unique words for all classes

In [81]:
# Vocabulary of the whole training split: the union of the per-class word sets.
all_unique_words = set().union(*unique_words.values())

# List copy of the vocabulary, for code that needs a list rather than a set.
all_unique_words_list = list(all_unique_words)
print("Number of unique words for all classes:", len(all_unique_words_list))

Number of unique words for all classes: 28541


## Task 2 Implement the standard Naive Bayes based on the textbook algorithm

In [82]:
def train_naive_bayes(X_train, y_train):
    """Train a multinomial Naive Bayes text model with add-one (Laplace) smoothing.

    Every statistic is computed from the arguments, so the model always matches
    the data it is given. Reading notebook-level globals instead would silently
    reuse a vocabulary built from differently tokenised text.

    Args:
        X_train: iterable of document strings; tokens are obtained with ``str.split()``.
        y_train: iterable of class labels, aligned with ``X_train``.

    Returns:
        A ``(class_priors, word_given_class, vocab_size)`` tuple:
        ``class_priors`` maps label -> fraction of training documents with that label;
        ``word_given_class`` maps label -> {word: P(word | label)} for every word
        in the training vocabulary;
        ``vocab_size`` is the number of distinct tokens in the training documents.

    Raises:
        ValueError: if the training set is empty or the two inputs differ in length.
    """
    documents = list(X_train)
    labels = list(y_train)
    if len(documents) != len(labels):
        raise ValueError(
            f"X_train and y_train must have the same length "
            f"(got {len(documents)} and {len(labels)})"
        )
    total_docs = len(documents)
    if total_docs == 0:
        raise ValueError("cannot train Naive Bayes on an empty training set")

    # Document count per class and token frequencies per class.
    doc_counts = Counter(labels)
    word_freqs = {label: Counter() for label in doc_counts}
    for text, label in zip(documents, labels):
        word_freqs[label].update(text.split())

    # Vocabulary = every distinct token seen in any training document.
    vocabulary = set()
    for freqs in word_freqs.values():
        vocabulary.update(freqs)
    vocab_size = len(vocabulary)

    class_priors = {}
    word_given_class = {}
    # Sorted labels give a stable class order (and stable tie-breaking downstream).
    for label in sorted(doc_counts, key=str):
        class_priors[label] = doc_counts[label] / total_docs
        freqs = word_freqs[label]
        # Laplace smoothing: P(w | c) = (count(w, c) + 1) / (tokens in c + |V|).
        denominator = sum(freqs.values()) + vocab_size
        word_given_class[label] = {
            word: (freqs[word] + 1) / denominator for word in vocabulary
        }
    return class_priors, word_given_class, vocab_size
    
        
# Fit the baseline model on the training split and show the class priors.
class_priors, word_given_class, vocab_size = train_naive_bayes(X_train, y_train)
print(class_priors) 

#first_key = list(word_given_class.keys())[0]
#first_value = word_given_class[first_key]

#print("First key:", first_key)
#print("First value:", first_value)

{'A': 0.03125, 'B': 0.405, 'E': 0.5321875, 'V': 0.0315625}


#### Make prediction on validation set

In [83]:
def predict_naive_bayes(X_test, class_priors, word_given_class, vocab_size):
    """Predict the most probable class for each document.

    Each class is scored in log space: ``log P(class) + sum(log P(word | class))``
    over the whitespace-separated tokens of the document.

    Args:
        X_test: iterable of document strings.
        class_priors: mapping of class label -> prior probability.
        word_given_class: mapping of class label -> {word: P(word | class)}.
        vocab_size: vocabulary size used in the unseen-word penalty.

    Returns:
        List with one predicted class label per document, in input order.
    """
    neg_inf = float('-inf')

    # Everything that depends only on the class is computed once here. The
    # unseen-word penalty in particular sums the whole probability table, which
    # is far too expensive to repeat for every unseen token.
    class_models = []
    for class_label, prior in class_priors.items():
        word_probs = word_given_class[class_label]
        # NOTE(review): the denominator adds up probabilities rather than token
        # counts; the formula is kept as-is so scores are unchanged. Confirm
        # this is the intended smoothing for out-of-vocabulary words.
        denominator = sum(word_probs.values()) + vocab_size
        unseen_log_prob = math.log(1 / denominator) if denominator > 0 else neg_inf
        # A class with zero prior can never win; avoid math.log(0) raising.
        log_prior = math.log(prior) if prior > 0 else neg_inf
        class_models.append((class_label, log_prior, word_probs, unseen_log_prob))

    predictions = []
    for doc in X_test:
        words = doc.split()
        doc_probs = {}
        for class_label, log_prior, word_probs, unseen_log_prob in class_models:
            doc_prob = log_prior
            for word in words:
                prob = word_probs.get(word)
                if prob is None:
                    doc_prob += unseen_log_prob
                else:
                    doc_prob += math.log(prob)
            doc_probs[class_label] = doc_prob
        # Choose the class with the highest log score (first one wins ties).
        predictions.append(max(doc_probs, key=doc_probs.get))
    return predictions

# Predict a class for every abstract in the validation split.
predictions = predict_naive_bayes(X_test, class_priors, word_given_class, vocab_size)

def calculate_accuracy(predictions, y_test):
    """Return the fraction of predictions that equal the true labels.

    Args:
        predictions: iterable of predicted labels.
        y_test: iterable of true labels, aligned with ``predictions``.

    Returns:
        Accuracy as a float in [0, 1]; ``0.0`` when there are no predictions.

    Raises:
        ValueError: if the two inputs differ in length. ``zip`` would otherwise
            drop the surplus items and report a misleading accuracy.
    """
    predicted_labels = list(predictions)
    actual_labels = list(y_test)
    if len(predicted_labels) != len(actual_labels):
        raise ValueError(
            f"predictions and y_test must have the same length "
            f"(got {len(predicted_labels)} and {len(actual_labels)})"
        )
    if not predicted_labels:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    correct_predictions = sum(
        1 for predicted, actual in zip(predicted_labels, actual_labels)
        if predicted == actual
    )
    return correct_predictions / len(predicted_labels)

# Score the baseline classifier on the validation split (fraction of correct predictions).
accuracy = calculate_accuracy(predictions, y_test)
print("Accuracy of the Naive Bayes classifier:", accuracy)

Accuracy of the Naive Bayes classifier: 0.94625


## Task 3 Improve the model

#### Words Concatenation

In [84]:
def preprocess_text(text):
    """Join selected multi-word terms into single underscore-separated tokens.

    Matching is an exact, case-sensitive substring replacement, applied to the
    terms in the order listed below.

    Args:
        text: document string.

    Returns:
        The text with every occurrence of a listed term replaced by its joined form.
    """
    replacements = [
        ('homo sapiens', 'homo_sapiens'),
        ('escherichia coli', 'escherichia_coli'),
        ('human immunodeficiency virus', 'human_immunodeficiency_virus'),
    ]

    for phrase, joined in replacements:
        text = text.replace(phrase, joined)
    return text

# Apply the multi-word term concatenation to both splits.
# Note: this rebinds X_train / X_test to plain Python lists.
X_train = list(map(preprocess_text, X_train))
X_test = list(map(preprocess_text, X_test))

#### Stop words

In [88]:
# Tokens dropped from every document before training and prediction.
stop_words = {'the', 'a', 'an', 'of'}


def remove_words(text):
    """Lowercase the text, strip ASCII punctuation and drop stop words.

    Punctuation characters (``string.punctuation``) are deleted rather than
    replaced by a space, so "4-bp" becomes "4bp". The remaining text is split
    on whitespace and re-joined with single spaces.

    Args:
        text: document string.

    Returns:
        The cleaned document as a single space-separated string.
    """
    kept_chars = []
    for char in text:
        if char in string.punctuation:
            continue
        kept_chars.append(char.lower())

    tokens = ''.join(kept_chars).split()
    return ' '.join(token for token in tokens if token not in stop_words)

# Clean the training and validation texts (lowercase, no punctuation, no stop words).
X_train_processed = [remove_words(x) for x in X_train]
X_test_processed = [remove_words(x) for x in X_test]

# Rebuild every notebook-level statistic from the processed text. The vocabulary
# has to be refreshed together with the counts: it was last built from the raw
# text, and any code reading `all_unique_words` would otherwise see tokens that
# no longer match the processed documents.
number_of_classes, class_lists = classify_and_count(X_train_processed, y_train)
unique_words = extract_unique_words(class_lists)
all_unique_words = set().union(*unique_words.values())

class_priors, word_given_class, vocab_size = train_naive_bayes(X_train_processed, y_train)
predictions = predict_naive_bayes(X_test_processed, class_priors, word_given_class, vocab_size)
accuracy = calculate_accuracy(predictions, y_test)
# Accuracy is a fraction in [0, 1], so it is printed without a percent sign.
print(f"Accuracy of the Naive Bayes classifier after removing stop words: {accuracy:.4f}")

Accuracy of the Naive Bayes classifier after removing stop words: 0.9450%


#### Prediction on test data

In [89]:
# NOTE(review): absolute, machine-specific path.
test_data = pd.read_csv("C:/Users/heave/Desktop/CS361/A2/tst.csv")

# The current model was trained on text that went through preprocess_text and
# remove_words, so the unseen abstracts must be tokenised the same way.
# Feeding them in raw would make train and test tokens disagree.
X_test_unseen = [remove_words(preprocess_text(text)) for text in test_data['abstract']]

# Make predictions using the previously trained model
predictions_unseen = predict_naive_bayes(X_test_unseen, class_priors, word_given_class, vocab_size)

# Append predictions to the test data for review
test_data['predicted_class'] = predictions_unseen

# Display the first few rows to verify the predictions
print(test_data.head())

   id                                           abstract predicted_class
0   1  in a previous work all three components of com...               A
1   2  we compared morphology of two geographically c...               E
2   3  factor xiii mr 320000 is a blood coagulation f...               E
3   4  we report the characterisation of a human gene...               E
4   5  fat tissue plays a critical role in the regula...               E


#### Output file

In [91]:
# Destination of the prediction file.
# NOTE(review): absolute, machine-specific path.
output_file_path = 'C:/Users/heave/Desktop/CS361/A2/predicted_classes_2.csv'

# Save the DataFrame (including the predicted_class column) to a CSV file
test_data.to_csv(output_file_path, index=False)  # Set index=False to avoid writing row indices in the file