In [None]:
%%capture
!pip install wget

In [None]:
import numpy as np
import pandas as pd
# Please add other necessary imports here
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import wget
from pathlib import Path

# wget.download() never overwrites an existing file: it stores the new copy under
# a " (1)"-suffixed name instead. Skipping files that are already present keeps
# the expected file names in place when this cell is re-run.
_BASE_URL = "https://github.com/MIE451-1513-2023/course-datasets/raw/main/"
for _name in ("20_newsgroups.zip", "training_files_Q7.txt", "testing_files_Q7.txt"):
    if not Path(_name).exists():
        wget.download(_BASE_URL + _name, _name)

# Local path of the downloaded dataset archive.
filename = "20_newsgroups.zip"

In [None]:
%%capture
# Extract the downloaded archive into the working directory; %%capture hides
# the per-file listing that unzip prints.
!unzip 20_newsgroups.zip

In [None]:
# Directory holding the extracted dataset (one sub-directory per topic is
# assumed by the encoding code, which reads the topic from the parent folder).
DATA_DIR = "20_newsgroups"

# Every regular, non-hidden file under DATA_DIR. Glob order depends on the
# filesystem, so the paths are sorted to give the same ordering on every run.
ALL_FILES = sorted(
    pth
    for pth in Path(DATA_DIR).glob("**/*")
    if pth.is_file() and not pth.name.startswith(".")
)

# Q7

## Q7(a)

Use the following code cell to implement your feature encoding.

In [None]:
def data_q7(file_list, num_words=1000):
    """Encode newsgroup posts as a dense TF-IDF matrix plus integer topic labels.

    Each file is read, selected header lines are stripped, and the remaining
    text is tokenized (alphabetic tokens only), lower-cased, filtered against
    the NLTK English stop-word list and Porter-stemmed before TF-IDF weighting.

    Args:
        file_list: Iterable of file paths (``pathlib.Path`` or ``str``). The
            name of each file's parent directory is taken as its topic.
        num_words: Unused; kept so the signature stays compatible with existing
            callers. The vocabulary is capped at 8000 terms.

    Returns:
        X: ``pd.DataFrame`` with one row per file (indexed by ``str(path)``)
            and one column per vocabulary term.
        y: ``list`` of integer topic ids, aligned with the rows of ``X``.

    Raises:
        ValueError: If a file's parent directory is not one of the known topics.
    """
    # Materialise once: the paths are traversed several times below.
    paths = list(file_list)

    tokenizer = RegexpTokenizer(r"\b[a-zA-Z]+\b")
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    # Header fields removed from every post. The substitutions run one after
    # another in exactly this order.
    header_names = ("Newsgroups", "Xref", "Path", "Date", "Followup-To", "Lines",
                    "Reply-To", "Message-ID", "From", "NNTP-Posting-Host")

    # Position in this list is the integer label of the topic.
    topics = ["talk.politics.mideast", "rec.autos", "comp.sys.mac.hardware", "alt.atheism", "rec.sport.baseball",
              "comp.os.ms-windows.misc", "rec.sport.hockey", "sci.crypt", "sci.med", "talk.politics.misc",
              "rec.motorcycles", "comp.windows.x", "comp.graphics", "comp.sys.ibm.pc.hardware", "sci.electronics",
              "talk.politics.guns", "sci.space", "soc.religion.christian", "misc.forsale", "talk.religion.misc"]

    def clean_file_text(text):
        """Drop every '<header>: ...' span (through the end of its line) from the text."""
        for header in header_names:
            text = re.sub(header + ":.*?\n", "", text)
        return text

    def get_target(file_path):
        """Map a file to its label via the name of its parent directory."""
        return topics.index(Path(file_path).parent.name)

    def stemmed_tokenizer(doc):
        """Tokenize, lower-case, remove stop words, then stem."""
        lowered = (token.lower() for token in tokenizer.tokenize(doc))
        return [stemmer.stem(token) for token in lowered if token not in stop_words]

    def load_file_content(file_path):
        """Read one post (undecodable bytes are ignored) and strip its headers."""
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            return clean_file_text(file.read())

    # Use the files passed in by the caller rather than the ALL_FILES global.
    y = [get_target(file_path) for file_path in paths]
    documents = [load_file_content(file_path) for file_path in paths]

    # A CountVectorizer with 20000 features was tried and did not do as well as TF-IDF.
    vectorizer = TfidfVectorizer(max_features=8000, tokenizer=stemmed_tokenizer)
    X_prime = vectorizer.fit_transform(documents)
    # NOTE: toarray() densifies the matrix (n_files x n_terms floats in memory).
    X = pd.DataFrame(X_prime.toarray(), index=[str(f) for f in paths], columns=vectorizer.get_feature_names_out())

    print("finish encoding")
    assert isinstance(X, pd.DataFrame) and isinstance(y, list), "incorrect return types"
    return X, y

## Q7(b)

Use the following code cell to implement your model

In [None]:
def build_model_q7():
    """Create the unfitted Q7 classifier.

    Returns:
        A one-vs-rest ``LogisticRegression`` with balanced class weights.

    Alternatives that were tried: ``MultinomialNB(alpha=0.01)``, noted as not
    as good as logistic regression, and an ``MLPClassifier`` with hidden
    layers (256, 128).
    """
    hyperparams = dict(
        C=10,
        max_iter=500,
        class_weight='balanced',
        multi_class='ovr',
        n_jobs=-1,
    )
    return LogisticRegression(**hyperparams)

Code for evaluating p at k

In [None]:
def calculate_average_precision_at_k(model_q7, data_func, all_files, training_files, testing_files, k=None):
    """Fit a model on the training split and report average precision at k on the test split.

    Test documents are ranked by the model's confidence (largest predicted
    class probability, most confident first). With ``rel_i`` = 1 when the
    prediction at rank ``i`` is correct and ``P@i`` the fraction of correct
    predictions among the top ``i``, the score is ``sum_i(P@i * rel_i) / k``
    over ranks ``1..k``.

    Args:
        model_q7: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        data_func: Callable ``data_func(all_files) -> (X, y)`` where ``X`` is a
            DataFrame indexed by file path string and ``y`` is the list of
            labels aligned with its rows.
        all_files: Passed through unchanged to ``data_func``.
        training_files: Path to a text file with one training index label per line.
        testing_files: Path to a text file with one testing index label per line.
        k: Number of top-ranked test documents to score; defaults to all of them.

    Returns:
        The average precision at ``k`` as a float.

    Raises:
        ValueError: If ``k`` is not between 1 and the number of testing files.
    """
    # Context managers make sure the split files are closed again.
    with open(training_files, mode='r') as handle:
        training_files = handle.read().splitlines()
    with open(testing_files, mode='r') as handle:
        testing_files = handle.read().splitlines()

    if k is None:
        k = len(testing_files)
    # Validate before the (expensive) encoding step; k == 0 would divide by zero.
    if not 1 <= k <= len(testing_files):
        raise ValueError(
            f"k must be between 1 and the number of testing files ({len(testing_files)}), got {k}"
        )

    X, y = data_func(all_files)
    # Keep the labels in their own Series instead of adding a "gt" column to X:
    # that mutated the caller's frame and clobbered a real feature named "gt".
    labels = pd.Series(y, index=X.index)

    X_train = X.loc[training_files]
    y_train = labels.loc[training_files].values
    X_test = X.loc[testing_files]
    y_test = labels.loc[testing_files].values

    model_q7.fit(X_train, y_train)
    y_pred = np.asarray(model_q7.predict(X_test))
    confidences = np.max(model_q7.predict_proba(X_test), axis=1)

    # Positions of the k most confident predictions, most confident first.
    ranked = np.argsort(confidences)[::-1][:k]
    hits = (y_pred[ranked] == y_test[ranked])
    # P@i for i = 1..k via a running count of correct predictions (linear time).
    precision_at_i = np.cumsum(hits) / np.arange(1, k + 1)
    average_precision = np.dot(precision_at_i, hits) / k

    print(f"average precision at {k} is {average_precision}")
    return average_precision

In [None]:
# Fit the Q7 model and score it on the provided train/test split.
m = calculate_average_precision_at_k(
    model_q7=build_model_q7(),
    data_func=data_q7,
    all_files=ALL_FILES,
    training_files="training_files_Q7.txt",
    testing_files="testing_files_Q7.txt",
)



finish encoding
average precision at 4000 is 0.83744161797728


## Q7(c)

**Feature Set and Feature Encoding:**

The chosen feature set is based on TF-IDF encoding, which captures how important each term is for distinguishing between topics. Each document is first tokenized with a `RegexpTokenizer` and converted to lower case. Stop words are then removed from the list of tokens, and each remaining token is stemmed.

**Classifier:**

The chosen classifier is Logistic Regression. Logistic Regression is a well-established classification algorithm that works well for text classification tasks. It can handle high-dimensional feature spaces, making it a good choice when using TF-IDF encoding.

TF-IDF encoding was chosen because it is effective for text classification tasks. It takes into account both term frequency and inverse document frequency, which helps capture the importance of words in distinguishing between different topics. It also works well with text data that has a large number of features.

Logistic Regression was chosen because it is a simple but effective classification model. It outperforms the other methods I have tried.

Final: 0.838 with a vocabulary of 8000 words

**multiNB** \
2000 T: 0.75316 & 0.750048 | C: 0.684028 \
4000 T: 0.800236 & 0.794596 | C: 0.727337 & 0.721242 \
10000 T: 0.836234 & 0.831411 | C: 0.767681 & 0.760828\
20000 T: 0.849466 & 0.845272 | C: 0.779938 & 0.774953


**LR**  
4000 T: 0.814094 & 0.810094 \
8000 T: 0.838160 & 0.836946 \
10000 T: 0.844272 & 0.842257

**MLP**\
4000 T: 0.802547 & 0.800840 \
10000 T: 0.834676 & 0.832038 \
20000 T: 0.836802 & 0.836633