Step 1: Pre‑processing Functions and Feature Extraction
This step includes functions to tokenize, remove stop words, apply stemming, and lemmatize text. It also includes a helper to extract features as a bag‑of‑words.

In [1]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download required NLTK resources (only needed once; packages that are
# already present are reported as up-to-date).
#   punkt / punkt_tab -> tokenizer models used by word_tokenize
#   stopwords         -> stopwords.words('english')
#   wordnet           -> WordNetLemmatizer
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# both are requested so the notebook works on old and new versions alike.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess(text):
    """
    Turn raw text into a list of normalized tokens.

    Steps, in order:
      - Lowercase the text and tokenize it with ``word_tokenize``.
      - Drop tokens that consist solely of punctuation characters.
      - Drop English stop words.
      - Stem each remaining token with the Porter stemmer.
      - Lemmatize each stem with the WordNet lemmatizer.

    Args:
        text: The input string.

    Returns:
        A list of cleaned tokens, in their original order.
    """
    tokens = word_tokenize(text.lower())

    # Check every character of a token against the punctuation set.  A plain
    # ``word not in string.punctuation`` is a substring test, so multi-character
    # punctuation tokens such as "...", "--", "``" or "''" would slip through.
    punctuation = set(string.punctuation)
    tokens = [
        word for word in tokens
        if not all(char in punctuation for char in word)
    ]

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Stem first, then lemmatize the stem (same order as before).
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens]

def extract_features(words):
    """
    Build a bag-of-words presence dictionary from a list of words.

    Every distinct word becomes a key mapped to True; repeated words
    collapse into a single entry.
    """
    return dict.fromkeys(words, True)

# Example usage: when this file is run as a script, pre-process one sample
# sentence and print the resulting tokens.
if __name__ == "__main__":
    sample_text = "This is a sample text to demonstrate pre-processing using NLTK."
    print("Processed tokens:", preprocess(sample_text))


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/majidtavakoli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/majidtavakoli/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/majidtavakoli/nltk_data...


Processed tokens: ['sampl', 'text', 'demonstr', 'pre-process', 'use', 'nltk']


Step 2: Data Collection from Wikipedia
This part uses the wikipedia library to fetch content from selected Wikipedia pages. Adjust the page titles in collect_dataset to suit your needs.

In [4]:
import wikipedia
import random



def get_wikipedia_content(page_title):
    """
    Fetch the text content of a Wikipedia page by title.

    The lookup is done with auto-suggest disabled so the exact title is used.
    Two fallbacks are attempted:
      - On a DisambiguationError, the first listed option is fetched instead.
      - On a PageError, ``wikipedia.search()`` is queried and the first result
        is fetched instead.

    This function is best-effort: every failure is printed and reported to the
    caller as an empty string rather than raised.

    Args:
        page_title: Title of the page to fetch.

    Returns:
        The page content as a string, or "" if nothing could be fetched.
    """
    try:
        # Disable auto_suggest to force the use of the exact title
        return wikipedia.page(page_title, auto_suggest=False).content
    except wikipedia.DisambiguationError as e:
        print(f"Disambiguation error for '{page_title}'. Options: {e.options}")
        # Guard against an empty option list: indexing it would raise from
        # inside this handler and escape the function.
        if not e.options:
            return ""
        fallback_title = e.options[0]
        try:
            return wikipedia.page(fallback_title, auto_suggest=False).content
        except Exception as ex:
            print(f"Error fetching disambiguated page '{fallback_title}': {ex}")
            return ""
    except wikipedia.PageError as e:
        print(f"Page error for '{page_title}': {e}")
        # The search call can fail too (e.g. network problems); keep the
        # "return '' on any error" contract instead of letting it propagate.
        try:
            search_results = wikipedia.search(page_title)
        except Exception as ex:
            print(f"Error searching for '{page_title}': {ex}")
            return ""
        if not search_results:
            return ""
        fallback_title = search_results[0]
        try:
            return wikipedia.page(fallback_title, auto_suggest=False).content
        except Exception as ex:
            print(f"Error fetching page '{fallback_title}': {ex}")
            return ""
    except Exception as e:
        print(f"Error fetching page '{page_title}': {e}")
        return ""


def collect_dataset(geographic_pages=None, non_geographic_pages=None, snippet_length=1000):
    """
    Collect a labeled dataset from Wikipedia.

    Each page is fetched with get_wikipedia_content(); the leading
    ``snippet_length`` characters of its content become one sample. Pages
    whose content comes back empty are skipped. The samples are shuffled
    in place before being returned.

    Args:
        geographic_pages: Titles to label "geographic". Defaults to a small
            built-in list of places.
        non_geographic_pages: Titles to label "non-geographic". Defaults to a
            small built-in list of subjects.
        snippet_length: Number of leading characters kept from each page.

    Returns:
        A list of (text_snippet, label) tuples in random order.
    """
    if geographic_pages is None:
        geographic_pages = ["New York City", "Paris", "Mount Everest", "Amazon Rainforest"]
    if non_geographic_pages is None:
        non_geographic_pages = ["Computer Science", "Modern Art", "Philosophy", "Quantum Mechanics"]

    # Geographic titles are fetched first, then non-geographic ones.
    labeled_titles = (
        [(title, "geographic") for title in geographic_pages]
        + [(title, "non-geographic") for title in non_geographic_pages]
    )

    dataset = []
    for title, label in labeled_titles:
        content = get_wikipedia_content(title)
        if content:  # "" means the page could not be fetched
            dataset.append((content[:snippet_length], label))

    random.shuffle(dataset)
    return dataset

# Example usage: collect the dataset and print each sample's label together
# with the first 100 characters of its text.
if __name__ == "__main__":
    data = collect_dataset()
    print("Collected dataset samples:")
    for text, label in data:
        print(f"Label: {label} | Text snippet: {text[:100]}...")


Collected dataset samples:
Label: non-geographic | Text snippet: Quantum mechanics is a fundamental theory that describes the behavior of nature at and below the sca...
Label: non-geographic | Text snippet: Computer science is the study of computation, information, and automation. Computer science spans th...
Label: geographic | Text snippet: Paris (French pronunciation: [paʁi] ) is the capital and largest city of France. With an estimated p...
Label: non-geographic | Text snippet: Modern art includes artistic work produced during the period extending roughly from the 1860s to the...
Label: non-geographic | Text snippet: Philosophy is a systematic study of general and fundamental questions concerning topics like existen...
Label: geographic | Text snippet: The Amazon rainforest, also called Amazon jungle or Amazonia, is a moist broadleaf tropical rainfore...
Label: geographic | Text snippet: Mount Everest, known locally as Sagarmatha or Qomolangma,  is Earth's highest mountain above se

Step 3: Training the Naive Bayes Classifier Using NLTK
This segment preprocesses the collected data, extracts features, splits the dataset, trains a Naive Bayes classifier, and evaluates its performance.

In [5]:
import nltk

def train_naive_bayes(dataset):
    """
    Train and evaluate an NLTK Naive Bayes classifier on (text, label) pairs.

    Every text goes through preprocess() and extract_features(). The first 80%
    of the resulting feature sets (in the order given) form the training set
    and the remainder the test set. The test accuracy and the five most
    informative features are printed.

    Returns the trained classifier.
    """
    labeled_features = []
    for text, label in dataset:
        labeled_features.append((extract_features(preprocess(text)), label))

    # 80/20 split by position; no shuffling happens here.
    cutoff = int(len(labeled_features) * 0.8)
    training_part = labeled_features[:cutoff]
    held_out_part = labeled_features[cutoff:]

    classifier = nltk.NaiveBayesClassifier.train(training_part)
    print("Naive Bayes Classifier Accuracy:", nltk.classify.accuracy(classifier, held_out_part))
    print("Most Informative Features:")
    classifier.show_most_informative_features(5)
    return classifier

# Example usage: collect a fresh dataset and, if anything was fetched, train
# and evaluate the Naive Bayes classifier on it.
if __name__ == "__main__":
    dataset = collect_dataset()
    if dataset:
        train_naive_bayes(dataset)


Naive Bayes Classifier Accuracy: 0.5
Most Informative Features:
Most Informative Features
                    area = None           non-ge : geogra =      2.8 : 1.0
                   estim = None           non-ge : geogra =      2.8 : 1.0
                     km2 = None           non-ge : geogra =      2.8 : 1.0
                 largest = None           non-ge : geogra =      2.8 : 1.0
                   major = None           non-ge : geogra =      2.8 : 1.0


Step 4: Training the Logistic Regression Classifier Using scikit‑learn
This section converts the feature dictionaries into vectors using scikit‑learn’s DictVectorizer, then trains and evaluates a Logistic Regression classifier.

In [6]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def train_logistic_regression(dataset):
    """
    Train and evaluate a Logistic Regression classifier on (text, label) pairs.

    Every text goes through preprocess() and extract_features(). The feature
    dictionaries are split 80/20 with a fixed random seed, vectorized with a
    DictVectorizer fitted on the training part only, and used to fit and score
    the classifier. The test accuracy is printed.

    Args:
        dataset: Iterable of (text, label) tuples.

    Returns:
        The fitted LogisticRegression classifier.
    """
    # Single pass over the dataset so features and labels stay aligned even
    # when the input is a one-shot iterable.
    feature_dicts = []
    labels = []
    for text, label in dataset:
        feature_dicts.append(extract_features(preprocess(text)))
        labels.append(label)

    # Split before vectorizing so the test samples do not influence the
    # vocabulary learned by the vectorizer.
    train_dicts, test_dicts, y_train, y_test = train_test_split(
        feature_dicts, labels, test_size=0.2, random_state=42
    )

    vectorizer = DictVectorizer(sparse=False)
    X_train = vectorizer.fit_transform(train_dicts)
    # Features not seen during fitting are ignored by transform().
    X_test = vectorizer.transform(test_dicts)

    lr_classifier = LogisticRegression(max_iter=200)
    lr_classifier.fit(X_train, y_train)
    y_pred = lr_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Logistic Regression Classifier Accuracy:", accuracy)
    return lr_classifier

# Example usage: collect a fresh dataset and, if anything was fetched, train
# and evaluate the Logistic Regression classifier on it.
if __name__ == "__main__":
    dataset = collect_dataset()
    if dataset:
        train_logistic_regression(dataset)


Logistic Regression Classifier Accuracy: 1.0


Step 5: Main Function to Run All Steps
The main function ties everything together: data collection, training the Naive Bayes classifier, and training the Logistic Regression classifier.

In [None]:
def main():
    """Run the full pipeline: collect the dataset, then train both classifiers."""
    print("Collecting dataset from Wikipedia...")
    dataset = collect_dataset()

    if dataset:
        # Naive Bayes first, then Logistic Regression, on the same dataset.
        print("\nTraining Naive Bayes Classifier with NLTK...")
        train_naive_bayes(dataset)

        print("\nTraining Logistic Regression Classifier with scikit‑learn...")
        train_logistic_regression(dataset)
    else:
        print("Dataset is empty. Exiting.")

if __name__ == "__main__":
    main()
