Step 1: Pre‑processing Functions and Feature Extraction
This step includes functions to tokenize, remove stop words, apply stemming, and lemmatize text. It also includes a helper to extract features as a bag‑of‑words.

In [1]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages this notebook relies on: the Punkt tokenizer
# models, the stop-word lists, and the WordNet database.
for _resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_resource)

def preprocess(text):
    """
    Clean raw text into a list of normalized tokens.

    Steps, in order:
      1. Lowercase the text and tokenize it with NLTK's ``word_tokenize``.
      2. Drop tokens made up entirely of punctuation characters.
      3. Drop English stop words (NLTK's ``stopwords`` corpus).
      4. Stem every remaining token with the Porter stemmer.
      5. Lemmatize every stem with the WordNet lemmatizer.

    Args:
        text: The raw input string.

    Returns:
        A list of cleaned tokens (stemmed, then lemmatized), in their
        original order.
    """
    tokens = word_tokenize(text.lower())

    # A token counts as punctuation when *every* character is a punctuation
    # mark. Testing `token in string.punctuation` would be a substring check
    # against the punctuation string, which lets multi-character tokens such
    # as "``", "''" or "..." through.
    punctuation_chars = set(string.punctuation)
    tokens = [
        word for word in tokens
        if not all(char in punctuation_chars for char in word)
    ]

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    tokens_stemmed = [stemmer.stem(word) for word in tokens]

    # Lemmatization is applied to the stems, not to the original words.
    lemmatizer = WordNetLemmatizer()
    tokens_lemmatized = [lemmatizer.lemmatize(word) for word in tokens_stemmed]
    return tokens_lemmatized

def extract_features(words):
    """
    Build a bag-of-words feature dictionary from an iterable of words.

    Every distinct word becomes a key mapped to ``True``; repeated words
    collapse into a single entry.
    """
    return dict.fromkeys(words, True)

# Example usage: run the pipeline on one sentence and show the resulting tokens.
if __name__ == "__main__":
    sample_text = "This is a sample text to demonstrate pre-processing using NLTK."
    processed_tokens = preprocess(sample_text)
    print("Processed tokens:", processed_tokens)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/majidtavakoli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/majidtavakoli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/majidtavakoli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Processed tokens: ['sampl', 'text', 'demonstr', 'pre-process', 'use', 'nltk']


Step 2: Data Collection from Wikipedia
This part uses the wikipedia library to fetch content from selected Wikipedia pages. Adjust the titles to suit your needs.
Add the following and change the code to get higher accuracy!
Search-Based Expansion:
Uses the wikipedia.search() function with multiple queries to fetch more pages for each class.

Category-Based Expansion:
Uses the MediaWiki API directly to get pages from specific Wikipedia categories.

In [2]:
import wikipedia
import requests
import random

# Make sure you include the get_wikipedia_content function with improved error handling.
def get_wikipedia_content(page_title):
    """
    Fetch the content of a Wikipedia page by title, on a best-effort basis.

    Auto-suggest is disabled so the title is looked up exactly as given.
    Fallbacks:
      - On a DisambiguationError, the first listed option is fetched instead.
      - On a PageError, ``wikipedia.search()`` is used and the first search
        result is fetched instead.

    Every failure is reported with ``print`` and never raised to the caller.

    Args:
        page_title: Title of the page to fetch.

    Returns:
        The page content as a string, or "" if nothing could be retrieved.
    """
    try:
        page = wikipedia.page(page_title, auto_suggest=False)
        return page.content
    except wikipedia.DisambiguationError as e:
        print(f"Disambiguation error for '{page_title}'. Options: {e.options}")
        # With no options there is nothing to fall back to; indexing an empty
        # list here would raise instead of returning "".
        if not e.options:
            return ""
        first_option = e.options[0]
        try:
            page = wikipedia.page(first_option, auto_suggest=False)
            return page.content
        except Exception as ex:
            print(f"Error fetching disambiguated page '{first_option}': {ex}")
            return ""
    except wikipedia.PageError as e:
        print(f"Page error for '{page_title}': {e}")
        # The search runs inside an except handler, so the sibling
        # `except Exception` clause below does not cover it; guard it
        # explicitly to keep the "return '' on failure" contract.
        try:
            search_results = wikipedia.search(page_title)
        except Exception as ex:
            print(f"Error searching for '{page_title}': {ex}")
            return ""
        if not search_results:
            return ""
        best_match = search_results[0]
        try:
            page = wikipedia.page(best_match, auto_suggest=False)
            return page.content
        except Exception as ex:
            print(f"Error fetching page '{best_match}': {ex}")
            return ""
    except Exception as e:
        print(f"Error fetching page '{page_title}': {e}")
        return ""

# ----------------------------------------
# Option 1: Expanded Dataset Using Searches
# ----------------------------------------
def collect_dataset_expanded_search(num_pages=5):
    """
    Collect a labeled dataset by running keyword searches on Wikipedia.

    For each search query, up to ``num_pages`` result titles are fetched and
    the first 1000 characters of each page become one sample. Each page title
    is used at most once, even if several queries return it, so the dataset
    contains no duplicate samples and no page carries both labels.

    Args:
        num_pages: Maximum number of search results requested per query.

    Returns:
        A shuffled list of ``(snippet, label)`` tuples, where label is
        "geographic" or "non-geographic".
    """
    # Search queries for each class, processed in this order.
    labeled_queries = [
        ("geographic",
         ["city", "country", "mountain", "river", "geography", "capital"]),
        ("non-geographic",
         ["technology", "philosophy", "art", "science", "history", "literature"]),
    ]

    dataset = []
    # Different queries can return the same title; duplicates would otherwise
    # end up on both sides of a later train/test split.
    seen_titles = set()

    for label, queries in labeled_queries:
        for query in queries:
            for title in wikipedia.search(query, results=num_pages):
                if title in seen_titles:
                    continue
                seen_titles.add(title)
                content = get_wikipedia_content(title)
                if content:
                    snippet = content[:1000]  # Use the first 1000 characters as a sample
                    dataset.append((snippet, label))

    random.shuffle(dataset)
    return dataset

# ------------------------------------------
# Option 2: Expanded Dataset Using Categories
# ------------------------------------------
def get_category_members(category, limit=20):
    """
    Fetch the titles of article pages in a Wikipedia category via the MediaWiki API.

    Only regular pages are requested (``cmtype=page``); subcategories and
    files are excluded because their titles are not article titles.

    Args:
        category: Category name without the "Category:" prefix.
        limit: Maximum number of members to request (``cmlimit``).

    Returns:
        A list of page titles. An empty list is returned, after printing the
        error, if the request fails or the response is not valid JSON.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmtype": "page",
        "cmlimit": limit,
        "format": "json",
    }
    try:
        # A timeout keeps a stalled connection from blocking collection forever.
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as ex:
        # ValueError covers JSON decoding failures on older requests versions.
        print(f"Error fetching members of category '{category}': {ex}")
        return []
    members = data.get("query", {}).get("categorymembers", [])
    return [member["title"] for member in members]

def collect_dataset_expanded_category(geo_category="Cities", non_geo_category="Technology", limit=20):
    """
    Build a labeled dataset from the members of two Wikipedia categories.

    Pages listed under ``geo_category`` are labeled "geographic" and pages
    listed under ``non_geo_category`` are labeled "non-geographic". Each
    sample is the first 1000 characters of a page; pages whose content
    comes back empty are skipped. The result is shuffled before returning.
    """
    labeled_categories = (
        (geo_category, "geographic"),
        (non_geo_category, "non-geographic"),
    )

    samples = []
    for category_name, label in labeled_categories:
        for page_title in get_category_members(category_name, limit):
            page_text = get_wikipedia_content(page_title)
            if not page_text:
                continue
            samples.append((page_text[:1000], label))

    random.shuffle(samples)
    return samples

# ------------------------------------------
# How to Apply These in Your Project (Step 2)
# ------------------------------------------
def collect_dataset():
    """
    Gather samples with both the search-based and the category-based
    collectors, report how many each produced, and return them merged
    into one shuffled list.
    """
    # Search-driven samples first.
    search_samples = collect_dataset_expanded_search(num_pages=5)
    print(f"Collected {len(search_samples)} samples from search-based queries.")

    # Then samples drawn from category listings.
    category_samples = collect_dataset_expanded_category(
        geo_category="Cities",
        non_geo_category="Technology",
        limit=10,
    )
    print(f"Collected {len(category_samples)} samples from category-based queries.")

    combined = [*search_samples, *category_samples]
    random.shuffle(combined)
    return combined

# Example usage: collect the combined dataset and preview every sample.
if __name__ == "__main__":
    expanded_dataset = collect_dataset()
    print("Collected expanded dataset samples:")
    for snippet, category in expanded_dataset:
        preview = snippet[:100]
        print(f"Label: {category} | Text snippet: {preview}...")




  lis = BeautifulSoup(html).find_all('li')


Disambiguation error for 'City (disambiguation)'. Options: ['City (novel)', 'City (magazine)', 'City: Magazine International', 'The City (website)', 'City (newspaper)', 'City Newspaper', '"The City" (short story)', 'Al-Balad', 'City (manga)', 'City (journal)', 'Cities (journal)', 'City (Client album)', 'City (Jane Siberry album)', 'City (Strapping Young Lad album)', 'Deserts Chang', 'Cities (Anberlin album)', 'Cities (The Cat Empire album)', 'City (band)', 'The City (band)', 'Carole King', 'City (Weapons of Peace song)', 'Swan Songs', 'Little Voice', '"Cities" (song)', 'Nat & Alex Wolff', 'Sound Shapes', 'Cities', 'Ai Furihata', 'Mystery To Me', 'City, Australian Capital Territory', 'City (Zürich)', 'City, Powys', 'City, Vale of Glamorgan', 'City of London', 'City (TV series)', 'Citytv', 'CITY-DT', 'Honda City', 'City Airline', 'Think City', 'Aixam City', 'City (artwork)', 'City (typeface)', 'City Interactive', 'Manchester City F.C.', 'The City (disambiguation)', 'Citi (disambiguation)

Step 3: Training the Naive Bayes Classifier Using NLTK
This segment preprocesses the collected data, extracts features, splits the dataset, trains a Naive Bayes classifier, and evaluates its performance.

In [3]:
import nltk

def train_naive_bayes(dataset):
    """
    Train and evaluate an NLTK Naive Bayes classifier on (text, label) pairs.

    Each text is preprocessed and turned into a feature dictionary. The first
    80% of the samples (in the order given) are used for training and the
    remaining 20% for testing. The accuracy and the five most informative
    features are printed, and the trained classifier is returned.
    """
    # Turn every (text, label) pair into a (feature dict, label) pair.
    labeled_features = []
    for raw_text, tag in dataset:
        labeled_features.append((extract_features(preprocess(raw_text)), tag))

    # 80/20 split by position.
    cutoff = int(len(labeled_features) * 0.8)
    training_portion = labeled_features[:cutoff]
    held_out_portion = labeled_features[cutoff:]

    model = nltk.NaiveBayesClassifier.train(training_portion)
    score = nltk.classify.accuracy(model, held_out_portion)
    print("Naive Bayes Classifier Accuracy:", score)
    print("Most Informative Features:")
    model.show_most_informative_features(5)
    return model

# Example usage: collect a dataset and train the Naive Bayes classifier on it.
if __name__ == "__main__":
    # collect_dataset() is called again here, so the data is re-collected
    # rather than reused from an earlier cell.
    dataset = collect_dataset()
    # Skip training when collection produced no samples.
    if dataset:
        train_naive_bayes(dataset)


Disambiguation error for 'City (disambiguation)'. Options: ['City (novel)', 'City (magazine)', 'City: Magazine International', 'The City (website)', 'City (newspaper)', 'City Newspaper', '"The City" (short story)', 'Al-Balad', 'City (manga)', 'City (journal)', 'Cities (journal)', 'City (Client album)', 'City (Jane Siberry album)', 'City (Strapping Young Lad album)', 'Deserts Chang', 'Cities (Anberlin album)', 'Cities (The Cat Empire album)', 'City (band)', 'The City (band)', 'Carole King', 'City (Weapons of Peace song)', 'Swan Songs', 'Little Voice', '"Cities" (song)', 'Nat & Alex Wolff', 'Sound Shapes', 'Cities', 'Ai Furihata', 'Mystery To Me', 'City, Australian Capital Territory', 'City (Zürich)', 'City, Powys', 'City, Vale of Glamorgan', 'City of London', 'City (TV series)', 'Citytv', 'CITY-DT', 'Honda City', 'City Airline', 'Think City', 'Aixam City', 'City (artwork)', 'City (typeface)', 'City Interactive', 'Manchester City F.C.', 'The City (disambiguation)', 'Citi (disambiguation)

Step 4: Training the Logistic Regression Classifier Using scikit‑learn
This section converts the feature dictionaries into vectors using scikit‑learn’s DictVectorizer, then trains and evaluates a Logistic Regression classifier.

In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def train_logistic_regression(dataset):
    """
    Train and evaluate a scikit-learn Logistic Regression classifier.

    Texts are preprocessed, converted to feature dictionaries, and vectorized
    into a dense matrix with DictVectorizer. The data is split 80/20 with a
    fixed random seed; the test accuracy is printed and the fitted classifier
    is returned.
    """
    # Feature dictionaries and their labels, kept in dataset order.
    feature_dicts = [extract_features(preprocess(raw_text)) for raw_text, _ in dataset]
    targets = [tag for _, tag in dataset]

    # Dense one-column-per-token matrix.
    dict_vectorizer = DictVectorizer(sparse=False)
    matrix = dict_vectorizer.fit_transform(feature_dicts)

    train_x, test_x, train_y, test_y = train_test_split(
        matrix,
        targets,
        test_size=0.2,
        random_state=42,
    )

    model = LogisticRegression(max_iter=200)
    model.fit(train_x, train_y)

    predictions = model.predict(test_x)
    score = accuracy_score(test_y, predictions)
    print("Logistic Regression Classifier Accuracy:", score)
    return model

# Example usage: collect a dataset and train the Logistic Regression classifier on it.
if __name__ == "__main__":
    # collect_dataset() is called again here, so the data is re-collected
    # rather than reused from an earlier cell.
    dataset = collect_dataset()
    # Skip training when collection produced no samples.
    if dataset:
        train_logistic_regression(dataset)


Disambiguation error for 'City (disambiguation)'. Options: ['City (novel)', 'City (magazine)', 'City: Magazine International', 'The City (website)', 'City (newspaper)', 'City Newspaper', '"The City" (short story)', 'Al-Balad', 'City (manga)', 'City (journal)', 'Cities (journal)', 'City (Client album)', 'City (Jane Siberry album)', 'City (Strapping Young Lad album)', 'Deserts Chang', 'Cities (Anberlin album)', 'Cities (The Cat Empire album)', 'City (band)', 'The City (band)', 'Carole King', 'City (Weapons of Peace song)', 'Swan Songs', 'Little Voice', '"Cities" (song)', 'Nat & Alex Wolff', 'Sound Shapes', 'Cities', 'Ai Furihata', 'Mystery To Me', 'City, Australian Capital Territory', 'City (Zürich)', 'City, Powys', 'City, Vale of Glamorgan', 'City of London', 'City (TV series)', 'Citytv', 'CITY-DT', 'Honda City', 'City Airline', 'Think City', 'Aixam City', 'City (artwork)', 'City (typeface)', 'City Interactive', 'Manchester City F.C.', 'The City (disambiguation)', 'Citi (disambiguation)