In [26]:
import json
import pandas as pd
from tqdm import tqdm
from langchain_community.llms import Ollama
from sklearn.datasets import fetch_20newsgroups

# Dataset

In [27]:
# Load the 20 newsgroups dataset
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

news_test = pd.DataFrame({'text': newsgroups_test.data, 'label': newsgroups_test.target})

news_test['label'] = news_test['label'].map(lambda x: newsgroups_test.target_names[x])

In [28]:
news_test.head(2)

Unnamed: 0,text,label
0,I am a little confused on all of the models of...,rec.autos
1,I'm not familiar at all with the format of the...,comp.windows.x


# Modeling

In [29]:
llm = Ollama(model="gemma2")

### Step 0 - Defining Topics according to the dataset

In [30]:
all_topics = [
'rec.autos (Discussions about cars and automobiles)', 
 'rec.motorcycles (Discussions about motorcycles and related topics)', 
 'rec.sport.baseball (Baseball teams, players, and games)', 
 'rec.sport.hockey (Hockey leagues, teams, and players)', 
 'soc.religion.christian (Christianity, its doctrines, and practices)', 
 'comp.sys.ibm.pc.hardware (IBM PC-compatible hardware and troubleshooting)', 
 'comp.graphics (Computer graphics, including rendering and 3D modeling)', 
 'comp.windows.x (The X Window System for graphical user interfaces on UNIX-like systems)',
 'comp.sys.mac.hardware (Apple Macintosh hardware and troubleshooting)', 
 'comp.os.ms-windows.misc (Miscellaneous topics about Microsoft Windows)', 
 'talk.politics.guns (Gun politics, legislation, and rights)', 
 'talk.politics.misc (General political discussions)', 
 'talk.politics.mideast (Politics and current events in the Middle East)', 
 'talk.religion.misc (General religious discussions)', 
 'sci.med (Topics about medical science, health, and treatments)', 
 'sci.space (Space exploration, astronomy, and related science)', 
 'sci.crypt (Cryptography, including encryption and security techniques)', 
 'sci.electronics (Electronics, circuit design, and troubleshooting)',
 'misc.forsale (Items for sale and related discussions)', 
 'alt.atheism (Debates and discussions about atheism and related topics)', 
 ]

### Step 3 - Assigning batches of dataset to Topics Generated in Step 2 and sentiment of the review

In [31]:
def assign_topics_in_batches(input_df, topics, batch_size, llm=None):
    """
    Assign topics to News Group in batches and update DataFrame directly.

    Parameters:
    input_df (pd.DataFrame): DataFrame containing Amazon reviews
    topics (str): String of topics to assign from
    batch_size (int): Number of reviews to process in each batch
    llm: The language model instance to use for assigning topics

    Returns:
    pd.DataFrame: Updated DataFrame with topic and sentiment assignments
    """
    if llm is None:
        raise ValueError("LLM instance must be provided")

    # Create a copy of the DataFrame to avoid modifying the original
    df = input_df.copy()

    # Initialize Topic and Sentiment columns with 'Unknown'
    df['Predicted Topic'] = 'Unknown'

    total_df = len(df)

    # Process reviews in batches with progress bar
    for start_idx in tqdm(range(0, total_df, batch_size), desc="Assigning topics"):
        end_idx = min(start_idx + batch_size, total_df)
        batch_news_list = df.iloc[start_idx:end_idx]
        batch_news = " ".join([f"Item {i + 1}: {review}," for i, review in enumerate(batch_news_list['text'])])
        
        # Generate prompt for current batch
        prompt_assigning_prompt = f'''You are provided with news and helping to cluster them based on the topics.
Please assign the news to the topics provided. Return only the name of the topic for the respective nwes.
News can be found in tripletick block: ```{batch_news}```
Topics to choose from: {topics}
Please return in CSV format only topics for respective reviews and nothing else. Do not use triple backtick blocks. Only output exactly as on the example below:
Example: Having an input of News1, News2, News3, News4, News5, ... NewsN
Output: Topic1, Topic2, Topic3, Topic4, Topic5, ... TopicN
'''
        # Get assignments for current batch
        result = llm.invoke(prompt_assigning_prompt, temperature=0.0)
        try:
            # Get assignments for current batch
            result = llm.invoke(prompt_assigning_prompt, temperature=0.0)
            
            # Clean and split the result
            result = result.strip()
            if result.startswith('```') and result.endswith('```'):
                result = result[3:-3].strip()
            batch_assigned_topics = [topic.strip() for topic in result.split(',')]
            
            # Make sure we have the right number of topics
            current_batch_size = len(batch_news)
            
            # Update each row individually to avoid alignment issues
            for idx, topic in enumerate(batch_assigned_topics):
                current_idx = start_idx + idx
                if current_idx < len(df):
                    df.iloc[current_idx, df.columns.get_loc('Predicted Topic')] = topic
                    
        except Exception as e:
            print(f"\nError processing batch {start_idx}-{end_idx}: {str(e)}")
            print(f"Result received: {result}")
            print(f"Batch size: {current_batch_size}")
            print(f"Number of topics received: {len(batch_assigned_topics) if 'batch_assigned_topics' in locals() else 'N/A'}")
            continue
    return df

In [32]:
news_assigned = assign_topics_in_batches(
    news_test.head(50),
    topics=str(all_topics),
    batch_size=1,
    llm=llm
)

Assigning topics: 100%|██████████| 50/50 [03:24<00:00,  4.08s/it]


In [33]:
accuracy = (news_assigned['label'] == news_assigned['Predicted Topic']).mean()

print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 74.00%


In [23]:
news_assigned.to_csv('outputs/news_assigned.csv', index=False)

In [25]:
news_assigned[news_assigned['label'] != news_assigned['Predicted Topic']]

Unnamed: 0,text,label,Predicted Topic
2,"\nIn a word, yes.\n",alt.atheism,Please provide the news items so I can cluster...
4,\nI've just spent two solid months arguing tha...,talk.religion.misc,alt.atheism
6,"Dishonest money dwindles away, but he who gath...",soc.religion.christian,talk.religion.misc
7,A friend of mine managed to get a copy of a co...,soc.religion.christian,comp.sys.ibm.pc.hardware
15,"From article <C68uBG.K2w@world.std.com>, by cf...",comp.os.ms-windows.misc,comp.sys.ibm.pc.hardware
17,"Hello,\ni'm interested in those devices too.\n...",comp.graphics,misc.forsale
26,\nIt is said that CELP vocoders can run on the...,sci.crypt,sci.electronics
29,"\n\nI don't know about Canada, but I have hear...",comp.sys.ibm.pc.hardware,rec.motorcycles
38,I have a mac LCII 4/80 purchased last august.\...,misc.forsale,comp.sys.mac.hardware
41,"Yamanari,\n\n---Hey isn't it funny how betas h...",comp.os.ms-windows.misc,comp.windows.x
