In [19]:
import pandas as pd
import ollama
import asyncio

In [48]:
df_test = pd.read_csv("/Users/pratikhotchandani/Downloads/Github/This-week-in-football/Testing code/exploded_dataset.csv", na_filter=False)
df_test = df_test[df_test['Category'] != ""]

In [46]:
BATCH_SIZE = 100
models_list = ['llama3.1','llama3.2:1b','llama3.2:3b','phi3.5']


REDDIT_COMMENT_CLEANING_LABELS = ['Bot','Rules', 'Human-Conversation','N/A']
REDDIT_COMMENT_CLEANING_LABELS_STR = ",".join(REDDIT_COMMENT_CLEANING_LABELS)

PROMPT_COMMENT_CLEANING = {
    "name": "prompt_a",
    "content": (
        "You are a Reddit subreddit moderator whose task is to categorize comments into one of the following labels. "
        "You must output **exactly one** of these labels, whichever is the most likely. You cannot output anything other than one of these labels. "
        "If you're unsure, output **only N/A**.\n\n"
        f"Labels: {REDDIT_COMMENT_CLEANING_LABELS_STR}.\n\n"
        "Interpretation of some labels:\n\t"
        "Bot: A comment that appears to have been generated by a bot.\n\t"
        "Human-Conversation: A comment that reflects human input, offering opinions, reactions, or conversational responses with substance.\n\t"
        "N/A: Use this when a comment does not contribute insight to the conversation or context, such as short, vague statements or irrelevant replies.\n\n"
        "Output format: **one label**.\n\n"
        'Example 1: "I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/PremierLeague) if you have any questions or concerns." -> "Bot".\n\n'
        'Example 2: "After the game, you can now legitimately say that Company is doing a really good job so far." -> "Human-Conversation".\n\n'
        'Example 3: "The worst building I’ve seen with us for a long time. High and away." -> "Human-Conversation".\n\n'
        'Example 4: "No comments." -> "N/A".\n\n'
        'Example 5: "[deleted]" -> "N/A".\n\n'
        "Please output only **one label** from the list: `Bot`, `Human-Conversation`, or `N/A`. No other text should be included."
        "\n\nThe comment text to categorize is provided inside ```.\n\n"
    ),
}

In [47]:
PROMPT_COMMENT_CLEANING_2 = {
    "name": "prompt_a",
    "content": (
        "You are a Reddit subreddit moderator whose task is to categorize comments into one of the following labels. "
        "You must output **exactly one** of these labels, whichever is the most likely. You cannot output anything other than one of these labels. "
        "If you're unsure, output **only N/A**.\n\n"
        f"Labels: {REDDIT_COMMENT_CLEANING_LABELS_STR}.\n\n"
        "Interpretation of some labels:\n\t"
        "Bot: A comment that appears to have been generated by a bot.\n\t"
        "Rules: A comment that enforces subreddit rules or only references guidelines like rules, posting policies, or moderation.\n\t"
        "Human-Conversation: A comment that reflects human input, offering opinions, reactions, or conversational responses with substance.\n\t"
        "N/A: Use this when a comment does not contribute insight to the conversation or context, such as short, vague statements or irrelevant replies.\n\n"
        "Output format: **one label**.\n\n"
        'Example 1: "I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/PremierLeague) if you have any questions or concerns." -> "Bot".\n\n'
        'Example 2: "Fellow fans, this is a friendly reminder to please follow the [Rules](https://www.reddit.com/r/premierleague/about/rules) and [Reddiquette](https://support.reddithelp.com/hc/en-us/articles/205926439-Reddiquette). Please also make sure to [Join us on Discord](https://discord.gg/football)." -> "Rules".\n\n'
        'Example 3: "After the game, you can now legitimately say that Company is doing a really good job so far." -> "Human-Conversation".\n\n'
        'Example 4: "The worst building I’ve seen with us for a long time. High and away." -> "Human-Conversation".\n\n'
        'Example 5: "No comments." -> "N/A".\n\n'
        "Please output only **one label** from the list: `Bot`, `Rules`, `Human-Conversation`, or `N/A`. No other text should be included."
        "\n\nThe comment text to categorize is provided inside ```.\n\n"
    ),
}


In [27]:
async def classify_comments_for_cleaning(prompt, models, df):
    try:
        # Ensure prompt is a non-empty string
        if not isinstance(prompt["content"], str) or not prompt["content"].strip():
            raise ValueError("Prompt must be a non-empty string.")

        base_string = prompt["content"]

        # Iterate over the list of models
        for model in models:
            responses = []

            # Iterate over DataFrame rows in batches
            for i in range(0, len(df), BATCH_SIZE):
                print(f"Processing comments from row: {i} with model: {model} ...")

                for row in df[i:i+BATCH_SIZE].itertuples(index=True):
                    prompt_text = base_string + f"comment: '''{row.comments}'''"
                    print("Final prompt is: ", prompt_text)
                    print("Generating response for model .... ")

                    # Send the custom prompt to the model
                    response = ollama.generate(
                        model=model,
                        prompt=prompt_text
                    )

                    # Extract the 'response' from the model output and store it
                    responses.append(response.get('response', ''))

            # Add the model responses as a new column in the original DataFrame
            # Column name is based on the model
            df[f"{model}_response"] = pd.Series(responses)

        return df

    except ValueError as ve:
        return f"Input Error: {str(ve)}"
    except KeyError as ke:
        return f"Response Error: {str(ke)}"
    except Exception as e:
        return f"Unexpected Error: {str(e)}"


In [32]:
inferenced_df = await classify_comments_for_cleaning(PROMPT_COMMENT_CLEANING_2, models_list, df_test)

Processing comments from row: 0 with model: llama3.1 ...
Final prompt is:  You are a Reddit subreddit moderator whose task is to categorize comments into one of the following labels. You must output **exactly one** of these labels, whichever is the most likely. You cannot output anything other than one of these labels. If you're unsure, output **only N/A**.

Labels: Bot,Human-Conversation,N/A.

Interpretation of some labels:
	Bot: A comment that appears to have been generated by a bot.
	Rules: A comment that enforces subreddit rules or only references guidelines like rules, posting policies, or moderation.
	Human-Conversation: A comment that reflects human input, offering opinions, reactions, or conversational responses with substance.
	N/A: Use this when a comment does not contribute insight to the conversation or context, such as short, vague statements or irrelevant replies.

Output format: **one label**.

Example 1: "I am a bot, and this action was performed automatically. Please [

In [33]:
inferenced_df

Unnamed: 0.1,Unnamed: 0,subreddit,submission_date,submission_id,submission_type,submission_url,submission_title,no_of_upvotes,comments,Category,llama3.1_response,llama3.2:1b_response,llama3.2:3b_response,phi3.5_response
7,1560,Bundesliga,9/27/24,1fqjnh8,Link,/r/Bundesliga/comments/1fqjnh8/jens_lehmann_be...,Jens Lehmann bekennt sich schuldig im Kettensä...,132,![gif](giphy|3o7aTuy3b4TwuUSUzm),,,```Bot```,N/A.,Removed
8,1545,Bundesliga,9/27/24,1fqmw0f,Link,/r/Bundesliga/comments/1fqmw0f/schneller_und_s...,Schneller und stressfreier Einlass ins Stadion...,87,![gif](giphy|IeKgCDlpTqRQbZEhBF),,Bot,Bot,**Bot**,Bot
29,36,soccer,9/29/24,1fs7z6m,Link,/r/soccer/comments/1fs7z6m/bruno_fernandes_str...,Bruno Fernandes straight red card against Tott...,5829,[deleted],,**Human-Conversation**,Bot,**Human-Conversation**,Human-Conversation
30,511,football,9/23/24,1fnoem0,Image,/r/football/comments/1fnoem0/interesting_stat_...,Interesting stat on time wasting from goal kicks,19,[deleted],,**Human-Conversation**,Human-Conversation,**Human-Conversation**,Human-Conversation
31,330,football,9/28/24,1friiuz,Link,/r/football/comments/1friiuz/optajoe_cole_palm...,[OptaJoe] Cole Palmer is the first player in P...,215,[deleted],,Human-Conversation,`Bot`,**Human-Conversation**,Human-Conversation
40,358,football,9/27/24,1fqhkvp,Link,/r/football/comments/1fqhkvp/graham_potter_int...,Graham Potter interview: Chelsea was the perfe...,130,[removed],,,,,
41,1455,Bundesliga,9/28/24,1frhcaw,Text,/r/Bundesliga/comments/1frhcaw/tatsachenentsch...,Tatsachenentscheidung gegen nachträgliche Sperren,23,[removed],,,,,
46,1905,Bundesliga,9/23/24,1fnpnh5,Link,/r/Bundesliga/comments/1fnpnh5/lage_der_liga_d...,Lage der Liga - Der FKM-Jahresbericht - Fussba...,0,[removed],,,,,
51,0,soccer,9/29/24,1fse3w2,Link,/r/soccer/comments/1fse3w2/diego_simeone_and_h...,Diego Simeone and his players urging Atletico ...,6587,**Mirrors / Alternative Angles**\n \n\n*I am ...,Bot,,,,
52,10,soccer,9/29/24,1fsewe3,Link,/r/soccer/comments/1fsewe3/atletico_madrid_1_1...,Atletico Madrid [1] - 1 Real Madrid - Angel Co...,2687,**Mirrors / Alternative Angles**\n \n\n*I am ...,Bot,,,,


In [49]:
async def classify_comments_for_cleaning_2(prompt, model, df):
    try:
        # Ensure prompt is a non-empty string
        if not isinstance(prompt["content"], str) or not prompt["content"].strip():
            raise ValueError("Prompt must be a non-empty string.")

        base_string = prompt["content"]
        responses = []

        # Iterate over DataFrame rows in batches
        for i in range(0, len(df), BATCH_SIZE):
            print(f"Processing comments from row: {i} ...")
            
            for row in df[i:i+BATCH_SIZE].itertuples(index=True):
                prompt = base_string + f"comment: '''{row.comments}'''"
                print("Final prompt is: ", prompt)
                print("Generating llama response .... ")

                # Send the custom prompt to the LLaMA 3.1 model
                response = ollama.generate(
                    model=model,
                    prompt=prompt
                )

                # Extract the 'response' from the LLaMA output and store it
                responses.append(response.get('response', ''))

        # Add the LLaMA responses as a new column in the original DataFrame
        df['llama_response'] = pd.Series(responses)

        return df

    except ValueError as ve:
        return f"Input Error: {str(ve)}"
    except KeyError as ke:
        return f"Response Error: {str(ke)}"
    except Exception as e:
        return f"Unexpected Error: {str(e)}"


In [50]:
response = await classify_comments_for_cleaning_2(PROMPT_COMMENT_CLEANING_2, 'phi3.5', df_test)

Processing comments from row: 0 ...
Final prompt is:  You are a Reddit subreddit moderator whose task is to categorize comments into one of the following labels. You must output **exactly one** of these labels, whichever is the most likely. You cannot output anything other than one of these labels. If you're unsure, output **only N/A**.

Labels: Bot,Rules,Human-Conversation,N/A.

Interpretation of some labels:
	Bot: A comment that appears to have been generated by a bot.
	Rules: A comment that enforces subreddit rules or only references guidelines like rules, posting policies, or moderation.
	Human-Conversation: A comment that reflects human input, offering opinions, reactions, or conversational responses with substance.
	N/A: Use this when a comment does not contribute insight to the conversation or context, such as short, vague statements or irrelevant replies.

Output format: **one label**.

Example 1: "I am a bot, and this action was performed automatically. Please [contact the mod

In [51]:
response

Unnamed: 0.1,Unnamed: 0,subreddit,submission_date,submission_id,submission_type,submission_url,submission_title,no_of_upvotes,comments,Category,llama_response
7,1560,Bundesliga,9/27/24,1fqjnh8,Link,/r/Bundesliga/comments/1fqjnh8/jens_lehmann_be...,Jens Lehmann bekennt sich schuldig im Kettensä...,132,![gif](giphy|3o7aTuy3b4TwuUSUzm),,
8,1545,Bundesliga,9/27/24,1fqmw0f,Link,/r/Bundesliga/comments/1fqmw0f/schneller_und_s...,Schneller und stressfreier Einlass ins Stadion...,87,![gif](giphy|IeKgCDlpTqRQbZEhBF),,Bot
29,36,soccer,9/29/24,1fs7z6m,Link,/r/soccer/comments/1fs7z6m/bruno_fernandes_str...,Bruno Fernandes straight red card against Tott...,5829,[deleted],,Human-Conversation
30,511,football,9/23/24,1fnoem0,Image,/r/football/comments/1fnoem0/interesting_stat_...,Interesting stat on time wasting from goal kicks,19,[deleted],,Human-Conversation
31,330,football,9/28/24,1friiuz,Link,/r/football/comments/1friiuz/optajoe_cole_palm...,[OptaJoe] Cole Palmer is the first player in P...,215,[deleted],,Human-Conversation
40,358,football,9/27/24,1fqhkvp,Link,/r/football/comments/1fqhkvp/graham_potter_int...,Graham Potter interview: Chelsea was the perfe...,130,[removed],,
41,1455,Bundesliga,9/28/24,1frhcaw,Text,/r/Bundesliga/comments/1frhcaw/tatsachenentsch...,Tatsachenentscheidung gegen nachträgliche Sperren,23,[removed],,
46,1905,Bundesliga,9/23/24,1fnpnh5,Link,/r/Bundesliga/comments/1fnpnh5/lage_der_liga_d...,Lage der Liga - Der FKM-Jahresbericht - Fussba...,0,[removed],,
51,0,soccer,9/29/24,1fse3w2,Link,/r/soccer/comments/1fse3w2/diego_simeone_and_h...,Diego Simeone and his players urging Atletico ...,6587,**Mirrors / Alternative Angles**\n \n\n*I am ...,Bot,
52,10,soccer,9/29/24,1fsewe3,Link,/r/soccer/comments/1fsewe3/atletico_madrid_1_1...,Atletico Madrid [1] - 1 Real Madrid - Angel Co...,2687,**Mirrors / Alternative Angles**\n \n\n*I am ...,Bot,
