In [35]:
# Load DF
import pandas as pd
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df = pd.read_csv('/Users/vinayakkannan/Desktop/Projects/FactChecker/FactChecker/Testing/Test Output/analysis/actual-Table 1.csv')
# Keep only the rows of original cluster 64 whose sub-cluster label is not -1.
# .copy() makes df an independent frame, so the later column assignment
# df.loc[mask, 'summary'] = ... does not raise SettingWithCopyWarning.
df = df[(df['Orig Cluster'] == 64) & (df['cluster'] != -1)].copy()

In [36]:
# Show the filtered frame (original cluster 64, sub-cluster != -1) as the cell output
df

Unnamed: 0.1,Unnamed: 0,text,veracity,predict,predicted_veracity,cluster,num_correct_in_cluster,total_in_cluster,cluster_accuracy,Correct Pred,Cluster count 50,Diff,Orig Cluster
3729,752,"In 2022, reported emissions from large industr...",3.0,True,3.0,78,8,8,1.0,True,120,112,64
3730,1362,"Between 1990 and 2023, annual emissions of SO2...",3.0,False,3.0,78,8,8,1.0,True,120,112,64
3731,1522,"Compared to 2022, the 2023 data show a 15% dec...",3.0,False,3.0,78,8,8,1.0,True,120,112,64
3732,2423,"Nationwide, emissions for 2023 show the most s...",3.0,False,3.0,78,8,8,1.0,True,120,112,64
3733,2428,"Net US greenhouse gas emissions were 5,489 mil...",3.0,False,3.0,78,8,8,1.0,True,120,112,64
3734,2627,Reported power plant emissions decreased by 0....,3.0,False,3.0,78,8,8,1.0,True,120,112,64
3735,3328,There is a 28.7% decrease in emissions since 2...,3.0,False,3.0,78,8,8,1.0,True,120,112,64
3736,3640,"While complying with programs to reduce SO2, N...",3.0,False,3.0,78,8,8,1.0,True,120,112,64
3737,379,According to the Paris-based International Ene...,1.0,True,1.0,79,27,27,1.0,True,120,93,64
3738,389,This suggests a second threshold question: whe...,1.0,True,1.0,79,27,27,1.0,True,120,93,64


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tfidf(cluster, column='cluster', data=None):
    """Rank the vocabulary of one cluster by summed TF-IDF weight.

    Args:
        cluster: Label of the cluster; rows where ``data[column] == cluster``
            supply the documents.
        column: Name of the column holding the cluster labels. Defaults to
            ``'cluster'``, which is what the original hard-coded.
        data: DataFrame containing ``column`` and a ``'text'`` column. When
            omitted, the notebook-level ``df`` is looked up at call time.

    Returns:
        dict mapping each term to the sum of its TF-IDF weights across the
        cluster's texts, ordered from the highest total to the lowest.
    """
    frame = df if data is None else data
    # Texts belonging to the requested cluster
    texts = frame.loc[frame[column] == cluster, 'text']
    # Vectorizer drops English stop words and lowercases every token
    vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
    weights = vectorizer.fit_transform(texts)
    terms = vectorizer.get_feature_names_out()
    # One total per term: sum the weights over all documents (rows)
    totals = weights.toarray().sum(axis=0)
    # Highest total first; dict preserves this insertion order
    ranked = sorted(zip(terms, totals), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

# Map every distinct sub-cluster label to its ranked term -> TF-IDF dictionary
tfidf_dict = {label: get_tfidf(label) for label in df['cluster'].unique()}

In [38]:
# Build a frame with one row per claim: the claim text, the cluster it belongs
# to, and that cluster's top 10 TF-IDF words and scores.
# Rows are collected in a list and the DataFrame is constructed once, instead of
# growing it one row at a time with top_tfidf.loc[len(top_tfidf)] = ...
top_tfidf_rows = []
for cluster in df['cluster'].unique():
    tfidf = tfidf_dict[cluster]
    # tfidf is already ordered by descending score, so the first 10 are the top 10
    top_words = list(tfidf.keys())[:10]
    top_scores = list(tfidf.values())[:10]
    claims = df[df['cluster'] == cluster]['text']
    top_tfidf_rows.extend([claim, cluster, top_words, top_scores] for claim in claims)

top_tfidf = pd.DataFrame(top_tfidf_rows, columns=['Claim', 'Cluster', 'Top 10 Words', 'Top 10 Scores'])

In [39]:
import os
from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables via python-dotenv before reading the key
load_dotenv()

# The OpenAI key is read from the OPEN_AI_KEY environment variable
api_key = os.environ.get("OPEN_AI_KEY")
client = OpenAI(api_key=api_key)

# Descriptions generated so far; each new prompt lists them so the model can
# steer away from topics that were already used.
results = []

# df was produced by boolean filtering; take an independent copy so that the
# df.loc[...] column assignment below does not raise SettingWithCopyWarning.
df = df.copy()

# Iterate over each cluster
for cluster in df['cluster'].unique():
    # Get the top 10 tfidf words for the cluster
    tfidf = tfidf_dict[cluster]
    top_words = list(tfidf.keys())[:10]

    # Use OpenAI to generate a summary for the cluster.
    # The prompt now labels the list as TF-IDF terms: it previously introduced
    # them as "claims" while also asking the model to "use the TF-IDF terms".
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": f""" Write a one sentence description for a topic cluster of claims that is as 
                    different as possible from other topic clusters. All topic clusters are related to climate 
                    change. Your topic cluster name should be as different from the other topic clusters which are
                    listed below:
                    {", ".join(results)}
                    
                    The cluster's top TF-IDF terms (up to 10) are:
                    {", ".join(top_words)}
                    
                    Your one sentence description should be at most 10 words and use the TF-IDF terms. Do not write more than 10 words.
                    """
            },
        ]
    ).choices[0].message.content

    results.append(response)

    # Print the summary
    print(f"Summary for cluster {cluster}: {response}")

    # Add the summary to every row of this cluster
    df.loc[df['cluster'] == cluster, 'summary'] = response

# output_df is a second name for the same frame; it keeps the annotated rows
# reachable after df is reloaded from the CSV later in the notebook.
output_df = df

Summary for cluster 78: Air pollutants and their impact on environmental health.
Summary for cluster 79: Global carbon emissions trends and impacts on the environment.
Summary for cluster 80: Impact of countries' emissions cuts under the Kyoto Protocol.
Summary for cluster 81: European Union's Emissions Trading System and national policies.


In [40]:
# Show df again; it now carries the generated 'summary' column
df

Unnamed: 0.1,Unnamed: 0,text,veracity,predict,predicted_veracity,cluster,num_correct_in_cluster,total_in_cluster,cluster_accuracy,Correct Pred,Cluster count 50,Diff,Orig Cluster,summary
3729,752,"In 2022, reported emissions from large industr...",3.0,True,3.0,78,8,8,1.0,True,120,112,64,Air pollutants and their impact on environment...
3730,1362,"Between 1990 and 2023, annual emissions of SO2...",3.0,False,3.0,78,8,8,1.0,True,120,112,64,Air pollutants and their impact on environment...
3731,1522,"Compared to 2022, the 2023 data show a 15% dec...",3.0,False,3.0,78,8,8,1.0,True,120,112,64,Air pollutants and their impact on environment...
3732,2423,"Nationwide, emissions for 2023 show the most s...",3.0,False,3.0,78,8,8,1.0,True,120,112,64,Air pollutants and their impact on environment...
3733,2428,"Net US greenhouse gas emissions were 5,489 mil...",3.0,False,3.0,78,8,8,1.0,True,120,112,64,Air pollutants and their impact on environment...
3734,2627,Reported power plant emissions decreased by 0....,3.0,False,3.0,78,8,8,1.0,True,120,112,64,Air pollutants and their impact on environment...
3735,3328,There is a 28.7% decrease in emissions since 2...,3.0,False,3.0,78,8,8,1.0,True,120,112,64,Air pollutants and their impact on environment...
3736,3640,"While complying with programs to reduce SO2, N...",3.0,False,3.0,78,8,8,1.0,True,120,112,64,Air pollutants and their impact on environment...
3737,379,According to the Paris-based International Ene...,1.0,True,1.0,79,27,27,1.0,True,120,93,64,Global carbon emissions trends and impacts on ...
3738,389,This suggests a second threshold question: whe...,1.0,True,1.0,79,27,27,1.0,True,120,93,64,Global carbon emissions trends and impacts on ...


In [41]:
 # Join all the claims into one string
# Concatenate the text of every claim in df, separated by ", "
claims = ", ".join(df['text'])

# Ask the model for a single description covering all of the claims at once.
# NOTE(review): the prompt text is reproduced unchanged, typos included, so the
# model input stays identical.
completion = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": f""" Write a one sentence description for a topic cluster of claims. All topic clusters are related to climate  change. 
                
                The claims are as folows: {claims}
                
                Your one sentence description should be at most 10 words. Do not write more than 10 words. Make the description as specfiic as posssible to the claims.
                """
        },
    ],
    model="gpt-3.5-turbo",
)
response = completion.choices[0].message.content

print(response)

Global emissions reductions and changes by year from 1990-2023.


In [42]:
# Start again from the full CSV and drop the rows whose original cluster is -1
df = (
    pd.read_csv('/Users/vinayakkannan/Desktop/Projects/FactChecker/FactChecker/Testing/Test Output/analysis/actual-Table 1.csv')
    .loc[lambda frame: frame['Orig Cluster'] != -1]
)

def get_tfidf(cluster, column='Orig Cluster', data=None):
    """Rank the vocabulary of one original cluster by summed TF-IDF weight.

    This redefines the ``get_tfidf`` declared earlier in the notebook; the
    only difference is that rows are grouped by ``'Orig Cluster'`` by default.

    Args:
        cluster: Label of the cluster; rows where ``data[column] == cluster``
            supply the documents.
        column: Name of the column holding the cluster labels. Defaults to
            ``'Orig Cluster'``, which is what the original hard-coded.
        data: DataFrame containing ``column`` and a ``'text'`` column. When
            omitted, the notebook-level ``df`` is looked up at call time.

    Returns:
        dict mapping each term to the sum of its TF-IDF weights across the
        cluster's texts, ordered from the highest total to the lowest.
    """
    source = df if data is None else data
    # Documents of the requested cluster
    documents = source.loc[source[column] == cluster, 'text']
    # Vectorizer drops English stop words and lowercases every token
    vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
    matrix = vectorizer.fit_transform(documents)
    vocabulary = vectorizer.get_feature_names_out()
    # Sum over documents (rows) to get one total per term
    term_totals = matrix.toarray().sum(axis=0)
    # Order the (term, total) pairs from largest to smallest total
    by_total = sorted(zip(vocabulary, term_totals), key=lambda entry: entry[1], reverse=True)
    return dict(by_total)

# Get the ranked TF-IDF dictionary for each original cluster
tfidf_dict = {label: get_tfidf(label) for label in df['Orig Cluster'].unique()}

# Build a frame with one row per claim: the claim text, the original cluster it
# belongs to, and that cluster's top 10 TF-IDF words and scores.
# Rows are collected in a list and the DataFrame is constructed once, instead of
# growing it one row at a time with top_tfidf.loc[len(top_tfidf)] = ...
top_tfidf_rows = []
for cluster in df['Orig Cluster'].unique():
    tfidf = tfidf_dict[cluster]
    # tfidf is already ordered by descending score, so the first 10 are the top 10
    top_words = list(tfidf.keys())[:10]
    top_scores = list(tfidf.values())[:10]
    claims = df[df['Orig Cluster'] == cluster]['text']
    top_tfidf_rows.extend([claim, cluster, top_words, top_scores] for claim in claims)

top_tfidf = pd.DataFrame(top_tfidf_rows, columns=['Claim', 'Cluster', 'Top 10 Words', 'Top 10 Scores'])

In [43]:
# Filter df to the rows whose 'Orig Cluster' equals 64.
# .copy() gives an independent frame so the column assignment in the loop does
# not raise SettingWithCopyWarning.
df = df[df['Orig Cluster'] == 64].copy()
output = ""

# NOTE(review): `results` still holds the sub-cluster descriptions generated
# earlier in the notebook, so they appear in the prompt as the "other topic
# clusters". Confirm this is intended.
for cluster in df['Orig Cluster'].unique():
    # Get the top 10 tfidf words for the cluster
    tfidf = tfidf_dict[cluster]
    top_words = list(tfidf.keys())[:10]

    # Use OpenAI to generate a summary for the cluster.
    # The prompt now labels the list as TF-IDF terms: it previously introduced
    # them as "claims" while also asking the model to "use the TF-IDF terms".
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": f""" Write a one sentence description for a topic cluster of claims that is as 
                    different as possible from other topic clusters. All topic clusters are related to climate 
                    change. Your topic cluster name should be as different from the other topic clusters which are
                    listed below:
                    {", ".join(results)}
                    
                    The cluster's top TF-IDF terms (up to 10) are:
                    {", ".join(top_words)}
                    
                    Your one sentence description should be at most 10 words and use the TF-IDF terms. Do not write more than 10 words.
                    """
            },
        ]
    ).choices[0].message.content

    results.append(response)

    # Print the summary
    print(f"Summary for cluster {cluster}: {response}")

    # Remember the latest description for the following cell
    output = response

    # Add the summary to the df. `cluster` is an 'Orig Cluster' value in this
    # loop, so the row mask must use that column; masking on 'cluster' compared
    # it against the sub-cluster labels instead.
    df.loc[df['Orig Cluster'] == cluster, 'summary'] = response

Summary for cluster 64: Global greenhouse gas emissions trends and impact on countries.


In [44]:
# Store the description generated for original cluster 64 in the
# 'Orig Cluster description' column of every matching output_df row
in_orig_cluster_64 = output_df['Orig Cluster'] == 64
output_df.loc[in_orig_cluster_64, 'Orig Cluster description'] = output

In [45]:
# Show output_df, which now includes the 'Orig Cluster description' column
output_df

Unnamed: 0.1,Unnamed: 0,text,veracity,predict,predicted_veracity,cluster,num_correct_in_cluster,total_in_cluster,cluster_accuracy,Correct Pred,Cluster count 50,Diff,Orig Cluster,summary,Orig Cluster description
3729,752,"In 2022, reported emissions from large industr...",3.0,True,3.0,78,8,8,1.0,True,120,112,64,Air pollutants and their impact on environment...,Global greenhouse gas emissions trends and imp...
3730,1362,"Between 1990 and 2023, annual emissions of SO2...",3.0,False,3.0,78,8,8,1.0,True,120,112,64,Air pollutants and their impact on environment...,Global greenhouse gas emissions trends and imp...
3731,1522,"Compared to 2022, the 2023 data show a 15% dec...",3.0,False,3.0,78,8,8,1.0,True,120,112,64,Air pollutants and their impact on environment...,Global greenhouse gas emissions trends and imp...
3732,2423,"Nationwide, emissions for 2023 show the most s...",3.0,False,3.0,78,8,8,1.0,True,120,112,64,Air pollutants and their impact on environment...,Global greenhouse gas emissions trends and imp...
3733,2428,"Net US greenhouse gas emissions were 5,489 mil...",3.0,False,3.0,78,8,8,1.0,True,120,112,64,Air pollutants and their impact on environment...,Global greenhouse gas emissions trends and imp...
3734,2627,Reported power plant emissions decreased by 0....,3.0,False,3.0,78,8,8,1.0,True,120,112,64,Air pollutants and their impact on environment...,Global greenhouse gas emissions trends and imp...
3735,3328,There is a 28.7% decrease in emissions since 2...,3.0,False,3.0,78,8,8,1.0,True,120,112,64,Air pollutants and their impact on environment...,Global greenhouse gas emissions trends and imp...
3736,3640,"While complying with programs to reduce SO2, N...",3.0,False,3.0,78,8,8,1.0,True,120,112,64,Air pollutants and their impact on environment...,Global greenhouse gas emissions trends and imp...
3737,379,According to the Paris-based International Ene...,1.0,True,1.0,79,27,27,1.0,True,120,93,64,Global carbon emissions trends and impacts on ...,Global greenhouse gas emissions trends and imp...
3738,389,This suggests a second threshold question: whe...,1.0,True,1.0,79,27,27,1.0,True,120,93,64,Global carbon emissions trends and impacts on ...,Global greenhouse gas emissions trends and imp...
