In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval

In [2]:
import os
import glob

def get_csv_filenames(folder_path):
    # Get a list of all CSV files in the folder
    file_pattern = os.path.join(folder_path, "*.csv")
    csv_files = glob.glob(file_pattern)
    
    # Extract filenames from the file paths
    csv_filenames = [os.path.basename(file_path) for file_path in csv_files]
    
    return csv_filenames

In [419]:
files = get_csv_filenames('Scraped_news')

articles = pd.DataFrame()

for file in files:
    article = pd.read_csv(f'Scraped_news/{file}', encoding='utf-8',  converters={'Paragraphs': literal_eval})
    articles = pd.concat([articles, article], axis=0)

In [420]:
articles = articles.drop(columns='Unnamed: 0')

In [421]:
def remove_empty_strings(lst):
    return [item for item in lst if (item.strip() != '')]

articles['Paragraphs'] = articles['Paragraphs'].apply(remove_empty_strings)

In [422]:
articles = articles.drop_duplicates(subset=['Title'])

In [423]:
articles = articles.drop(article[article['Paragraphs'].apply(len) == 0].index)
articles = articles.reset_index(drop=True)

In [428]:
data = {"not_trash": articles.iloc[:, 0].tolist()}
with open ("trash.txt", "r") as f:
    trash = f.read().splitlines()
    data["trash"] = trash
data

{'not_trash': ["Biden unveils massive $7.3T budget with $5.5T in tax hikes, plans for 'highest burden' in US history",
  'Biden’s Budget Calls for Tax Increases on Corporations and the Wealthy',
  'Biden denounces Trump for $2tn tax cuts as he unveils budget plan',
  "Biden budget would cut taxes for millions and restore breaks for families. Here's what to know.",
  "Opinion | Our 'Capitalist,' 'Not Anticorporation' President in Action",
  'Biden proposes 2% federal pay raise in 2025 budget request',
  "Biden's budget and the limits of taxing rich Americans",
  'Biden’s annual budget hits populist economic themes as general election campaign kicks off',
  "Biden's budget proposal: Family tax breaks, cheaper health care, more",
  "The 7 biggest fights Biden's new budget picks with Republicans",
  "U.S. President Again Proposes Crypto Mining Tax, 'Wash Sale Rule' for Digital Assets in New Budget",
  "Biden's budget proposal for a second term offers tax breaks for families and lower healt

In [3]:
import spacy
import classy_classification

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
nlp = spacy.blank("en")
nlp.add_pipe(
    "classy_classification", 
    config={
        "data": data, 
        "model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        "device": "gpu"
    }
)

NameError: name 'data' is not defined

In [431]:
def clean_article(article):
    clean = []
    for sentence in article:
        doc = nlp(sentence)
        if doc._.cats["trash"] > .95:
            continue
        else:
            clean.append(sentence)
    return clean

In [432]:
articles['Paragraphs'] =  articles['Paragraphs'].apply(clean_article)

In [433]:
#articles.to_csv('cleaned_articles.csv')

In [434]:
sentences = []

for index, row in articles.iterrows():    
    sentences += row['Paragraphs']

In [445]:
import random

random.shuffle(sentences)

In [5]:
from transformers import pipeline

In [448]:
sentiment_analyzer = pipeline("sentiment-analysis", model="finiteautomata/bertweet-base-sentiment-analysis")

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


In [469]:
pos_neg_sentences = {
    'Sentence' : [],
    'Label' : [],
    'Score' : []
}

for sentence in sentences:
    try:
        if (len(sentence.split(' ')) <= 50) & (len(sentence.split(' ')) >= 6):
            sent_results = sentiment_analyzer(sentence)
            
            if len(sent_results) > 0 and sent_results[0]['label'] != 'NEU':
                pos_neg_sentences['Sentence'].append(sentence)
                pos_neg_sentences['Label'].append(sent_results[0]['label'])
                pos_neg_sentences['Score'].append(sent_results[0]['score'])
    except Exception as e:
        print(f"Error processing sentence: '{sentence}'")
        print(f"Error message: {e}")
    if (len(pos_neg_sentences['Sentence']) == 500):
        break

In [470]:
pos_neg_sentences_df = pd.DataFrame(pos_neg_sentences)
#pos_neg_sentences_df.to_csv('pos_neg_sentences.csv')

In [9]:
import nltk
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')

def extract_phrases(text):
    doc = nlp(text)
    phrases = set()

    # Extract noun chunks
    for chunk in doc.noun_chunks:
        phrases.add(chunk.text)
    
    # Extract verb phrases using dependency parsing
    for token in doc:
        if token.pos_ == 'VERB':
            verb_phrase = ' '.join([child.text for child in token.children if child.dep_ in {'aux', 'neg', 'advmod'}] + [token.text])
            phrases.add(verb_phrase)
    
    # Extract additional phrases using patterns
    matcher = Matcher(nlp.vocab)
    patterns = [
        [{"POS": "ADJ"}, {"POS": "NOUN"}],  # Adjective + Noun
        [{"POS": "NOUN"}, {"POS": "NOUN"}],  # Noun + Noun
        [{"POS": "ADV"}, {"POS": "VERB"}],   # Adverb + Verb
    ]
    matcher.add("PhrasePatterns", patterns)
    
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        phrases.add(span.text)

    return list(phrases)

In [536]:
phrases_words = {
    'phrases_words' : []
}

for index, row in pos_neg_sentences_df.iloc[0:].iterrows():
    phrases = extract_phrases(row['Sentence'])
    words = row['Sentence'].split(' ')

    for word in words:
        if (word not in phrases) and (word.lower() not in stop_words):
            phrases.append(word)
    phrases_words['phrases_words'].append(phrases)

In [537]:
phrases_words_df = pd.DataFrame(phrases_words)
pos_neg_sentences_df = pd.concat([pos_neg_sentences_df, phrases_words_df], axis=1)

In [539]:
#pos_neg_sentences_df.to_csv('sentences_and_phrases.csv')

In [8]:
pos_neg_sentences_df = pd.read_csv('pos_neg_sentences.csv')

In [18]:
from IPython.display import clear_output, display

pd.set_option('display.max_colwidth', 1000)

sentence_encodings = {
    'Sentence' : [],
    'Sentence Encodings' : []
}

for index, row in pos_neg_sentences_df.iloc[51:].iterrows():
    
    phrases = extract_phrases(row['Sentence'])
    words = row['Sentence'].split(' ')
    
    for word in words:
        if (word not in phrases) and (word.lower() not in stop_words):
            phrases.append(word)
    clear_output(wait=True)       
    print(row['Sentence'])
    print(row['Label'])
    word_importance = {}
    for word in phrases:
        print(word)
        answer = input()
        word_importance[word] = answer
    sentence_encodings['Sentence'].append(row['Sentence'])
    sentence_encodings['Sentence Encodings'].append(word_importance)
    sentence_encodings_df = pd.DataFrame(sentence_encodings)
    sentence_encodings_df.to_csv('sentence_encodings5.csv')

Using GPT-4o it is instantly clear how much faster it is than the earlier models, including GPT-3.5, which is much smaller and less capable. It can also analyze video content, which is something not previously possible in ChatGPT or any mainstream chatbot.
POS
also analyze


KeyboardInterrupt: Interrupted by user

In [524]:
pos_neg_sentences_df.loc[3]

Sentence    “Too many people denying, downplaying, rationalizing, ignoring horrors of the Holocaust and October 7, including Hamas’s appalling use of sexual violence, torture and terrorizing,” he said. “It’s absolutely despicable and must stop.”
Label                                                                                                                                                                                                                                             NEG
Score                                                                                                                                                                                                                                        0.977382
Name: 3, dtype: object

In [284]:
news = pd.read_csv('harris-running-mate.csv', encoding='utf-8',  converters={'Paragraphs': literal_eval})

In [285]:
def remove_empty_strings(lst):
    return [item for item in lst if (item.strip() != '')]

news['Paragraphs'] = news['Paragraphs'].apply(remove_empty_strings)

In [286]:
news = news.drop(news[news['Paragraphs'].apply(len) == 0].index)
news = news.reset_index(drop=True)

In [287]:
data = {"not_trash": news.iloc[:, 0].tolist()}
with open ("trash.txt", "r") as f:
    trash = f.read().splitlines()
    data["trash"] = trash
data

{'not_trash': ["Pennsylvania Gov. Josh Shapiro among 2 leading candidates to be Harris' running mate: ABC News",
  'Democratic VP contender Josh Shapiro made his name battling Trump in court as Pennsylvania AG',
  'Here Are the State Delegations That Have Endorsed Kamala Harris',
  'Pennsylvania voters react to Biden’s endorsement of Harris',
  'Tracking how many Democratic delegates are backing Kamala Harris',
  "How Kamala Harris' Christian faith differs from Donald Trump's Christian nationalism",
  'Who is Pennsylvania Gov. Josh Shapiro?',
  '‘Deeply personal decision’: Calls for Harris to pick Pa. Gov. Shapiro as running mate swell, as he stays silent',
  "Shapiro on potential VP slot: 'Not going to engage in hypotheticals'",
  'Pa. delegation to Democratic convention endorses Kamala Harris for president',
  'Biden’s hometown residents weigh in on president’s decision to exit 2024 race',
  'La Plata County residents react to Biden dropping out of presidential race',
  "CNN slammed 

In [291]:
news['Paragraphs'] =  news['Paragraphs'].apply(clean_article)

In [292]:
news

Unnamed: 0,Title,Link,Paragraphs
0,Pennsylvania Gov. Josh Shapiro among 2 leading...,https://news.google.com/articles/CBMiXWh0dHBzO...,[PHILADELPHIA (WPVI) -- Pennsylvania Governor ...
1,Democratic VP contender Josh Shapiro made his ...,https://news.google.com/articles/CBMigAFodHRwc...,[WASHINGTON — Former President Donald Trump co...
2,Here Are the State Delegations That Have Endor...,https://news.google.com/articles/CBMiZmh0dHBzO...,"[In a show of party unity, state convention de..."
3,Pennsylvania voters react to Biden’s endorseme...,https://news.google.com/articles/CBMiTmh0dHBzO...,"[Voters in Pennsylvania, a swing state, reacte..."
4,Tracking how many Democratic delegates are bac...,https://news.google.com/articles/CBMiV2h0dHBzO...,"[Harris needs 1,976 votes to secure the nomina..."
5,How Kamala Harris' Christian faith differs fro...,https://news.google.com/articles/CBMiZmh0dHBzO...,[A potential Trump v. Harris race puts two com...
6,Who is Pennsylvania Gov. Josh Shapiro?,https://news.google.com/articles/CBMijgFodHRwc...,[HARRISBURG — Pennsylvania Gov. Josh Shapiro h...
7,‘Deeply personal decision’: Calls for Harris t...,https://news.google.com/articles/CBMiOmh0dHBzO...,[Although Gov. Shapiro won’t answer whether he...
8,Shapiro on potential VP slot: 'Not going to en...,https://news.google.com/articles/CBMiQ2h0dHBzO...,[In his first public appearance since endorsin...
9,Pa. delegation to Democratic convention endors...,https://news.google.com/articles/CBMigAFodHRwc...,[HARRISBURG — Pennsylvania’s delegation to the...


In [293]:
'''
nlptown/bert-base-multilingual-uncased-sentiment

finiteautomata/bertweet-base-sentiment-analysis

distilbert-base-uncased-finetuned-sst-2-english
'''

'\nnlptown/bert-base-multilingual-uncased-sentiment\n\nfiniteautomata/bertweet-base-sentiment-analysis\n\ndistilbert-base-uncased-finetuned-sst-2-english\n'

In [294]:
from transformers import pipeline

In [295]:
sentiment_analyzer = pipeline("sentiment-analysis", model="finiteautomata/bertweet-base-sentiment-analysis")

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


In [12]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

def get_dynamic_threshold(score, factor_high, factor_medium, factor_low, factor_lowest):
        if score > 0.9:
            return score * factor_high
        elif score > 0.7:
            return score * factor_medium
        elif score > 0.6:
            return score * factor_low
        else:
            return score * factor_lowest

def generate_explanation(sentence, factor_high=0.01, factor_medium=0.1, factor_low=0.15, factor_lowest=0.2, stop_words=stop_words):
    sent_results = sentiment_analyzer(sentence)
    sent_label = sent_results[0]['label']
    sent_score = sent_results[0]['score']

    phrases = extract_phrases(sentence)
    words = sentence.split(' ')
    
    for word in words:
        if (word not in phrases) and (word.lower() not in stop_words):
            phrases.append(word)
    
    
    important = ''
    amount = 0
    print(sent_score)
    for phrase in phrases:
        new_sent = sentence
        new_sent = new_sent.replace(phrase, "")
        results = sentiment_analyzer(new_sent)
        new_score = results[0]['score']
        print(new_score, " ", phrase)
        if results[0]['label'] != sent_label:
            print("YEP")
            if ((sent_score > 0.6) & (new_score > 0.6)):
                amount += 1
                important += '"'
                important += phrase
                important += '," '
        elif (sent_score - new_score) > get_dynamic_threshold(sent_score, factor_high, factor_medium, factor_low, factor_lowest):
            print("Diff: ", (sent_score - new_score), " and threshold: ", get_dynamic_threshold(sent_score, factor_high, factor_medium, factor_low, factor_lowest))
            amount += 1
            important += '"'
            important += phrase
            important += '," '
                   
    important = important[:-3] + important[-3 + 1:]
    important = important[:-1]
    if amount >= 2:
        important = important.split(" ")
        important.insert(len(important) + -1, 'and')
        important = " ".join(important)
        sing_plur = 'words/phrases'
    else:
        sing_plur = 'word/phrase'


    label_mapping = {'NEU' : 'neutral', 'POS' : 'positive', 'NEG' : 'negative'}
    explanation = f"This sentence is {label_mapping[sent_label]} because it contains the {sing_plur} {important}."
    return explanation

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\17028\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\17028\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
article = news.loc[1][2]
for sentence in article:
    sent_results = sentiment_analyzer(sentence)
    sent_label = sent_results[0]['label']
    if sent_label == 'NEU':
        continue
    else:
        print(sentence)
        print(generate_explanation(sentence))
        print()

NameError: name 'news' is not defined

In [None]:
#Get multiple articles and convert them to sentences.
#For each sentence put through the sentiment analysis
#For only take the positive and negative ones
#Then for each of these sentences, create a vector of the words in the sentence
#For each word put a 1 if it contributes to the pos/neg rating and a 0 if not

#Use some model to change the parameters that are being used (thresholds)
#Try different functions (Dynamic threshold/exponential/linear/etc.) and see which one has the highest accuracy
#Measure it as a percentage of the words it got right rather than a completely right or completely wrong. Or try both!