<a href="https://colab.research.google.com/github/Veronikkkka/NLP_fake_news_classifier/blob/main/nlp_categories.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import re

def analyze_numbers_in_text(article_text):
    """
    Summarize how numbers are used in a text.

    A numeric token is a run of digits, optionally followed by a decimal
    point and more digits (e.g. ``2023`` or ``1.15``), delimited by word
    boundaries. Tokens are compared as strings when counting unique values.

    Parameters:
    - article_text (str): The text of the article to analyze.

    Returns:
    - dict: A dictionary containing:
        - 'number_count': How many numeric tokens were found.
        - 'unique_number_count': How many distinct numeric tokens were found.
        - 'average_number_length': Mean character length of the tokens
          (0 when the text contains no numbers).
        - 'number_score': 0.5 * number_count + 0.3 * unique_number_count
          + 0.2 * average_number_length.
    """
    found = re.findall(r'\b\d+(?:\.\d+)?\b', article_text)

    total = len(found)
    distinct = len(set(found))

    # Avoid dividing by zero when the text has no numeric tokens.
    if total > 0:
        mean_length = sum(map(len, found)) / total
    else:
        mean_length = 0

    score = 0.5 * total + 0.3 * distinct + 0.2 * mean_length

    return dict(
        number_count=total,
        unique_number_count=distinct,
        average_number_length=mean_length,
        number_score=score,
    )


In [12]:

# Demo: analyze a short sample paragraph containing five numeric tokens
# (1.2, 2023, 2020, 1.15, 234.5) and print the resulting metrics dict.
article = """
The population of the city is approximately 1.2 million as of 2023.
In 2020, it was 1.15 million. The area is 234.5 square kilometers.
"""
result = analyze_numbers_in_text(article)
print(result)

['1.2', '2023', '2020', '1.15', '234.5']
{'number_count': 5, 'unique_number_count': 5, 'average_number_length': 4.0, 'number_score': 4.8}


In [17]:
from enum import unique
import re

def analyze_hyperlinks_in_text(article_text):
    """
    Count the hyperlinks in a text and combine the counts into a weighted score.

    A hyperlink is any whitespace-free token that starts with ``http://``,
    ``https://`` or ``www.``. The match ends at the last word character of
    the token, so sentence punctuation directly after a URL is not included.

    Parameters:
    - article_text (str): The text of the article to analyze.

    Returns:
    - dict: A dictionary containing:
        - 'hyperlink_count': Total number of hyperlinks in the text.
        - 'unique_hyperlink_count': Count of distinct hyperlink strings.
        - 'hyperlink_score': Weighted sum
          0.7 * hyperlink_count + 0.3 * unique_hyperlink_count.
          The score is NOT normalized: it is 0 when the text has no links
          and grows without bound as the number of links increases.
    """
    hyperlink_pattern = r'\b(?:http[s]?://|www\.)\S+\b'
    hyperlinks = re.findall(hyperlink_pattern, article_text)

    hyperlink_count = len(hyperlinks)
    unique_hyperlink_count = len(set(hyperlinks))

    # Raw (unnormalized) weighted score; total links weigh more than unique ones.
    hyperlink_score = (
        0.7 * hyperlink_count +
        0.3 * unique_hyperlink_count
    )

    return {
        "hyperlink_count": hyperlink_count,
        "unique_hyperlink_count": unique_hyperlink_count,
        "hyperlink_score": hyperlink_score
    }


In [18]:

# Example usage: the sample text contains four distinct links, written with
# the https://, http:// and www. prefixes; the resulting metrics dict is printed.
article = """
This article includes some links like https://example.com and http://test.com.
You can also visit www.website.org for more information.
Another link is https://example.com/details.
"""
result = analyze_hyperlinks_in_text(article)
print(result)

{'hyperlink_count': 4, 'unique_hyperlink_count': 4, 'hyperlink_score': 4.0}


In [12]:
from textblob import TextBlob
from collections import Counter
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
import re

# Fetch the NLTK corpora this notebook relies on. "stopwords" is read by
# estimate_emotionality; "wordnet" backs the `wn` import (not used in the
# cells shown here — presumably needed elsewhere in the notebook).
for corpus_name in ("stopwords", "wordnet"):
    nltk.download(corpus_name)

def estimate_emotionality(article_text):
    """
    Estimate the emotionality of a news article.

    Combines TextBlob's sentiment estimate for the whole text with a count
    of words drawn from a small, fixed emotion lexicon.

    Parameters:
    - article_text (str): The text of the article to analyze.

    Returns:
    - dict: A dictionary containing:
        - 'sentiment_polarity': Sentiment polarity (-1 to 1).
        - 'sentiment_subjectivity': Sentiment subjectivity (0 to 1).
        - 'emotion_words_count': Number of non-stop-word tokens that appear
          in the emotion lexicon (repeated words are counted each time).
        - 'emotionality_score': Weighted sum
          0.4 * sentiment_polarity + 0.3 * sentiment_subjectivity
          + 0.3 * emotion_words_count.
          The score is NOT normalized: polarity enters with its sign, so
          negative sentiment lowers the score, and the word count is
          unbounded, so the score can be below 0 or above 1.
    """
    stop_words = set(stopwords.words("english"))

    # Lower-case each token once, then drop English stop words.
    words = [
        word
        for word in (token.lower() for token in re.findall(r'\b\w+\b', article_text))
        if word not in stop_words
    ]

    # Sentiment analysis using TextBlob (computed on the raw, unfiltered text).
    sentiment = TextBlob(article_text).sentiment
    sentiment_polarity = sentiment.polarity
    sentiment_subjectivity = sentiment.subjectivity

    emotion_words = {
        "happy", "joy", "sad", "anger", "fear", "love", "hate", "excited",
        "nervous", "calm", "anxious", "worried", "delighted", "cry", "laugh"
    }
    emotion_words_count = sum(1 for word in words if word in emotion_words)

    # Raw (unnormalized) weighted combination of the three signals.
    emotionality_score = (
        0.4 * sentiment_polarity +
        0.3 * sentiment_subjectivity +
        0.3 * emotion_words_count
    )

    return {
        "sentiment_polarity": sentiment_polarity,
        "sentiment_subjectivity": sentiment_subjectivity,
        "emotion_words_count": emotion_words_count,
        "emotionality_score": emotionality_score
    }



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
# Demo: score a short sample news snippet and print the resulting metrics
# dict. The words "anxious" and "love" in the sample are both in the
# emotion lexicon used by estimate_emotionality.
article = """
Breaking news: A massive earthquake has caused widespread devastation in the region.
Many are anxious about the aftermath, while others are expressing love and solidarity.
"""
result = estimate_emotionality(article)
print(result)

{'sentiment_polarity': 0.1875, 'sentiment_subjectivity': 0.775, 'emotion_words_count': 2, 'emotionality_score': 0.9075}
