In [26]:
import pandas as pd
from urllib.parse import urlparse
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob
import numpy as np

In [27]:
# Read the input CSV into the working DataFrame `data`.
dataset_path = 'kitwe_final_Clean.csv'
data = pd.read_csv(dataset_path)

In [28]:
# Step 1: Source Credibility Analysis (updated with Zambian and Kitwe sources)
# International outlets treated as reputable.
# The Guardian publishes at theguardian.com, so that host is listed explicitly
# next to the original 'guardian.com' entry.
reputable_sources = [
    'bbc.com', 'reuters.com', 'nytimes.com', 'cnn.com', 'guardian.com', 'theguardian.com',
    'npr.org', 'forbes.com', 'bloomberg.com', 'washingtonpost.com', 'thetimes.co.uk',
    'economist.com', 'wsj.com', 'cnbc.com'
]

# Including Zambian and Kitwe reputable sources
zambian_reputable_sources = reputable_sources + [
    'daily-mail.co.zm', 'times.co.zm', 'znbc.co.zm', 'flavaradioandtv.com', 'lusakatimes.com', 'kitwetimes.com'
]

# Matches a host name that ends in one of the listed top-level suffixes.
# NOTE(review): 'lo' does not look like a delegated TLD - possibly a typo; confirm the intended suffix.
suspicious_domain_pattern = re.compile(r'\.(info|lo|ru|cn|xyz|top|news|live|buzz|click|online)$')

In [29]:
def updated_check_source_credibility(url):
    """Classify the host of ``url`` as 'reputable', 'suspicious' or 'unknown'.

    The host is 'reputable' when it equals an entry of
    ``zambian_reputable_sources`` or is a sub-domain of one (``www.bbc.com``
    matches ``bbc.com``; ``bbc.com.evil.ru`` and ``notbbc.com`` do not).
    Otherwise it is 'suspicious' when ``suspicious_domain_pattern`` matches
    the host, and 'unknown' in every other case.

    Args:
        url: Article URL. Non-string values (e.g. the NaN pandas uses for a
            missing cell) and unparsable URLs yield 'unknown'.

    Returns:
        One of the strings 'reputable', 'suspicious', 'unknown'.
    """
    if not isinstance(url, str):
        return 'unknown'

    try:
        # .hostname is lower-cased and excludes user-info and port, so the
        # '$'-anchored TLD pattern still works for hosts like 'site.xyz:8080'.
        domain = urlparse(url).hostname or ''
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. a broken IPv6 literal).
        return 'unknown'

    # Compare on a domain-label boundary rather than by substring.
    if any(domain == source or domain.endswith('.' + source)
           for source in zambian_reputable_sources):
        return 'reputable'
    if suspicious_domain_pattern.search(domain):
        return 'suspicious'
    return 'unknown'

# One credibility label per row, computed from the URL in 'Link'.
link_labels = data['Link'].apply(updated_check_source_credibility)
data['source_credibility'] = link_labels


In [30]:
# Step 2: Clickbait Analysis
def detect_clickbait(headline):
    """Classify a headline as 'clickbait' or 'not_clickbait'.

    A headline is clickbait when at least one of these holds:
      * it contains a run of two or more '!', '?' or '.' characters;
      * ``str.isupper()`` is true for the whole headline;
      * it contains one of the provocative words or phrases.

    Args:
        headline: Headline text. Non-string values (e.g. the NaN pandas uses
            for a missing cell) are reported as 'not_clickbait'.

    Returns:
        The string 'clickbait' or 'not_clickbait'.
    """
    if not isinstance(headline, str):
        return 'not_clickbait'

    excessive_punctuation = re.search(r'[!?.]{2,}', headline) is not None
    all_caps = headline.isupper()

    # Map the plain apostrophe to the typographic one used in the phrase list,
    # so both "won't" and "won’t" are recognised.
    text = headline.lower().replace("'", '’')
    # Drop 'secretary' / 'secretariat' style words: they contain 'secret' as a
    # substring but are ordinary job titles, not provocative wording.
    text = re.sub(r'secretar\w*', ' ', text)
    provocative_words = any(word in text for word in ['shocking', 'unbelievable', 'you won’t believe', 'secret', 'amazing', 'incredible'])

    if excessive_punctuation or all_caps or provocative_words:
        return 'clickbait'
    return 'not_clickbait'

# Label each headline as 'clickbait' / 'not_clickbait'.
headline_labels = data['Headlines'].apply(detect_clickbait)
data['headline_type'] = headline_labels


In [31]:
# Step 3: Sensational Keyword Frequency Analysis
sensational_keywords = ['shocking', 'unbelievable', 'amazing', 'incredible', 'secret', 'exposed', 'you won’t believe', 'scandal', 'controversy']


def count_sensational_keywords(description):
    """Count occurrences of ``sensational_keywords`` in ``description``.

    Matching is case-insensitive and by substring, summed over all keywords.
    Two corrections are applied to the text before counting:
      * the plain apostrophe is mapped to the typographic one, so "won't"
        and "won’t" both match the phrase in the keyword list;
      * words starting with 'secretar' (secretary, secretariat, ...) are
        removed, so they are not counted as the keyword 'secret'.

    Args:
        description: Article text. Non-string values (e.g. the NaN pandas
            uses for a missing cell) count as zero keywords.

    Returns:
        Total number of keyword occurrences as an int.
    """
    if not isinstance(description, str):
        return 0
    text = description.lower().replace("'", '’')
    text = re.sub(r'secretar\w*', ' ', text)
    return sum(text.count(word) for word in sensational_keywords)

# Per-row number of sensational keyword hits in the description text.
keyword_hits = data['Description'].apply(count_sensational_keywords)
data['sensational_keyword_count'] = keyword_hits


In [32]:
# Step 4: Topic Modeling Using LDA
# Bag-of-words counts over the descriptions (values cast to str first),
# limited to 300 features with English stop words removed.
count_vectorizer = CountVectorizer(stop_words='english', max_features=300)
description_text = data['Description'].astype(str)
count_data = count_vectorizer.fit_transform(description_text)

# Fit a 5-component LDA model (fixed seed) and transform the same counts.
lda = LatentDirichletAllocation(random_state=42, n_components=5)
topic_distribution = lda.fit(count_data).transform(count_data)

# Keep, for every row, the index of the topic with the largest weight.
dominant_topic_index = topic_distribution.argmax(axis=1)
data['dominant_topic'] = dominant_topic_index

In [41]:
# Step 5: Sentiment Analysis
def get_sentiment_score_textblob(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Any exception raised while building or analysing the TextBlob is
    swallowed and 0 (neutral) is returned instead.
    """
    try:
        polarity = TextBlob(text).sentiment.polarity
    except Exception:
        # Best-effort: unusable text is scored as neutral rather than failing.
        return 0
    return polarity

In [42]:
# Compute 'sentiment_score' from the descriptions only when the column is not
# already present in the DataFrame.
sentiment_missing = 'sentiment_score' not in data.columns
if sentiment_missing:
    description_polarity = data['Description'].apply(
        lambda raw: get_sentiment_score_textblob(str(raw))
    )
    data['sentiment_score'] = description_polarity

In [44]:
def check_mismatch_headline_description(row, threshold=0.3):
    """Flag rows whose headline and description share little vocabulary.

    A TF-IDF model (English stop words removed) is fitted on just the two
    texts of the row, and the cosine similarity of the two resulting vectors
    is compared against ``threshold``.

    Args:
        row: Mapping / Series with 'Headlines' and 'Description' entries.
            Non-string values (e.g. NaN for a missing cell) are treated as
            empty text.
        threshold: Similarity below this value counts as a mismatch
            (default 0.3, the value previously hard-coded).

    Returns:
        bool: True when the similarity is below ``threshold``. When neither
        text yields a single vocabulary term the similarity is treated as
        0.0, matching the result for a pair where only one text is empty.
    """
    headline = row['Headlines'] if isinstance(row['Headlines'], str) else ''
    description = row['Description'] if isinstance(row['Description'], str) else ''

    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([headline, description])
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary (both texts
        # empty or made only of stop words); there is nothing to compare.
        return 0.0 < threshold

    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    # bool() turns the numpy boolean into a plain Python bool.
    return bool(similarity < threshold)  # Low similarity indicates a mismatch


In [45]:
def check_excessive_capitalization(text):
    """Return True when ``text`` contains more than three all-caps words.

    A word counts when it is longer than one character and
    ``str.isupper()`` is true for it; words are split on whitespace.

    Args:
        text: Text to inspect. Non-string values (e.g. the NaN pandas uses
            for a missing cell) yield False.

    Returns:
        bool: True if at least four qualifying words are present.
    """
    if not isinstance(text, str):
        return False
    shouted_words = sum(1 for word in text.split() if len(word) > 1 and word.isupper())
    # Strictly greater than 3, i.e. four or more fully capitalized words.
    return shouted_words > 3


In [46]:
def check_vague_author(author):
    """Return True when the author field does not identify a person.

    The author is vague when it is missing (non-string, e.g. NaN), blank, or
    contains one of the generic names below (case-insensitive substring
    match).

    Args:
        author: Value of the author field for one article.

    Returns:
        bool: True for a missing, blank or generic author, else False.
    """
    # A missing or blank byline is treated like the 'unknown' entry below.
    if not isinstance(author, str) or not author.strip():
        return True
    vague_authors = ['admin', 'editor', 'newsroom', 'staff', 'unknown']
    lowered = author.lower()
    return any(vague_name in lowered for vague_name in vague_authors)


In [47]:
def count_suspicious_links(description):
    """Count the http/https URLs that appear in ``description``.

    Every URL matched by the pattern is counted; no per-URL check of whether
    the link is actually suspicious is performed here.

    Args:
        description: Article text. Non-string values (e.g. the NaN pandas
            uses for a missing cell) yield 0.

    Returns:
        int: Number of URL matches found.
    """
    if not isinstance(description, str):
        return 0
    urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', description)
    return len(urls)

In [48]:
def check_short_sensational_description(description):
    """Return True for a short description that is heavy on sensational words.

    The description qualifies when it is shorter than 100 characters and
    ``count_sensational_keywords`` reports more than one keyword hit.

    Args:
        description: Article text. Non-string values (e.g. the NaN pandas
            uses for a missing cell) yield False.

    Returns:
        bool: True when both conditions hold.
    """
    if not isinstance(description, str):
        return False
    # Reuse the shared counter instead of repeating the keyword loop here.
    return len(description) < 100 and count_sensational_keywords(description) > 1

In [49]:
# Apply new criteria to dataset
# Each feature is computed and stored immediately, in this order, so the new
# columns are appended to the DataFrame in the same sequence.
caps_flags = data['Headlines'].apply(check_excessive_capitalization)
data['excessive_capitalization'] = caps_flags

# Row-wise check: needs both 'Headlines' and 'Description' of the same row.
mismatch_flags = data.apply(check_mismatch_headline_description, axis=1)
data['headline_description_mismatch'] = mismatch_flags

author_flags = data['Author'].apply(check_vague_author)
data['vague_author'] = author_flags

link_counts = data['Description'].apply(count_suspicious_links)
data['suspicious_links_count'] = link_counts

short_flags = data['Description'].apply(check_short_sensational_description)
data['short_sensational_description'] = short_flags


In [54]:
# Step 7: Consolidation and Enhanced Logical Labeling
def enhanced_determine_fake_news(row, min_indicators=2):
    """Combine the per-row heuristics into a final fake/real label.

    Ten indicators are evaluated; each one that fires adds one point. The row
    is labelled fake when at least ``min_indicators`` of them fire.

    Args:
        row: Mapping / Series holding the feature columns read below.
        min_indicators: Number of fired indicators required to label the row
            as fake (default 2, the threshold the code previously used).

    Returns:
        int: 0 when the row is labelled fake, 1 otherwise.
    """
    score = row['sentiment_score']

    indicators = [
        # Suspicious source. The credibility column holds the strings
        # 'reputable' / 'suspicious' / 'unknown'; the legacy numeric code -1
        # is still accepted.
        row['source_credibility'] in ('suspicious', -1),
        # Clickbait headline. The column holds 'clickbait' / 'not_clickbait';
        # the legacy numeric code 1 is still accepted.
        row['headline_type'] in ('clickbait', 1),
        # High count of sensational keywords
        row['sensational_keyword_count'] > 2,
        # Dominant topic treated as sensational or dubious.
        # NOTE(review): topic indices 1 and 2 are hard-coded; confirm they
        # still correspond to the intended topics of the fitted model.
        row['dominant_topic'] in [1, 2],
        # Extreme sentiment (either direction)
        score < -0.5 or score > 0.5,
        # Excessive capitalization
        row['excessive_capitalization'],
        # Headline-Description mismatch
        row['headline_description_mismatch'],
        # Vague author
        row['vague_author'],
        # Multiple suspicious links
        row['suspicious_links_count'] > 2,
        # Short and sensational description
        row['short_sensational_description'],
    ]
    fake_indicators = sum(1 for fired in indicators if fired)

    # Label as fake (0) if at least `min_indicators` indicators are present
    return 0 if fake_indicators >= min_indicators else 1

In [55]:
# Run the consolidated rule over every row and store its result as the final label.
final_labels = data.apply(enhanced_determine_fake_news, axis=1)
data['Target_final'] = final_labels

In [57]:
# Write the DataFrame, including the new columns, to CSV without the index.
output_path = '1.csv'
data.to_csv(output_path, index=False)