## **FAKE NEWS CLASSIFICATION**
---

### DATA COLLECTION

In [1]:
import xml.etree.ElementTree as ET  
from urllib.error import HTTPError
import urllib.request
import threading
import datetime
import random
import time
import csv
import pandas as pd


In [2]:
# kaggle dataset for trainng
df = pd.read_csv('WELFake_Dataset.csv')

In [3]:
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
...,...,...,...,...
72129,72129,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0
72130,72130,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1
72131,72131,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0
72132,72132,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0


In [4]:
# web scrapping data from CNN and BBC which can be used for testing model

# Date and random seed
now = datetime.datetime.now()
curr_date = f'{now.day}/{now.month}/{now.year}'
random.seed(now)

# URLs for BBC and CNN
BBC_URLS = ('world', 'uk', 'business', 'politics', 'health',
            'education', 'science_and_environment', 'technology',
            'entertainment_and_arts', 'world/africa')

CNN_URLS = ('edition', 'edition_world', 'edition_africa', 'edition_americas',
            'edition_asia', 'edition_golf', 'edition_motorsport', 'edition_tennis')

def fetch_articles(directory, site):
    """Fetch articles from BBC or CNN RSS feed."""
    try:
        url = f'http://feeds.bbci.co.uk/news/{directory}/rss.xml' if site == 'BBC' else f'http://rss.cnn.com/rss/{directory}.rss'
        tree = ET.parse(urllib.request.urlopen(url))
    except (HTTPError, ET.ParseError) as e:
        print(f"Error: {e}")
        return None
    
    return [elem.text for elem in tree.getroot().iter('title') if elem.text]

def save_to_csv(articles, directory, site, error=False):
    """Save articles to a CSV file."""
    filename = 'errorLog.csv' if error else f'{site}Articles.csv'
    with open(filename, 'a', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['date', 'dir', 'title'], lineterminator='\n')
        for article in articles:
            writer.writerow({'date': curr_date, 'dir': directory, 'title': article})

def scrape(directory, site):
    """Scrape articles from a specific site and directory."""
    articles = fetch_articles(directory, site)
    if articles:
        save_to_csv(articles, directory, site)
        print(f'Scraped articles from {site} - {directory}')
    else:
        save_to_csv([f'Error: Could not scrape {directory}'], directory, site, error=True)
        print(f'Failed to scrape {directory}')

def bbc_scrape():
    """Scrape articles from BBC."""
    for url in BBC_URLS:
        scrape(url, 'BBC')
        time.sleep(random.uniform(1, 3))

def cnn_scrape():
    """Scrape articles from CNN."""
    for url in CNN_URLS:
        scrape(url, 'CNN')
        time.sleep(random.uniform(1, 3))



since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(now)


In [5]:
# Run scraping functions in separate threads
def run_scrapers():
    bbc_thread = threading.Thread(target=bbc_scrape)
    cnn_thread = threading.Thread(target=cnn_scrape)
    
    bbc_thread.start()
    cnn_thread.start()
    
    bbc_thread.join()
    cnn_thread.join()

run_scrapers()


Scraped articles from BBC - world
Scraped articles from BBC - uk
Scraped articles from CNN - edition
Scraped articles from BBC - business
Scraped articles from CNN - edition_world
Scraped articles from BBC - politics
Scraped articles from CNN - edition_africa
Scraped articles from BBC - health
Scraped articles from CNN - edition_americas
Scraped articles from BBC - education
Scraped articles from CNN - edition_asia
Scraped articles from BBC - science_and_environment
Scraped articles from CNN - edition_golf
Scraped articles from BBC - technology
Scraped articles from CNN - edition_motorsport
Scraped articles from BBC - entertainment_and_arts
Scraped articles from CNN - edition_tennis
Scraped articles from BBC - world/africa


In [6]:
# Load scraped data into DataFrames
bbc_df = pd.read_csv('BBCArticles.csv')
cnn_df = pd.read_csv('CNNArticles.csv')\

# Display the first few rows of each DataFrame
bbc_df.head()


Unnamed: 0,24/7/2024,world,BBC News
0,24/7/2024,world,BBC News
1,24/7/2024,world,Kamala Harris attacks Trump over 'fear and hat...
2,24/7/2024,world,Graphic footage shows US officers stood over b...
3,24/7/2024,world,Drenched in blood - how Bangladesh protests tu...
4,24/7/2024,world,"Gaza release deal has to happen now, hostage's..."


In [7]:
cnn_df.head()

Unnamed: 0,24/7/2024,edition,CNN.com - RSS Channel - App International Edition
0,24/7/2024,edition,CNN.com - RSS Channel - App International Edition
1,24/7/2024,edition,Trump pleads not guilty to 34 felony counts
2,24/7/2024,edition,Haberman reveals why Trump attacked judge and ...
3,24/7/2024,edition,What to know about the Trump indictment on the...
4,24/7/2024,edition,READ: Trump indictment related to hush money p...


In [8]:
# Rename columns for consistency and clarity
cnn_df.rename(columns={'edition': 'Title', 'CNN.com - RSS Channel - App International Edition': 'Text', '24/7/2024': 'Date'}, inplace=True)
bbc_df.rename(columns={'world': 'Title', 'BBC News': 'Text', '24/7/2024': 'Date'}, inplace=True)

# Add unique identifiers
cnn_df['ID'] = ['CNN_' + str(i) for i in range(len(cnn_df))]
bbc_df['ID'] = ['BBC_' + str(i) for i in range(len(bbc_df))]


# Concatenate the DataFrames
Web_scrap = pd.concat([bbc_df[['ID', 'Title', 'Text', 'Date' ]],
                         cnn_df[['ID', 'Title', 'Text', 'Date']]], 
                        ignore_index=True)

Web_scrap.head(-5)

Unnamed: 0,ID,Title,Text,Date
0,BBC_0,world,BBC News,24/7/2024
1,BBC_1,world,Kamala Harris attacks Trump over 'fear and hat...,24/7/2024
2,BBC_2,world,Graphic footage shows US officers stood over b...,24/7/2024
3,BBC_3,world,Drenched in blood - how Bangladesh protests tu...,24/7/2024
4,BBC_4,world,"Gaza release deal has to happen now, hostage's...",24/7/2024
...,...,...,...,...
674,CNN_277,edition_tennis,'Like a freight train once we got hot': How 't...,24/7/2024
675,CNN_278,edition_tennis,French Open crowd boos as Ukraine's Marta Kost...,24/7/2024
676,CNN_279,edition_tennis,With Rafael Nadal absent and Iga Światek's dom...,24/7/2024
677,CNN_280,edition_tennis,Tennis player Mikael Ymer disqualified from ma...,24/7/2024


### DATA PREPROCESSING

In [9]:
# Clean column names
Web_scrap.columns = Web_scrap.columns.str.strip().str.lower().str.replace(' ', '_')

# Check for missing values
Web_scrap.isnull().sum()
df.isnull().sum()

# Handle missing values by filling with empty strings or removing rows
Web_scrap.fillna('', inplace=True)
df.drop_duplicates(inplace=True)
df['text'] = df['text'].fillna('')
df['title'] = df['title'].fillna('')

In [10]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk

# Download stopwords if not already present
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Lowercase text
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords and stem tokens
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    # Rejoin tokens into a single string
    return ' '.join(tokens)


# Apply preprocessing to the text column
Web_scrap['text'] = Web_scrap['text'].apply(preprocess_text)
df['text'] = df['text'].apply(preprocess_text)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:

from sklearn.feature_extraction.text import TfidfVectorizer
# Combine title and text for feature extraction
df['content'] = df['title'] + ' ' + df['text']

# Convert labels to integers if they are not already
df['label'] = df['label'].astype(int)

# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(df['content'])
y = df['label']

# Verify the shape and type of the data
print("Features shape:", X.shape)
print("Target shape:", y.shape)
print("Feature names:", vectorizer.get_feature_names_out()[:10])  # Print the first 10 feature names


Features shape: (72134, 5000)
Target shape: (72134,)
Feature names: ['000' '2016' '_____' 'aaron' 'abandon' 'abba' 'abbott' 'abc' 'abdullah'
 'abe']


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
# Train Logistic Regression Model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


# Predictions
y_pred = model.predict(X_test)

In [14]:
# Evaluation
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.94      0.94      7089
           1       0.94      0.95      0.95      7338

    accuracy                           0.95     14427
   macro avg       0.95      0.95      0.95     14427
weighted avg       0.95      0.95      0.95     14427

Confusion Matrix:
[[6664  425]
 [ 357 6981]]


In [15]:
# Save the Web_scrap DataFrame to a CSV file
web_scrap_file_path = 'WebScrap_Processed.csv'
Web_scrap.to_csv(web_scrap_file_path, index=False, encoding='utf-8')

# Save the processed Kaggle dataset to a CSV file
processed_kaggle_file_path = 'WELFake_Processed.csv'
df.to_csv(processed_kaggle_file_path, index=False, encoding='utf-8')
