## Part 1: Analyze the Fake News Dataset

In [9]:
# 1: Import Dataset

import nltk
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
from nltk.tokenize import word_tokenize
from cleantext import clean

# Fetch the Punkt tokenizer data (used by word_tokenize further down).
nltk.download('punkt_tab')

# 1: Load the sample and, in the same step, keep an untouched copy of the
# article text in 'raw_content' so the original survives the cleaning step.
df = pd.read_csv("news_sample.csv").assign(
    raw_content=lambda frame: frame['content']
)

def clean_text_lib(text):
    """Normalise one article text with ``cleantext.clean``.

    The options switched on here are lower-casing, line-break removal, the
    URL / e-mail / number handling flags and punctuation removal. Every other
    option of ``clean`` keeps its library default.

    Args:
        text: The raw article text to clean.

    Returns:
        The value returned by ``clean`` for ``text`` with the options above.
    """
    cleaning_options = {
        "lower": True,
        "no_line_breaks": True,
        "no_urls": True,
        "no_emails": True,
        "no_numbers": True,
        "no_punct": True,
    }
    return clean(text, **cleaning_options)
    
# 2: Clean the data
# Cells that are not strings (e.g. missing articles) become "" so that every
# entry of 'clean_content' is a str.
df['clean_content'] = df['content'].apply(lambda x: clean_text_lib(x) if isinstance(x, str) else "")

# 3: Vocabulary size of the raw text: join every article in 'content' into
# one string, tokenise it and count the distinct tokens.
raw_text = " ".join(df['content'].dropna().tolist())
raw_tokens = word_tokenize(raw_text)
unique_raw_words = set(raw_tokens)
print("unique words in raw text:", len(unique_raw_words))

# Same measurement on the cleaned text in 'clean_content'.
clean_text_all = " ".join(df['clean_content'].dropna().tolist())
clean_tokens = word_tokenize(clean_text_all)
unique_clean_words = set(clean_tokens)
print("unique words in cleaned text:", len(unique_clean_words))

# Occurrences of each token in the cleaned text, and the 50 most common ones.
word_freq = Counter(clean_tokens)
most_common_50 = word_freq.most_common(50)

# Split the (word, count) pairs into two parallel tuples for plotting.
# zip(*[]) yields nothing to unpack, so an empty corpus gets two empty tuples
# instead of a ValueError.
words, frequencies = zip(*most_common_50) if most_common_50 else ((), ())

# Plot the 50 most frequent words (skipped when there is nothing to draw).
if words:
    fig, ax = plt.subplots(figsize=(15, 5))
    ax.bar(words, frequencies)
    ax.set_xlabel("Words")
    ax.set_ylabel("Frequency")
    ax.set_title("50 Most frequent words in cleaned text")
    # Vertical labels keep 50 words readable on the x axis.
    ax.tick_params(axis="x", labelrotation=90)
    fig.tight_layout()
    plt.show()





# 2: Dataset Analysis

# A: Determine which article types should be omitted, if any.
OMITTED_TYPES = [
    'satire',     # leave out satire (just humor)
    'state',      # has no data
    'junksci',    # misleading science
    'hate',       # not reliable, just hate
    'clickbait',  # exaggerated
    'political',  # specific viewpoint
]

# B: Group the remaining types into 'fake' and 'reliable'. Argue for your choice.
# NOTE(review): 'unreliable' and 'unknown' are grouped under 'reliable' exactly
# as in the original grouping, but that looks contradictory -- confirm it is
# intended, because it directly determines the class balance printed below.
TYPE_TO_LABEL = {
    # fake
    'conspiracy': 'fake',
    'fake': 'fake',
    # reliable
    'reliable': 'reliable',
    'unreliable': 'reliable',
    'bias': 'reliable',
    'unknown': 'reliable',
}

# One membership mask instead of a chain of separate filters. The copy makes
# the relabelling below write to a frame that is independent of the unfiltered one.
df = df[~df['type'].isin(OMITTED_TYPES)].copy()

# Report any type that is neither omitted nor grouped, so it is dropped
# visibly rather than silently surviving as a third class.
unmapped_types = sorted(set(df['type'].dropna()) - set(TYPE_TO_LABEL))
if unmapped_types:
    print("dropping article types with no fake/reliable grouping:", unmapped_types)

# Types absent from the mapping (and missing types) become NaN here; those rows
# carry no label, so they are removed to leave a strictly binary 'type' column.
df['type'] = df['type'].map(TYPE_TO_LABEL)
df = df.dropna(subset=['type'])
print(df['type'].value_counts())

# C: Examine the percentage distribution of 'reliable' vs. 'fake' articles. Is the dataset balanced? Discuss the importance of a balanced distribution.

# reliable vs fake - share of each label (fractions that sum to 1)
print(df['type'].value_counts(normalize=True))


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Mariu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


unique words in raw text: 20948
unique words in cleaned text: 16608
type
fake        186
reliable     21
Name: count, dtype: int64
type
fake        0.898551
reliable    0.101449
Name: proportion, dtype: float64


## Part 2: Gathering Links

In [10]:
# 1: Library installation
import requests
from bs4 import BeautifulSoup

# 2: Retrieve HTML Content
# The timeout stops the cell from hanging indefinitely on an unresponsive
# server; raise_for_status() fails here on a 4xx/5xx reply instead of letting
# an error page be parsed as if it were the article listing.
response = requests.get('https://www.bbc.com/news/world/europe', timeout=10)
response.raise_for_status()
contents = response.text

# 3: Extract Articles
# Parse the page with Python's built-in HTML parser.
soup = BeautifulSoup(contents, 'html.parser')


## Part 3: Scraping Article Text

In [11]:
# 1: Article Inspection

# 2: Text Scraping Function

# 3: Scrape All Articles

# 4: Data Storage

# 5: Discussion

## Part 4: Preservation
Keep the data that you have scraped so you can use it for your Group Project!