In [8]:
 # Install the opendatasets package
!pip install opendatasets

import opendatasets as od
import pandas as pd

# Where the Kaggle dataset lives and where its fake-news CSV lands after download
DATASET_URL = "https://www.kaggle.com/datasets/bhavikjikadara/fake-news-detection"
FAKE_NEWS_CSV = 'fake-news-detection/fake.csv'

# Fetch the dataset, then read the CSV into a DataFrame and preview its first rows
od.download(DATASET_URL)
df = pd.read_csv(FAKE_NEWS_CSV)
print(df.head())

Skipping, found downloaded files in "./fake-news-detection" (use force=True to force download)
                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  


In [9]:
# Install necessary libraries
!pip install nltk

import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK stop-word corpus, then keep the English list as a set so the
# per-token membership test in preprocess_text is a hash lookup.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

# Function to preprocess text
def preprocess_text(text, stopword_set=None):
    """Normalise raw article text before vectorisation.

    Non-word characters are replaced by spaces, the text is lower-cased, and
    tokens found in the stop-word set are dropped; the surviving tokens are
    joined with single spaces.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()``.
        stopword_set: Optional collection of lower-case tokens to remove.
            When omitted, the notebook-level ``stop_words`` set is used.

    Returns:
        The cleaned, space-separated string (``""`` for missing input).
    """
    # Missing values would otherwise make re.sub raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if stopword_set is None:
        stopword_set = stop_words

    # Every whitespace character is also a non-word character, so after this
    # substitution split() alone collapses the runs of spaces.
    tokens = re.sub(r'\W', ' ', str(text)).lower().split()
    return ' '.join(token for token in tokens if token not in stopword_set)

# Example usage
# NOTE(review): `clean_text` is bound to a string here and is read by the TF-IDF
# example in the next cell, but cell In [14] later rebinds the same name to a
# function (`def clean_text`). Re-running the TF-IDF example after that cell
# would pick up the function instead of this string.
sample_text = "This is an example of a news article! It's full of misinformation."
clean_text = preprocess_text(sample_text)
print(clean_text)

example news article full misinformation


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Install necessary libraries
!pip install scikit-learn

from sklearn.feature_extraction.text import TfidfVectorizer

# Function to extract features using TF-IDF
def extract_features(texts, max_features=5000):
    """Fit a TF-IDF vectorizer on ``texts`` and return the resulting features.

    Args:
        texts: Iterable of document strings to fit on and transform.
        max_features: Vocabulary-size cap handed to ``TfidfVectorizer``.
            Defaults to 5000, the value that used to be hard-coded.

    Returns:
        A ``(features, vectorizer)`` tuple: the output of ``fit_transform``
        and the fitted vectorizer. Keep the vectorizer and reuse it to
        transform any later text so the feature columns line up.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    features = vectorizer.fit_transform(texts)
    return features, vectorizer

# Example usage
# `clean_text` is expected to be the preprocessed sample string produced by the
# preprocessing example in the previous cell.
texts = [clean_text, "Another example of a news article."]
features, vectorizer = extract_features(texts)
# Convert to a dense array purely so the whole matrix can be printed.
print(features.toarray())

[[0.         0.37930349 0.37930349 0.53309782 0.53309782 0.37930349
  0.        ]
 [0.53309782 0.37930349 0.37930349 0.         0.         0.37930349
  0.53309782]]


In [11]:
# Install necessary libraries
!pip install scikit-learn

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

# Function to train a model
def train_model(features, labels, test_size=0.2, random_state=42):
    """Train a logistic-regression classifier and score it on a hold-out split.

    Args:
        features: Feature matrix, one row per sample.
        labels: Target labels aligned with the rows of ``features``.
        test_size: Hold-out proportion forwarded to ``train_test_split``.
            Defaults to 0.2, the value that used to be hard-coded.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to 42, the value that used to be hard-coded.

    Returns:
        A ``(model, accuracy)`` tuple: the fitted ``LogisticRegression`` and
        its accuracy on the held-out portion.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )
    model = LogisticRegression()
    model.fit(X_train, y_train)
    accuracy = accuracy_score(y_test, model.predict(X_test))
    return model, accuracy

# Example usage with a mock dataset
# Rebinds `features`, replacing the TF-IDF matrix from the previous cell with
# six hand-written 2-D points, so the score below is only a smoke test on a
# tiny hold-out and says nothing about the news dataset.
features = np.array([[0.1, 0.2], [0.2, 0.1], [0.3, 0.4], [0.4, 0.3], [0.5, 0.6], [0.6, 0.5]])
labels = np.array([0, 1, 0, 1, 0, 1])  # 1 for fake, 0 for real

model, accuracy = train_model(features, labels)
print(f'Model accuracy: {accuracy}')

Model accuracy: 1.0


In [12]:
from urllib.parse import urlsplit

# Define a list of trusted sources
trusted_sources = ['cnn.com', 'bbc.com', 'nytimes.com']

# Function to verify source credibility
def verify_source(source_url):
    """Return True when ``source_url`` is hosted on a trusted domain.

    The host name is parsed out of the URL and must either equal a trusted
    domain or be one of its subdomains. A plain substring test is not enough:
    it would accept look-alikes such as ``fakecnn.com`` or
    ``cnn.com.evil.example``, and URLs that merely mention a trusted domain
    in their path or query string.

    Args:
        source_url: The article URL, with or without a scheme
            (``https://cnn.com/x`` and ``cnn.com/x`` are both accepted).

    Returns:
        ``True`` if the host matches an entry of ``trusted_sources``;
        ``False`` otherwise, including for empty, non-string or unparsable
        input.
    """
    if not isinstance(source_url, str):
        return False
    candidate = source_url.strip()
    if not candidate:
        return False

    # urlsplit only recognises a host after "//". A scheme, when present,
    # makes the text before the first "/" end with ":"; otherwise treat the
    # input as a bare "host/path" and add the missing "//".
    before_first_slash = candidate.split('/', 1)[0]
    if not before_first_slash.endswith(':') and not candidate.startswith('//'):
        candidate = '//' + candidate

    try:
        host = urlsplit(candidate).hostname  # lower-cased, port and userinfo removed
    except ValueError:
        return False
    if not host:
        return False

    host = host.rstrip('.')  # tolerate a fully-qualified trailing dot
    return any(
        host == domain.lower() or host.endswith('.' + domain.lower())
        for domain in trusted_sources
    )

# Example usage
# The URL below is hosted on cnn.com, one of the entries in trusted_sources.
source_url = 'https://cnn.com/news/article'
is_trusted = verify_source(source_url)
print(f'Source trusted: {is_trusted}')


Source trusted: True


In [13]:
# Import libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import pickle

# Where the fitted artifacts are written. These relative paths are the same
# ones load_model_and_vectorizer() uses as its defaults, so the detection cell
# finds the files without extra arguments.
vectorizer_path = 'vectorizer.pkl'
model_path = 'model.pkl'

# Example dataset
texts = ["This is fake news", "This is real news"]
labels = [1, 0]  # 1 = Fake, 0 = Real

# Train vectorizer and model
# NOTE: this rebinds `texts`, `labels`, `vectorizer` and `model` from earlier
# cells; the artifacts saved below come from this two-sentence toy corpus.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)
model = LogisticRegression()
model.fit(X, labels)

# Save the vectorizer and model
with open(vectorizer_path, 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)

with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)


In [14]:
import re
import pickle

# Function to load the vectorizer and model
def load_model_and_vectorizer(vectorizer_path='vectorizer.pkl', model_path='model.pkl'):
    """Unpickle the saved vectorizer and model from disk.

    Args:
        vectorizer_path: Path of the pickled vectorizer.
        model_path: Path of the pickled model.

    Returns:
        A ``(vectorizer, model)`` tuple, loaded in that order.

    Note:
        ``pickle.load`` can execute arbitrary code, so only point this at
        files you created yourself.
    """
    def _unpickle(path):
        # Read one pickled object and close the file straight away.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    return _unpickle(vectorizer_path), _unpickle(model_path)

# Function to clean text
def clean_text(text):
    """Replace non-word characters with spaces, squeeze whitespace, lower-case.

    Leading or trailing whitespace is squeezed to a single space rather than
    stripped, and no stop words are removed.
    """
    spaced = re.sub(r'\W', ' ', text)
    return re.sub(r'\s+', ' ', spaced).lower()

# Function to detect fake news
def detect_fake_news(article_text, vectorizer=None, model=None):
    """Classify one article as fake or real.

    Args:
        article_text: The raw article text.
        vectorizer: Optional fitted vectorizer. Pass it together with
            ``model`` to avoid unpickling both artifacts on every call.
        model: Optional fitted classifier that predicts ``1`` for fake.

    Returns:
        A dict with the original ``'text'`` and ``'is_fake'``, a plain Python
        ``bool`` so the result can be JSON-serialised and prints the same on
        every NumPy version.
    """
    # Only touch the disk for whichever artifact the caller did not supply.
    if vectorizer is None or model is None:
        loaded_vectorizer, loaded_model = load_model_and_vectorizer()
        if vectorizer is None:
            vectorizer = loaded_vectorizer
        if model is None:
            model = loaded_model

    cleaned = clean_text(article_text)
    features = vectorizer.transform([cleaned])  # Ensure same vectorizer is used
    prediction = model.predict(features)[0]

    return {
        'text': article_text,
        # The comparison yields numpy.bool_ for array predictions; convert it.
        'is_fake': bool(prediction == 1),
    }

# Example usage
# detect_fake_news() loads vectorizer.pkl and model.pkl from the working
# directory, so the cell that pickles them must have been run first.
example_text = "This is a sample news article to test the detection system."
result = detect_fake_news(example_text)
print(result)


{'text': 'This is a sample news article to test the detection system.', 'is_fake': False}
