In [1]:
pip install pandas scikit-learn nltk streamlit


Collecting streamlit
  Downloading streamlit-1.46.1-py3-none-any.whl.metadata (9.0 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.46.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hI

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords
import string
import joblib


In [3]:
# Download the NLTK stop-word corpus and keep the English list as a set so
# the membership test inside clean_text is a hash lookup.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Translation table that deletes every character in string.punctuation.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise a piece of text for TF-IDF vectorisation.

    The text is lower-cased, characters listed in ``string.punctuation`` are
    removed, the result is split on whitespace and English stop words are
    dropped.

    Args:
        text: The raw text. Non-string values (for example the float NaN that
            pandas uses for missing cells) are treated as empty text.

    Returns:
        str: The remaining tokens joined by single spaces; ``''`` when
        nothing remains or when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''
    stripped = text.lower().translate(_PUNCT_TABLE)
    return ' '.join(word for word in stripped.split() if word not in stop_words)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [13]:
import zipfile

# Unpack every member of the dataset archive into the current working directory.
with zipfile.ZipFile("real and fake news.zip", mode='r') as archive:
    archive.extractall()


In [14]:
import pandas as pd

# Read both extracted CSV files into DataFrames (fake first, then real).
fake, real = (pd.read_csv(csv_name) for csv_name in ("Fake.csv", "True.csv"))


In [15]:
import zipfile
import pandas as pd

# Step 1: Extract zip file
with zipfile.ZipFile("real and fake news.zip", 'r') as zip_ref:
    zip_ref.extractall()

# Steps 2 and 3: Read each CSV file and attach its class label.
# .assign returns a new frame, so re-running this cell always starts from the
# file contents instead of mutating a frame left over from an earlier run.
fake = pd.read_csv("Fake.csv").assign(label=0)  # Fake
real = pd.read_csv("True.csv").assign(label=1)  # Real

# Step 4: Combine and shuffle.
# random_state pins the shuffle so every run produces the same row order
# (42 matches the seed passed to train_test_split later in the notebook).
df = (
    pd.concat([fake, real], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)


In [16]:
stop_words = set(stopwords.words("english"))

# Translation table that deletes every character in string.punctuation.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise a piece of text for TF-IDF vectorisation.

    Lower-cases the text, removes the characters in ``string.punctuation``,
    splits on whitespace and drops English stop words.

    Args:
        text: The raw text. Non-string values (e.g. NaN from a missing cell)
            are treated as empty text.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''
    stripped = text.lower().translate(_PUNCT_TABLE)
    return ' '.join(word for word in stripped.split() if word not in stop_words)


# Keep the untouched article body in 'text_raw' the first time this cell runs.
# Building 'text' from 'text_raw' (instead of from 'text' itself) makes the
# cell idempotent: re-running it no longer prepends the title a second time.
if 'text_raw' not in df.columns:
    df['text_raw'] = df['text']

# Model input = title + body. fillna('') stops a missing title or body from
# turning the whole concatenation into NaN.
df['text'] = (df['title'].fillna('') + " " + df['text_raw'].fillna('')).apply(clean_text)


In [17]:
X = df['text']
y = df['label']

# Split the raw text BEFORE fitting the vectorizer. Fitting TF-IDF on the full
# corpus would let the test documents influence the vocabulary and IDF
# weights, which leaks test information into training.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn vocabulary/IDF from the training documents only, then apply the
# fitted transform to the held-out documents.
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)


In [18]:
# Instantiate both classifiers with their default settings, then train each
# one on the same training matrix and labels.
lr_model, nb_model = LogisticRegression(), MultinomialNB()
lr_model.fit(X_train, y_train)
nb_model.fit(X_train, y_train)


In [19]:
# Predict the held-out set with each trained model.
y_pred = lr_model.predict(X_test)
y_pred_nb = nb_model.predict(X_test)

# Print one classification report per model, logistic regression first.
for heading, predictions in (("Logistic Regression:", y_pred), ("Naive Bayes:", y_pred_nb)):
    print(heading)
    print(classification_report(y_test, predictions))


Logistic Regression:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      4643
           1       0.98      0.99      0.99      4337

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

Naive Bayes:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      4643
           1       0.94      0.94      0.94      4337

    accuracy                           0.94      8980
   macro avg       0.94      0.94      0.94      8980
weighted avg       0.94      0.94      0.94      8980



In [20]:
# Persist the logistic-regression model and the fitted TF-IDF vectorizer;
# app.py loads both files by these exact names.
joblib.dump(lr_model, filename='model_lr.pkl')
joblib.dump(tfidf, filename='tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [24]:
%%writefile app.py
import streamlit as st
import joblib
import string
import nltk
from nltk.corpus import stopwords

# English stop words, kept as a set for fast membership tests in clean_input.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load model and vectorizer
model = joblib.load("model_lr.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

def clean_input(text):
    """Normalise user text before it is vectorised.

    Lower-cases the text, removes the characters in ``string.punctuation``,
    splits on whitespace and drops English stop words.

    Args:
        text (str): Raw text typed by the user.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    words = [word for word in text.split() if word not in stop_words]
    return ' '.join(words)

st.title("📰 Fake News Detector")

user_input = st.text_area("Paste a news article or headline")

if st.button("Classify"):
    if not user_input.strip():
        # Without this guard an empty box would still be vectorised and
        # reported as a Real/Fake verdict.
        st.warning("Please enter some text.")
    else:
        cleaned = clean_input(user_input)
        vectorized = vectorizer.transform([cleaned])
        prediction = model.predict(vectorized)[0]
        # Label 1 is shown as real news; any other label as fake.
        result = "🟢 Real News" if prediction == 1 else "🔴 Fake News"
        st.subheader(f"Prediction: {result}")



Writing app.py


In [25]:
!pip install streamlit




In [29]:
import os
from getpass import getpass

from pyngrok import ngrok

# SECURITY: never hard-code the authtoken in the notebook. It is read from the
# NGROK_AUTHTOKEN environment variable, falling back to a hidden prompt, so the
# secret is never saved in the file or its outputs.
# NOTE(review): the token previously committed in this cell was exposed and
# should be revoked in the ngrok dashboard.
ngrok_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_token)



In [33]:
!pip install --upgrade --quiet pyngrok



In [34]:
import os
from getpass import getpass

from pyngrok import ngrok

# SECURITY: the authtoken (from https://dashboard.ngrok.com/get-started/your-authtoken)
# is read from the NGROK_AUTHTOKEN environment variable, falling back to a
# hidden prompt, so the secret is never stored in the notebook.
# NOTE(review): the token previously committed in this cell was exposed and
# should be revoked in the ngrok dashboard.
ngrok_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_token)


In [35]:
# NOTE(review): this message is printed unconditionally; it does not verify
# that the ngrok.set_auth_token call in the previous cell succeeded.
print("Authtoken set successfully.")


Authtoken set successfully.


In [36]:
# Create public URL tunnel
public_url = ngrok.connect(8501)
print("Streamlit app URL:", public_url)

# Run Streamlit app silently
!streamlit run app.py &>/dev/null &


Streamlit app URL: NgrokTunnel: "https://8e3d562cd6d0.ngrok-free.app" -> "http://localhost:8501"


In [37]:
%%writefile app.py
import streamlit as st
import joblib
import string
import nltk
from nltk.corpus import stopwords

# Page configuration is issued before any other Streamlit call in the script.
st.set_page_config(page_title="Fake News Detector", layout="centered")

# Translation table that deletes every character in string.punctuation.
PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@st.cache_resource
def load_resources():
    """Load the stop-word set, the classifier and the TF-IDF vectorizer.

    Streamlit re-executes this script on each widget interaction;
    ``st.cache_resource`` keeps the returned objects so the corpus download
    and the two ``joblib.load`` calls are not repeated on every rerun.
    The cached objects stay in use until the cache is cleared or the server
    restarts, even if the .pkl files are overwritten in the meantime.

    Returns:
        tuple: ``(stop_words, model, vectorizer)``.
    """
    nltk.download('stopwords')
    return (
        set(stopwords.words('english')),
        joblib.load("model_lr.pkl"),
        joblib.load("tfidf_vectorizer.pkl"),
    )


stop_words, model, vectorizer = load_resources()


def clean_text(text):
    """Normalise user text before it is vectorised.

    Lower-cases the text, removes the characters in ``string.punctuation``,
    splits on whitespace and drops English stop words.

    Args:
        text (str): Raw text typed by the user.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    stripped = text.lower().translate(PUNCTUATION_TABLE)
    return ' '.join(word for word in stripped.split() if word not in stop_words)


# Streamlit UI
st.title("📰 Fake News Detection App")

user_input = st.text_area("Enter a news article or headline:")

if st.button("Predict"):
    if not user_input.strip():
        st.warning("Please enter some text.")
    else:
        cleaned = clean_text(user_input)
        vectorized = vectorizer.transform([cleaned])
        prediction = model.predict(vectorized)[0]
        # Confidence is the highest class probability. Taking the maximum
        # avoids using the label value as a positional index into the
        # probability row, which is only valid when the labels are 0..n-1
        # in column order.
        confidence = model.predict_proba(vectorized)[0].max()

        if prediction == 1:
            st.success(f"🟢 **Real News** (Confidence: {confidence:.2f})")
        else:
            st.error(f"🔴 **Fake News** (Confidence: {confidence:.2f})")


Overwriting app.py


In [38]:
from pyngrok import ngrok


ngrok.set_auth_token("2zocwa4P8nIsCuiOTe8SXOoidZc_32HQaJJxroPk19g2yBwAe")


public_url = ngrok.connect(8501)
print("Streamlit app running at:", public_url)

!streamlit run app.py &>/dev/null &


Streamlit app running at: NgrokTunnel: "https://071905f42075.ngrok-free.app" -> "http://localhost:8501"


In [39]:
%%writefile app.py
import streamlit as st
import pandas as pd
import joblib
import string
import nltk
from nltk.corpus import stopwords

# Page configuration is issued before any other Streamlit call in the script.
st.set_page_config(page_title="📰 Fake News Detection", layout="centered")

# Translation table that deletes every character in string.punctuation.
PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@st.cache_resource
def load_resources():
    """Load the stop-word set, the classifier and the TF-IDF vectorizer.

    Streamlit re-executes this script on each widget interaction;
    ``st.cache_resource`` keeps the returned objects so the corpus download
    and the two ``joblib.load`` calls are not repeated on every rerun.
    The cached objects stay in use until the cache is cleared or the server
    restarts, even if the .pkl files are overwritten in the meantime.

    Returns:
        tuple: ``(stop_words, model, vectorizer)``.
    """
    nltk.download('stopwords')
    return (
        set(stopwords.words('english')),
        joblib.load("model_lr.pkl"),
        joblib.load("tfidf_vectorizer.pkl"),
    )


stop_words, model, vectorizer = load_resources()


def clean_text(text):
    """Normalise text before it is vectorised.

    Lower-cases the text, removes the characters in ``string.punctuation``,
    splits on whitespace and drops English stop words.

    Args:
        text: Raw text. Non-string values (e.g. NaN from an empty CSV cell)
            are treated as empty text.

    Returns:
        str: The remaining tokens joined by single spaces, or ``""`` for
        non-string input.
    """
    if not isinstance(text, str):
        return ""
    stripped = text.lower().translate(PUNCTUATION_TABLE)
    return ' '.join(word for word in stripped.split() if word not in stop_words)


def read_uploaded_csv(upload):
    """Parse an uploaded CSV file.

    Args:
        upload: The object returned by ``st.file_uploader``.

    Returns:
        pandas.DataFrame | None: The parsed frame, or ``None`` when the file
        is empty, malformed or not decodable (an error is shown in the UI).
    """
    try:
        return pd.read_csv(upload)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"⚠️ Could not read the uploaded CSV: {exc}")
        return None


# Streamlit UI
st.title("📰 Fake News Detection App")

# Section 1: Single Article
st.subheader("📌 Check a Single News Article or Headline")
user_input = st.text_area("Enter news text here")

if st.button("Predict Single"):
    if not user_input.strip():
        # Without this guard an empty box would still receive a verdict.
        st.warning("Please enter some text.")
    else:
        cleaned = clean_text(user_input)
        vectorized = vectorizer.transform([cleaned])
        prediction = model.predict(vectorized)[0]
        # Highest class probability, computed the same way as the bulk path.
        confidence = model.predict_proba(vectorized)[0].max()
        if prediction == 1:
            st.success(f"🟢 Real News (Confidence: {confidence:.2f})")
        else:
            st.error(f"🔴 Fake News (Confidence: {confidence:.2f})")

# Section 2: Bulk CSV Upload
st.subheader("📁 Upload a CSV File with News Articles")

uploaded_file = st.file_uploader("Upload a CSV file (must have a 'text' column)", type=["csv"])

if uploaded_file:
    df = read_uploaded_csv(uploaded_file)

    if df is not None:
        if 'text' not in df.columns:
            st.warning("⚠️ CSV must contain a column named 'text'.")
        elif df.empty:
            # Guard against a header-only file: there is nothing to predict.
            st.warning("⚠️ The uploaded CSV contains no rows.")
        else:
            df['cleaned'] = df['text'].apply(clean_text)
            vectors = vectorizer.transform(df['cleaned'])
            df['prediction'] = model.predict(vectors)
            # Per-row confidence = highest class probability.
            df['confidence'] = model.predict_proba(vectors).max(axis=1)
            df['label'] = df['prediction'].apply(lambda x: 'Real' if x == 1 else 'Fake')

            st.success("✅ Prediction complete!")
            st.write(df[['text', 'label', 'confidence']])


Overwriting app.py


In [40]:
from pyngrok import ngrok

# Set your ngrok authtoken
ngrok.set_auth_token("2zocwa4P8nIsCuiOTe8SXOoidZc_32HQaJJxroPk19g2yBwAe")

# Start Streamlit tunnel
public_url = ngrok.connect(8501)
print("Streamlit app running at:", public_url)

!streamlit run app.py &>/dev/null &


Streamlit app running at: NgrokTunnel: "https://0c9ab99cf240.ngrok-free.app" -> "http://localhost:8501"
