In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from bayes_opt import BayesianOptimization
import warnings

warnings.filterwarnings("ignore", category=FutureWarning, message=".*multi_class.*")


nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adiya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adiya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
data = pd.read_csv('Fake_News_Dataset.csv')
data.head()

Unnamed: 0,tweet,label
0,Our daily update is published. States reported...,real
1,Alfalfa is the only cure for COVID-19.,fake
2,President Trump Asked What He Would Do If He W...,fake
3,States reported 630 deaths. We are still seein...,real
4,This is the sixth time a global health emergen...,real


In [3]:
if 'tweet' in data.columns and 'label' in data.columns:
    data.rename(columns={'tweet': 'text', 'label': 'class'}, inplace=True)

data['class'] = data['class'].map({'real': 1, 'fake': 0})

In [4]:
data = data.sample(frac=1).reset_index(drop=True)

In [5]:
def remove_links(text):
    return re.sub(r'http\S+|www\S+', '', text)

data['text'] = data['text'].astype(str).apply(remove_links)

In [6]:
def preprocess_text(text_data):
    preprocessed_text = []
    for sentence in tqdm(text_data):
        sentence = re.sub(r'[^\w\s]', '', sentence)
        preprocessed_text.append(' '.join(token.lower() for token in sentence.split() if token not in stopwords.words('english')))
    return preprocessed_text

data['text'] = preprocess_text(data['text'].values)

100%|██████████| 2524/2524 [00:24<00:00, 103.34it/s]


In [7]:
x_train, x_test, y_train, y_test = train_test_split(data['text'], data['class'], test_size=0.20, random_state=42)


In [8]:
vectorization = TfidfVectorizer()
x_train = vectorization.fit_transform(x_train)
x_test = vectorization.transform(x_test)


In [9]:
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1,5,4,3,2, 10],
    'solver': ['liblinear']
}

In [10]:
random_search = RandomizedSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5, scoring='accuracy', n_iter=5, random_state=42, n_jobs=-1)
random_search.fit(x_train, y_train)
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)


Best Hyperparameters: {'solver': 'liblinear', 'penalty': 'l2', 'C': 2}


In [11]:
train_accuracy = accuracy_score(y_train, random_search.predict(x_train))
test_accuracy = accuracy_score(y_test, random_search.predict(x_test))

print("Logistic Regression Training Accuracy:", train_accuracy)
print("Logistic Regression Testing Accuracy:", test_accuracy)

Logistic Regression Training Accuracy: 0.9806835066864784
Logistic Regression Testing Accuracy: 0.902970297029703


In [15]:
import pickle

# Assuming random_search is your trained model and vectorization is your vectorizer
with open('model.pkl', 'wb') as f:
    pickle.dump((random_search, vectorization), f)


In [16]:
import pickle

# Load the saved model and vectorizer
with open('model.pkl', 'rb') as f:
    loaded_model, loaded_vectorizer = pickle.load(f)

def predict_news(text):
    text_vectorized = loaded_vectorizer.transform([text])
    prediction = loaded_model.predict(text_vectorized)
    if prediction == 0:
        return "Fake News ha"
    return "Fake News Nhi ha"

# Test predictions
print(predict_news("Alfalfa is the only cure for COVID"))
print(predict_news("Our daily update is published. States reported 734k tests 39k new cases and 532 deaths. Current hospitalizations fell below 30k for the first time since June 22. https://t.co/wzSYMe0Sht"))
print(predict_news("Recent reports claim that a new herbal remedy can completely cure diabetes, but medical experts warn there's no scientific evidence supporting this. Always verify health information with trusted sources before believing or sharing."))


Fake News ha
Fake News Nhi ha
Fake News ha


In [None]:
from fastapi import FastAPI, Request
import joblib

app = FastAPI()
with open('model.pkl', 'rb') as f:
    loaded_model, loaded_vectorizer = pickle.load(f)
@app.post("/predict")
async def predict_news(request: Request):
    data = await request.json()  
    text = data.get("text")

    if text:
        text_vectorized = loaded_vectorizer.transform([text])
        prediction = loaded_model.predict(text_vectorized)
        if prediction == 0:
            return {"prediction": "Fake News ha"}
        return {"prediction": "Fake News Nhi ha"}
    else:
        return {"error": "No text provided"}

FileNotFoundError: [Errno 2] No such file or directory: 'model.pkl'