<a href="https://colab.research.google.com/github/Pawan-1809/Fake_news_prediction_model/blob/main/Untitled4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from gensim.models import LdaModel
from gensim.corpora.dictionary import Dictionary


In [None]:
import csv
tsv_file = '/content/test.tsv'
csv_file = '/content/test.csv'

with open(tsv_file, 'r', newline='', encoding='utf-8') as tsv:
    tsv_reader = csv.reader(tsv, delimiter='\t')

    with open(csv_file, 'w', newline='', encoding='utf-8') as csv_f:
        csv_writer = csv.writer(csv_f)
        for row in tsv_reader:
            csv_writer.writerow(row)


In [None]:
tsv_file = '/content/train.tsv'
csv_file = '/content/train.csv'

with open(tsv_file, 'r', newline='', encoding='utf-8') as tsv:
    tsv_reader = csv.reader(tsv, delimiter='\t')

    with open(csv_file, 'w', newline='', encoding='utf-8') as csv_f:
        csv_writer = csv.writer(csv_f)
        for row in tsv_reader:
            csv_writer.writerow(row)


In [None]:
df = pd.read_csv('/content/train.csv')

In [None]:
df['text'] = df['text'].str.lower().str.replace('[^\w\s]', '')
df['title'] = df['title'].str.lower().str.replace('[^\w\s]', '')

In [None]:
df = df.dropna(subset=['label'])

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label
0,2619,ex-cia head says trump remarks on russia inter...,former cia director john brennan on friday cri...,politicsNews,"July 22, 2017",1
1,16043,you wonâ€™t believe his punishment! hispanic sto...,how did this man come to own this store? there...,Government News,"Jun 19, 2017",0
2,876,federal reserve governor powell's policy views...,president donald trump on thursday tapped fede...,politicsNews,"November 2, 2017",1
3,19963,scoundrel hillary supporter starts â€œtrumpleaks...,hillary clinton ally david brock is offering t...,left-news,"Sep 17, 2016",0
4,10783,nancy pelosi arrogantly dismisses questions on...,pleading ignorance is a perfect ploy for nancy...,politics,"May 26, 2017",0


In [None]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
df['tokens'] = df['text'].apply(word_tokenize)

In [None]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [None]:
sia = SentimentIntensityAnalyzer()
df['sentiment_score'] = df['text'].apply(lambda x: sia.polarity_scores(x)['compound'])

In [None]:
df['title_length'] = df['title'].apply(len)
df['text_length'] = df['text'].apply(len)

In [None]:
dictionary = Dictionary(df['tokens'])
corpus = [dictionary.doc2bow(text) for text in df['tokens']]

In [None]:
lda = LdaModel(corpus=corpus, num_topics=5, id2word=dictionary)
df['topic_features'] = [lda.get_document_topics(corpus[i]) for i in range(len(corpus))]

In [None]:
tfidf = TfidfVectorizer(max_features=10000)
tfidf_matrix = tfidf.fit_transform(df['text'])

In [None]:
X = np.hstack((tfidf_matrix.toarray(),df[['sentiment_score', 'title_length', 'text_length']].values))
y = df['label']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

In [None]:
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"AUC-ROC: {auc}")

Accuracy: 0.9861666666666666
Precision: 0.9870357393132446
Recall: 0.9839329374781698
F1 Score: 0.9854818960993528
AUC-ROC: 0.9992663032495211


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [None]:
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    verbose=2
)


In [None]:
#grid_search.fit(X_train, y_train)
#best_model = grid_search.best_estimator_
#print("Best Parameters:", grid_search.best_params_)
#output: Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
est_model = RandomForestClassifier(
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=200,
    random_state=42
)

In [None]:
#using lightgbm for better performance

In [None]:
from lightgbm import LGBMClassifier

lgbm_model = LGBMClassifier(n_estimators=200, max_depth=10, learning_rate=0.1, random_state=42)
lgbm_model.fit(X_train, y_train)

y_pred = lgbm_model.predict(X_test)
y_prob = lgbm_model.predict_proba(X_test)[:, 1]
'''output of this:
[LightGBM] [Info] Number of positive: 11659, number of negative: 12341
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 5.373536 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 827726
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 9917
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.485792 -> initscore=-0.056849
[LightGBM] [Info] Start training from score -0.056849
'''


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Number of positive: 11659, number of negative: 12341
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 6.197569 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 827726
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 9917
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.485792 -> initscore=-0.056849
[LightGBM] [Info] Start training from score -0.056849




'output of this:\n[LightGBM] [Info] Number of positive: 11659, number of negative: 12341\n[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 5.373536 seconds.\nYou can set `force_col_wise=true` to remove the overhead.\n[LightGBM] [Info] Total Bins 827726\n[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 9917\n[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.485792 -> initscore=-0.056849\n[LightGBM] [Info] Start training from score -0.056849\n'

In [None]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"AUC-ROC: {auc}")

Accuracy: 0.9953333333333333
Precision: 0.994074590449634
Recall: 0.9961578763534754
F1 Score: 0.9951151430565248
AUC-ROC: 0.9998259704042798


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
train_idx, test_idx = train_test_split(df.index, test_size=0.2, random_state=42)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class NewsDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

train_dataset = NewsDataset(df['text'].iloc[train_idx].values, y_train.values)
test_dataset = NewsDataset(df['text'].iloc[test_idx].values, y_test.values)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    evaluation_strategy="epoch",
    save_steps=500,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10,
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
''' Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()
'''

'# Trainer\ntrainer = Trainer(\n    model=model,\n    args=training_args,\n    train_dataset=train_dataset,\n    eval_dataset=test_dataset\n)\n\n# Train model\ntrainer.train()\n'

In [None]:
!pip install flask




In [None]:
import joblib

joblib.dump(model, 'final_model.pkl')

joblib.dump(tfidf, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [None]:
from flask import Flask, request, jsonify
import joblib

model = joblib.load('final_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    data = request.json
    news_text = data.get('text', '')

    if not news_text:
        return jsonify({'error': 'No text provided'}), 400

    processed_text = vectorizer.transform([news_text])

    prediction = model.predict(processed_text)
    prediction_proba = model.predict_proba(processed_text)[0]

    response = {
        'prediction': int(prediction[0]),
        'confidence': {
            'fake': prediction_proba[0],
            'real': prediction_proba[1]
        }
    }
    return jsonify(response)

if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat
