<a href="https://colab.research.google.com/github/ToffertheCreator/colab_notebooks/blob/main/fake_news_detection_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install stopwordsiso



In [None]:
import stopwordsiso as stopwords
import numpy as np
import pandas as pd
import re
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Read the news CSV from the runtime's /content directory into a DataFrame.
DATASET_PATH = '/content/Fakenews_FilEng2.csv'
news_dataset = pd.read_csv(DATASET_PATH)

In [None]:
# Dimensions of the loaded frame as (rows, columns).
news_dataset.shape

(46464, 2)

In [None]:
# Preview the first rows to check the column layout.
news_dataset.head()

Unnamed: 0,Content,Label
0,Pollution caused by traditional cooking fuel i...,0
1,Justice Secretary Vitaliano Aguirre 2nd and Ph...,0
2,President Rodrigo Duterte on Monday night desc...,0
3,THE militant fisher folk group Pambansang Laka...,0
4,Magdalo Rep. Gary Alejano is willing to lead t...,0


In [None]:
# Count missing values in each column.
news_dataset.isnull().sum()

Unnamed: 0,0
Content,0
Label,0


In [None]:
# Collect every row that has at least one missing value and show them.
has_missing = news_dataset.isna().any(axis=1)
rows_with_nan = news_dataset.loc[has_missing]

print(rows_with_nan)

Empty DataFrame
Columns: [Content, Label]
Index: []


In [None]:
# Target labels, and the remaining column(s) as the feature frame.
Y = news_dataset['Label']
X = news_dataset.drop(columns=['Label'])

In [None]:
# Inspect the feature frame, then the label series.
print(X)
print(Y)

                                                 Content
0      Pollution caused by traditional cooking fuel i...
1      Justice Secretary Vitaliano Aguirre 2nd and Ph...
2      President Rodrigo Duterte on Monday night desc...
3      THE militant fisher folk group Pambansang Laka...
4      Magdalo Rep. Gary Alejano is willing to lead t...
...                                                  ...
46459  Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
46460  Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma...
46461  Michael J. de la Merced and Rachel Abrams Macy...
46462  Alex Ansary NATO, Russia To Hold Parallel Exer...
46463            David Swanson What Keeps the F-35 Alive

[46464 rows x 1 columns]
0        0
1        0
2        0
3        0
4        0
        ..
46459    0
46460    0
46461    0
46462    1
46463    1
Name: Label, Length: 46464, dtype: int64


In [None]:
# Single Porter stemmer instance, used by stemming() for every token.
port_stem = PorterStemmer()

In [None]:
_STOP_WORDS = None  # combined stop-word set, built on the first stemming() call


def stemming(content):
    """Normalise one article into a space-separated string of stemmed tokens.

    Every character outside a-z/A-Z is replaced by a space, the text is
    lower-cased and split on whitespace, tokens found in the combined
    'en' + 'tl' stop-word set are dropped, and the remaining tokens are
    stemmed with the module-level ``port_stem``.

    Args:
        content: Raw article text (str).

    Returns:
        str: The surviving stems joined by single spaces; an empty string
        when no token survives.
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # The stop-word set does not depend on the token being tested, so it
        # is built once here instead of once per token inside the loop.
        _STOP_WORDS = frozenset(stopwords.stopwords(['en', 'tl']))

    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    return ' '.join(
        port_stem.stem(token) for token in tokens if token not in _STOP_WORDS
    )

In [None]:
# Overwrite each article with its cleaned, stemmed form (the raw text is replaced in place).
stemmed_articles = news_dataset['Content'].map(stemming)
news_dataset['Content'] = stemmed_articles

In [None]:
# Show the Content column after stemming.
print(news_dataset['Content'])

0        pollut caus tradit cook fuel kill peopl philip...
1        justic secretari vitaliano aguirr philippin ch...
2        presid rodrigo dutert monday night silli charg...
3        milit fisher folk pambansang laka kilusang mam...
4        magdalo rep gari alejano lead charg philippin ...
                               ...                        
46459    jerom hudson rapper trump poster child white s...
46460    benjamin hoffman playoff schedul matchup odd y...
46461    michael merc rachel abram maci receiv takeov a...
46462    alex ansari nato russia hold parallel exercis ...
46463                                   david swanson aliv
Name: Content, Length: 46464, dtype: object


In [None]:
# Extract the stemmed text and the labels as raw value arrays.
X, Y = (news_dataset[column].values for column in ('Content', 'Label'))

In [None]:
# Print the array of stemmed article strings.
print(X)

['pollut caus tradit cook fuel kill peopl philippin rate countri asia pacif region health organ philippin earn distinct region biggest death indoor air pollut report data gather health agenc close death filipino record indoor household air pollut philippin lao death popul record philippin rank death outdoor air pollut china record death mongolia air pollut lethal environment health threat region peopl middl incom countri rate incom countri region director western pacif dr shin soo statement wednesday kerosen woodth death indoor air pollut philippin kerosen wood stove fire lamp women children risk household air pollut percent popul access clean cook fuel technolog home outdoor air pollut caus ineffici energi household industri agricultur transport sector coal fire power plant air qualiti influenc geograph meteorolog season factor danger highair pollut level asia remain danger agenc estim peopl worldwid breath air level pollut prematur death air pollut western pacif region includ philipp

In [None]:
# Print the label array.
print(Y)

[0 0 0 ... 0 1 1]


In [None]:
# Shape of the label array.
Y.shape

(46464,)

In [None]:
# Fit TF-IDF on the training rows only, so the vocabulary and IDF weights are
# not influenced by the articles that are later used for evaluation.
# The split arguments here must stay identical to the train_test_split call
# applied to the vectorised matrix further down: with the same sample count,
# labels and seed, the split selects the same rows.
train_idx, held_out_idx = train_test_split(
    np.arange(len(X)), test_size=0.1, stratify=Y, random_state=2
)

vectorizer = TfidfVectorizer()
vectorizer.fit(X[train_idx])

# Every row (train and test) is encoded with the train-fitted vocabulary.
X = vectorizer.transform(X)

In [None]:
import joblib

# Persist the fitted TF-IDF vectorizer to disk.
VECTORIZER_PATH = 'tfidf_vectorizer.joblib'
joblib.dump(vectorizer, VECTORIZER_PATH)

['tfidf_vectorizer.joblib']

In [None]:
# Print the vectorised matrix; the output lists (row, column) entries with their weights.
print(X)

  (0, 105966)	0.037262175225066196
  (0, 105852)	0.06812264738685679
  (0, 105838)	0.04452634471881445
  (0, 105804)	0.027367831838276242
  (0, 105145)	0.09441707866428634
  (0, 104931)	0.024595008821530387
  (0, 99284)	0.05762352836162504
  (0, 99102)	0.03105951498915965
  (0, 97962)	0.013422066337129035
  (0, 97634)	0.02821390314405328
  (0, 96593)	0.031097302359229733
  (0, 93438)	0.06812264738685679
  (0, 93437)	0.044328581203286754
  (0, 93273)	0.05570459706669188
  (0, 92472)	0.02028491932204797
  (0, 91481)	0.061351789164925834
  (0, 91137)	0.02187402325143066
  (0, 89246)	0.061351789164925834
  (0, 88069)	0.08657709444779971
  (0, 87955)	0.03218016810827378
  (0, 84472)	0.05849040228252333
  (0, 83800)	0.02919488983532648
  (0, 83431)	0.02256481207777118
  (0, 83384)	0.017790808002820395
  (0, 83140)	0.022930502150285283
  :	:
  (46460, 37361)	0.45694671664440273
  (46460, 9472)	0.3021357296439001
  (46461, 106751)	0.1337157155751413
  (46461, 97962)	0.09959769977920305
  (4646

In [None]:
# Hold out 10% of the samples for testing, stratified on the labels, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    stratify=Y,
    random_state=2,
)

In [None]:
# Logistic-regression classifier constructed with its default settings.
model = LogisticRegression()

In [None]:
# Train the classifier on the training split.
model.fit(X_train, Y_train)

In [None]:
# Accuracy on the same rows the model was fitted on.
X_train_prediction = model.predict(X_train)
# accuracy_score takes (y_true, y_pred) in that order.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9645359542769687


In [None]:
# Accuracy on the held-out test rows.
X_test_prediction = model.predict(X_test)
# accuracy_score takes (y_true, y_pred) in that order.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9489993544222078


In [None]:
import joblib

# Persist the fitted logistic-regression classifier to disk.
MODEL_PATH = 'logistic_regression_fakenews.joblib'
joblib.dump(model, MODEL_PATH)

['logistic_regression_fakenews.joblib']

In [None]:
# Random forest of 100 trees with a fixed seed.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

# Fit the forest on the training split.
rf.fit(X_train, Y_train)

In [None]:
# Random-forest predictions for the held-out test rows.
y_pred = rf.predict(X_test)

In [None]:
# Overall accuracy of the current predictions against the held-out labels.
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision, recall and F1.
report = classification_report(Y_test, y_pred)
print("Classification Report:")
print(report)

Accuracy: 0.94
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      2680
           1       0.96      0.90      0.93      1967

    accuracy                           0.94      4647
   macro avg       0.95      0.94      0.94      4647
weighted avg       0.95      0.94      0.94      4647



In [None]:
# Hyper-parameters for the gradient-boosted classifier.
xgb_params = dict(
    n_estimators=300,    # boosting rounds
    learning_rate=0.01,  # shrinkage applied to each round
    max_depth=10,        # depth limit per tree
    random_state=2,      # seed
)
xgb_model = XGBClassifier(**xgb_params)

# Fit on the training split.
xgb_model.fit(X_train, Y_train)

In [None]:
# XGBoost predictions for the held-out test rows (rebinds y_pred, replacing the earlier predictions).
y_pred = xgb_model.predict(X_test)

In [None]:
# Overall accuracy of the current predictions against the held-out labels.
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Heading followed by the per-class precision / recall / F1 table.
print("Classification Report:", classification_report(Y_test, y_pred), sep="\n")

Accuracy: 0.89
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.87      0.90      5359
           1       0.84      0.92      0.88      3934

    accuracy                           0.89      9293
   macro avg       0.89      0.89      0.89      9293
weighted avg       0.89      0.89      0.89      9293

