In [40]:
import spacy
import matplotlib.pyplot as plt
import seaborn as sn
# Load the large English spaCy pipeline once; preprocess_text() calls this
# module-level `nlp` object for every article.
nlp = spacy.load("en_core_web_lg")


In [41]:
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Location of the labelled news dataset. This is an absolute path on the
# author's machine, so it must be adjusted when running elsewhere.
NEWS_CSV_PATH = "C:/Users/User/Desktop/code files/NLP/news detection/news.csv"

df = pd.read_csv(NEWS_CSV_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [42]:
df.shape

(6335, 4)

In [43]:
# Encode the string labels as integers for the classifiers: FAKE -> 0, REAL -> 1.
label_to_id = {"FAKE": 0, "REAL": 1}
df["new_label"] = df["label"].map(label_to_id)

# Series.map produces NaN for any value that is missing from the mapping.
# Fail here with a clear message instead of letting NaN targets reach the
# estimators further down.
unmapped_labels = df.loc[df["new_label"].isna(), "label"].unique()
assert len(unmapped_labels) == 0, f"Unexpected label values: {unmapped_labels!r}"

In [44]:
df

Unnamed: 0.1,Unnamed: 0,title,text,label,new_label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,0
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,0
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,1
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,0
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,1
...,...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL,1
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE,0
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE,0
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL,1


In [45]:
df.shape

(6335, 5)

In [46]:
# DataFrame.drop returns a new frame, so the result has to be bound back to
# `df` for the column to actually be removed. errors="ignore" keeps the cell
# safe to re-run once the column is already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
df

Unnamed: 0,title,text,label,new_label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,0
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,0
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,1
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,0
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,1
...,...,...,...,...
6330,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL,1
6331,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE,0
6332,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE,0
6333,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL,1


Build a model with no text preprocessing

In [47]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the raw article texts for evaluation. A fixed random_state
# makes the split, and every score computed from it, repeatable across
# kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    df.text,
    df.new_label,
    test_size=0.2,
    random_state=42,
)

In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
# Baseline: TF-IDF features fed into a logistic-regression classifier.
model = Pipeline(
    steps=[
        ("tdif", TfidfVectorizer()),
        ("log", LogisticRegression()),
    ]
)

In [49]:
# Fit the TF-IDF + logistic-regression pipeline on the raw (unprocessed)
# training texts produced by the split above.
model.fit(x_train,y_train)

In [50]:
x_train

2242    Taking Social Security benefits early comes wi...
3573    The election in 232 photos, 43 numbers and 131...
153     Email This Week in the News \nYou wouldn’t kno...
5155    An old saying asserts that falsehoods come in ...
2860    Share on Facebook \nRepublican Donald Trump ha...
                              ...                        
4164    How Rich Candidates Try To Appeal To Working V...
5409    Let’s Be Clear – A Vote For Warmonger Hillary ...
4437    The nuclear deal that the United States and fi...
43      Homeless Woman Protects Trump’s Walk of Fame S...
5462    Brussels, Belgium (CNN) Police detained six pe...
Name: text, Length: 5068, dtype: object

In [51]:
from sklearn.metrics import classification_report

# Score the raw-text logistic-regression pipeline on the held-out articles.
y_pred = model.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.92      0.93      0.93       655
           1       0.93      0.92      0.92       612

    accuracy                           0.93      1267
   macro avg       0.93      0.92      0.92      1267
weighted avg       0.93      0.93      0.93      1267



In [52]:
from sklearn.naive_bayes import MultinomialNB


# Same TF-IDF features as the baseline, with a multinomial naive Bayes
# classifier in place of logistic regression. fit() returns the fitted
# pipeline itself, so construction and training can be chained.
clf = Pipeline(
    steps=[
        ("vectorizer_tfidf", TfidfVectorizer()),
        ("Multi NB", MultinomialNB()),
    ]
).fit(x_train, y_train)

# Evaluate on the same held-out raw texts as the baseline.
y_pred = clf.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.98      0.61      0.75       655
           1       0.70      0.99      0.82       612

    accuracy                           0.79      1267
   macro avg       0.84      0.80      0.79      1267
weighted avg       0.85      0.79      0.79      1267



Model with text preprocessing

Use text preprocessing to remove stop words and punctuation, and to apply lemmatization.

In [53]:
def preprocess_text(text):
    """Return `text` reduced to a space-separated string of lemmas.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are discarded; the lemma of every
    remaining token is kept in its original order.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

In [54]:
# Store the lemmatised, stop-word-free version of every article in a new column.
# NOTE(review): this calls the spaCy pipeline once per row on the full text, so
# it is presumably the slowest cell in the notebook — consider caching the result.
df['preprocessed_text'] = df.text.apply(preprocess_text)

In [None]:
# Hold out 20% of the preprocessed texts for evaluation. A fixed random_state
# makes the split, and the metrics reported from it, repeatable across
# kernel restarts.
x_tr, x_t, y_tr, y_t = train_test_split(
    df.preprocessed_text,
    df.new_label,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF + logistic regression again, this time trained on the preprocessed
# (lemmatised, stop-word-free) texts.
model2 = Pipeline(
    steps=[
        ("tdif", TfidfVectorizer()),
        ("log", LogisticRegression()),
    ]
)

model2.fit(x_tr, y_tr)


In [None]:
# Score the preprocessed-text logistic-regression pipeline on its held-out set.
y_pred2 = model2.predict(x_t)
print(classification_report(y_true=y_t, y_pred=y_pred2))

              precision    recall  f1-score   support

           0       0.88      0.95      0.91       619
           1       0.95      0.88      0.91       648

    accuracy                           0.91      1267
   macro avg       0.92      0.91      0.91      1267
weighted avg       0.92      0.91      0.91      1267



In [None]:
from sklearn.naive_bayes import MultinomialNB


# Multinomial naive Bayes on TF-IDF features of the preprocessed texts.
# fit() returns the fitted pipeline itself, so construction and training
# can be chained.
clf2 = Pipeline(
    steps=[
        ("vectorizer_tfidf", TfidfVectorizer()),
        ("Multi NB", MultinomialNB()),
    ]
).fit(x_tr, y_tr)

# Evaluate on the held-out preprocessed texts.
y_pred3 = clf2.predict(x_t)
print(classification_report(y_true=y_t, y_pred=y_pred3))

              precision    recall  f1-score   support

           0       0.98      0.76      0.86       619
           1       0.81      0.99      0.89       648

    accuracy                           0.88      1267
   macro avg       0.90      0.87      0.87      1267
weighted avg       0.89      0.88      0.87      1267



In [None]:
import joblib

# Persist the logistic-regression pipeline that was trained on preprocessed text.
# NOTE(review): model2 was fitted on preprocess_text() output, so code that loads
# model.joblib presumably needs to apply the same preprocessing before predict().
joblib.dump(model2,'model.joblib')

['model.joblib']

In [None]:
# Sanity-check a single hand-written headline.
# NOTE(review): this queries `model` (trained on raw text), not the `model2`
# pipeline that was just saved — confirm that this is the intended model.
sample_headline = "Bola Ahmed Tinubu won the election that was carried out in tghe year 2023"

# predict() returns an array holding one label per input. Take the single
# element explicitly rather than truth-testing a comparison on the whole
# array, and avoid the name `re`, which shadows the standard-library module.
predicted_label = model.predict([sample_headline])[0]
if predicted_label == 0:
    print("fake")
else:
    print("real")

fake
