In [10]:
import pandas as pd
import sklearn as sk
import nltk
# Make sure the NLTK data packages are available locally:
#   punkt     - pre-trained tokenizer data (used by word_tokenize)
#   stopwords - stop-word lists
for resource in ('punkt', 'stopwords'):
    download_ok = nltk.download(resource)
download_ok  # status of the last download, shown as the cell output

[nltk_data] Downloading package punkt to C:\Users\sachi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sachi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# Load the two source files from the working directory; each one becomes
# its own DataFrame so it can be labelled separately before merging.
df_fake, df_true = (pd.read_csv(csv_name) for csv_name in ("Fake.csv", "True.csv"))


In [12]:
## Tag each dataframe with its class, then stack both into one dataset:
# label 0 indicates fake news and label 1 indicates true news.

for frame, label_value in ((df_fake, 0), (df_true, 1)):
    frame['label'] = label_value
data = pd.concat([df_fake, df_true], ignore_index=True)


In [13]:
# Quick sanity check of the merged dataset: size, column names, first rows.
print(data.shape)
print(data.columns)
# head() has to be the cell's last expression: a bare expression in the middle
# of a cell is evaluated and thrown away, so the preview was never displayed.
data.head()


(44898, 5)
Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')


In [14]:
## Merge the columns 'title' and 'text' into a new column called 'content'
## so that both can be used for prediction.
# Missing values are replaced by '' first: concatenating NaN with a string
# yields NaN, and a NaN 'content' is not a string that can be tokenized later.
data['content'] = data['title'].fillna('') + ' ' + data['text'].fillna('')

In [15]:
import nltk
# Download the NLTK data packages this notebook relies on
# (already-present packages are reported as up-to-date).
for package in ('punkt_tab', 'punkt', 'stopwords'):
    status = nltk.download(package)
status  # result of the last download call, displayed as the cell output


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\sachi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\sachi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sachi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK data needed for cleaning, then keep the English stop words
# in a set for the membership checks done in clean_text.
for resource in ('punkt', 'stopwords'):   ## tokenizer data (used by word_tokenize); stop-word lists
    nltk.download(resource)
english_stopwords = stopwords.words('english')   ## the list of stopwords in english
stop_words = set(english_stopwords)
def clean_text(text):
    """Normalise raw text into a space-separated string of content words.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic (numbers, punctuation, mixed tokens) and tokens
    found in ``stop_words`` are dropped.

    Args:
        text: Raw text to clean. Anything that is not a ``str`` (for example
            ``None`` or the NaN pandas uses for a missing cell) is treated
            as empty text.

    Returns:
        str: The remaining tokens joined by single spaces; ``''`` when nothing
        is left or the input was not a string.
    """
    if not isinstance(text, str):
        # Missing values have no .lower(); treat them as empty documents
        # instead of raising in the middle of a column-wide apply().
        return ''
    return ' '.join(
        word
        for word in word_tokenize(text.lower())
        if word.isalpha() and word not in stop_words  # letters only, no stop words
    )
# Replace the raw 'content' text with its cleaned version, one row at a time.
cleaned_content = data['content'].apply(clean_text)
data['content'] = cleaned_content


[nltk_data] Downloading package punkt to C:\Users\sachi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sachi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer  ##Term Frequency - Inverse Document Frequency
## Turn the cleaned text into numeric TF-IDF features.
# NOTE(review): the vectorizer is fitted on the whole corpus here, before the
# train/test split done in a later cell, so its vocabulary and IDF weights are
# derived from the test rows as well - consider fitting on the training texts only.
y = data['label']                      ## Labels for the data, 0 for fake news and 1 for true news
max_document_share = 0.7               ## Ignore words that appear in more than 70% of the documents
vectorizer = TfidfVectorizer(max_df=max_document_share)
x = vectorizer.fit_transform(data['content'])   ## Fit the vectorizer and transform the text into vectors


In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
split = train_test_split(x, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split


In [23]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier with its default settings.
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression().fit(X_train, y_train)
model  # show the fitted estimator as the cell output


In [24]:
from sklearn.metrics import classification_report, confusion_matrix
# Score the held-out rows: confusion matrix first, then the per-class report.
y_pred = model.predict(X_test)
for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)

[[4670   63]
 [  75 4172]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      4733
           1       0.99      0.98      0.98      4247

    accuracy                           0.98      8980
   macro avg       0.98      0.98      0.98      8980
weighted avg       0.98      0.98      0.98      8980



In [25]:
def predict_news_headline(headline):
    """Classify one headline with the trained model and print the verdict.

    The headline goes through the same ``clean_text`` step and the already
    fitted ``vectorizer`` as the training data before ``model`` predicts its
    label (1 = real, 0 = fake).

    Args:
        headline (str): Headline text to classify.

    Returns:
        None. The verdict is printed.
    """
    cleaned = clean_text(headline)

    # transform() expects an iterable of documents, hence the one-element list.
    vectorized = vectorizer.transform([cleaned])

    if vectorized.nnz == 0:
        # No word of the headline is in the fitted vocabulary (empty input,
        # only stop words / digits / punctuation, or only unseen words).
        # The feature vector is all zeros, so any "prediction" would not
        # depend on the headline at all - report that instead of a verdict.
        print("This headline has no words the model knows, so it cannot be classified.")
        return

    prediction = model.predict(vectorized)

    verdict = "REAL" if prediction[0] == 1 else "FAKE"
    print(f"This news headline is likely **{verdict}**.")


In [31]:
# Interactive demo: ask for a headline on stdin and print the model's verdict.
prompt = "Enter a news headline to predict: "
input_headline = input(prompt)
predict_news_headline(input_headline)

This news headline is likely **REAL**.


Government announces new plan to fight inflation
