In [1]:
import numpy as np
import pandas as pd

In [10]:
true_data = pd.read_csv("True.csv")
fake_data = pd.read_csv("Fake.csv")
#true_data.head()
#false_data.head()

In [11]:
true_data['label'] = 1
fake_data['label'] = 0


In [79]:
news = pd.concat([fake_data, true_data], axis = 0) # row wise concatenation
#news.tail()

In [80]:
#news.isnull().sum()  #checking if there is any empty column

news = news.drop(['title', 'subject', 'date'], axis=1) # drop the columns which are of no use
#news.head()

In [81]:
news = news.sample(frac = 1) #reshuffling the data as the first half contains only fake and the other true
#news.head()  

# the index also gets reshuffled
news.reset_index(inplace = True)
#news.head()


In [82]:
# dropping index column
news.drop(['index'],axis = 1, inplace = True)
news.head()

Unnamed: 0,text,label
0,"Max Gracia, a 22-year-old resident of Orlando,...",0
1,This ad is fantastic and really nails the fail...,0
2,"According to an Associated Press source, Micha...",0
3,WASHINGTON (Reuters) - U.S. intelligence agen...,1
4,"When asked why they might have booed, he specu...",0


In [83]:
import re
from sklearn.model_selection import train_test_split

In [84]:
def wordopt(text):
    text = text.lower()
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove non-alphanumeric characters
    text = re.sub(r'[^\w\s]', '', text)

    # Remove digits
    text = re.sub(r'\d', '', text)

    # Replace newline characters with a space followed by a newline character
    text = re.sub(r'\n', ' \n', text)

    return text

In [85]:
news['text'] = news['text'].apply(wordopt)

In [86]:
news['text']

0        max gracia a yearold resident of orlando flori...
1        this ad is fantastic and really nails the fail...
2        according to an associated press source michae...
3         washington reuters  us intelligence agencies ...
4        when asked why they might have booed he specul...
                               ...                        
44893    washington dc   may day is the day for commies...
44894    plimpton  who starred in the goonies when she ...
44895    naypyitaw reuters  a myanmar staterun newspape...
44896    london reuters  iran s revolutionary guards ha...
44897    st century wire says does the american ideal o...
Name: text, Length: 44898, dtype: object

In [87]:
X = news['text']
Y = news['label']

In [88]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [89]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2)
#X_train.shape
#X_test.shape

vectorizer = TfidfVectorizer()  # used to add importance to each words based on its repition on positive or negative texts
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test) 
#print(X_train) # (row,word) value assigned
print(X_test) 


  (0, 186237)	0.027286257658288314
  (0, 185577)	0.03175565920762033
  (0, 184932)	0.0708794348130615
  (0, 184342)	0.03305205249097392
  (0, 184021)	0.018537990538467143
  (0, 183133)	0.01641061079051223
  (0, 182516)	0.011366405724816531
  (0, 182169)	0.023932086754744033
  (0, 182085)	0.03935634266853269
  (0, 181926)	0.08029235186892948
  (0, 181816)	0.015137501441525569
  (0, 181771)	0.012369298155925763
  (0, 181650)	0.012693985520434506
  (0, 181385)	0.011966459077442539
  (0, 181018)	0.017516714221792167
  (0, 179510)	0.043975567157883165
  (0, 178457)	0.0816909655280007
  (0, 176896)	0.02605319112405885
  (0, 176839)	0.016208395585938798
  (0, 175014)	0.02903592278522587
  (0, 174901)	0.03848254807443153
  (0, 174863)	0.012684694613119033
  (0, 174754)	0.021622629022321913
  (0, 174104)	0.015233299076023058
  (0, 172963)	0.0581037202066417
  :	:
  (8979, 21069)	0.05576267300393379
  (8979, 20913)	0.032658016716440065
  (8979, 19872)	0.05205437231497777
  (8979, 15816)	0.025347

In [90]:
from sklearn.linear_model import LogisticRegression


In [91]:
#Training the Machine Learning Model (Logistic Regression)
model = LogisticRegression(max_iter = 1000) # max 1000 time the model will go through data
model.fit(X_train, Y_train)

In [92]:
from sklearn.metrics import accuracy_score

In [93]:
#Model Evaluation
#accuracy score on the training data
X_train_prediction = model.predict(X_train)
training_accuracy = accuracy_score(Y_train, X_train_prediction)
print("Accuracy: ", training_accuracy)

Accuracy:  0.9930675427362325


In [94]:
#accuracy score on the testing data
X_test_prediction = model.predict(X_test)
testing_accuracy = accuracy_score(Y_test, X_test_prediction)
print("Accuracy: ", testing_accuracy)  

Accuracy:  0.988641425389755


In [95]:
#making use of another ML model than logistic regression
# Decision tree Classifier
from sklearn.tree import DecisionTreeClassifier
DTC = DecisionTreeClassifier()

In [96]:
DTC.fit(X_train, Y_train)

In [97]:
#accuracy score on the testing data
X_test_prediction = DTC.predict(X_test)
testing_accuracy = accuracy_score(Y_test, X_test_prediction)
print("Accuracy: ", testing_accuracy)  

Accuracy:  0.9948775055679288


In [98]:
# making use of random forest classifier
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()



In [99]:
rfc.fit(X_train, Y_train)

In [101]:
X_test_prediction = rfc.predict(X_test)
testing_accuracy = accuracy_score(Y_test,  X_test_prediction)
print("Accuracy: ", testing_accuracy)

Accuracy:  0.9902004454342984


In [102]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()

In [103]:
gbc.fit(X_train, Y_train)

In [105]:
X_test_prediction = gbc.predict(X_test)
testing_accuracy = accuracy_score(Y_test,  X_test_prediction)
print("Accuracy: ", testing_accuracy)

Accuracy:  0.9953229398663697


In [106]:
def output_label(n):
    if n==0:
        return "It is a Fake News"
    elif n==1:
        return "It is a Genuine News"

In [117]:
def manual_testing(news):
    # Step 1: Create a dictionary with a single key 'text' and value 'news'
    testing_news = {"text": [news]}
    
    # Step 2: Convert the dictionary into a DataFrame
    new_def_test = pd.DataFrame(testing_news)
    
    # Step 3: Apply the wordopt function to preprocess the 'text' column
    new_def_test["text"] = new_def_test["text"].apply(wordopt)

    # Step 4: Extract the preprocessed 'text' column as 'new_x_test'
    new_x_test = new_def_test["text"]

    # Step 5: Transform the preprocessed text into a vector using a vectorizer object (assuming 'vectorizer' is your vectorizer object)

    new_xv_test = vectorizer.transform(new_x_test)

    # Step 6: Predict the labels for the preprocessed text using a logistic regression model (assuming 'LR' is your logistic regression model)
    pred_lr = model.predict(new_xv_test)

    # Step 7: Predict the labels for the preprocessed text using a random forest classifier model (assuming 'rfc' is your random forest classifier model)
    pred_rfc = rfc.predict(new_xv_test)

    # Step 8: Return the predictions as a formatted string
    return "\n\nLR Prediction: {} \nRFC Prediction: {}".format(output_label(pred_lr[0]), output_label(pred_rfc[0]))


In [118]:
news_article = str(input())

 Share a certain post of Bill Gates on Facebook and he will send you money. "Hey Facebook, As some of you may know, I'm Bill Gates. If you click that share link, I will give you $5,000. I always deliver, I mean, I brought you Windows XP, right?"


In [119]:
manual_testing(news_article)

'\n\nLR Prediction: It is a Fake News \nRFC Prediction: It is a Fake News'