In [None]:
import csv
import pandas as pd

In [None]:
# Read the genuine-news and fake-news datasets from their CSV files.
truth = pd.read_csv(filepath_or_buffer='True.csv')
fake = pd.read_csv(filepath_or_buffer='Fake.csv')
# Preview the first five rows (five is head()'s default).
truth.head(n=5)

In [None]:
# Preview the first five rows of the fake-news frame.
fake.head(n=5)

In [None]:
# Tag each frame with its class label: 1 for rows from True.csv, 0 for rows from Fake.csv.
truth['label'], fake['label'] = 1, 0
truth.head()

In [None]:
# Stack both labelled frames on top of each other (axis=0 concatenates row-wise)
# so genuine and fake articles live in one DataFrame.
news = pd.concat(objs=[truth, fake], axis=0)

news.head()

In [None]:
# Count the missing values in each column.
news.isna().sum()

In [None]:
# The title, subject and date columns are not needed for this project,
# so keep only the remaining columns.
news = news.drop(columns=['title', 'subject', 'date'])
news.head()

In [None]:
# Shuffle the rows so genuine and fake articles are interleaved.
# frac=1 samples 100% of the rows, i.e. a full reshuffle.
# random_state pins the shuffle so "Restart & Run All" reproduces the same row order.
news = news.sample(frac=1, random_state=42)
news.head()

In [None]:
# Rebuild a fresh 0..n-1 index after shuffling. The previous index is kept
# as a regular column named 'index' by reset_index().
news = news.reset_index()
news.head()

In [None]:
# The pre-shuffle index stored in the 'index' column is no longer meaningful; discard it.
news = news.drop(columns=['index'])
news.head()

In [None]:
#Machine-learning models work on numbers rather than raw text, so the articles must be converted into numerical data.
#Converting the text into numerical vectors also lets us assign more weight to the important words.
#This makes the text readable by the machine.

In [None]:
import re
#The re module in Python is used for working with regular expressions. It helps you find, replace, and manipulate text patterns efficiently

In [None]:
def wordrop(text):
    """Clean a raw article string so it is ready for vectorisation.

    The text is lower-cased and then passed through a fixed sequence of
    regular-expression substitutions: URLs, HTML tags, digits and
    punctuation are deleted, and each run of newlines becomes one space.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        The cleaned text.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r"https?://\S+|www\.\S+", ""),  # URLs
        (r'<.*?>', ''),                   # HTML tags
        (r'\d+', ''),                     # digits
        (r'[^\w\s]', ''),                 # punctuation
        (r'\n+', ' '),                    # newline runs -> single space
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

    
   

In [None]:
# Replace every article in the 'text' column with its cleaned version.
news['text'] = news['text'].map(wordrop)

In [None]:
# Display the 'text' column after cleaning.
news['text']

In [None]:
# x holds the article text (features); y holds the 0/1 class label (target).
x, y = news['text'], news['label']

In [None]:
# Display the feature Series (the 'text' column).
x

In [None]:
# Display the target Series (the 'label' column).
y

In [None]:
from sklearn.model_selection import train_test_split     #function for splitting datasets into training and testing sets!

In [None]:
# Hold out 30% of the data for testing and train on the remaining 70%.
# random_state pins the split so every run evaluates on the same rows.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [None]:
# Shape of the training features.
x_train.shape

In [None]:
# Shape of the test features.
x_test.shape

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer 
#used for converting text into numerical values using the TF-IDF (Term Frequency-Inverse Document Frequency) technique.


In [None]:
# Create a TF-IDF vectoriser with its default settings.
vectorization = TfidfVectorizer()

In [None]:
# Fit the vectoriser on the training text only, and transform that text in the same step.
xv_train = vectorization.fit_transform(x_train)

In [None]:
# Transform the test text with the already-fitted vectoriser (no re-fitting on test data).
xv_test = vectorization.transform(x_test)

In [None]:
# Display the vectorised training features.
xv_train

In [None]:
# Display the vectorised test features.
xv_test


In [None]:
#Creating a ML model, algorithm over this data
#Logistic Regression,  a statistical model commonly used for classification tasks in machine learning
from sklearn.linear_model import LogisticRegression
#predicts categories, uses probability based approach

In [None]:
# Create the logistic regression model with the "saga" solver, chosen here for the CSR-format TF-IDF input.
# saga shuffles the data while optimising, so random_state is pinned to make the fit repeatable.
LR = LogisticRegression(solver="saga", random_state=42)
LR.fit(xv_train, y_train)

In [None]:
# Predict labels for the test set with the fitted logistic regression model.
pred_lr = LR.predict(xv_test)

In [None]:
# Score the logistic regression model on the held-out test data.
LR.score(xv_test, y_test)

In [None]:
from sklearn.metrics import classification_report
# Build the classification report for the logistic regression predictions, then print it.
report_lr = classification_report(y_test, pred_lr)
print(report_lr)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# It builds a tree-like structure to decide which category a given input belongs to.
# Fit a decision tree on the vectorised training data.
# random_state pins the tree's internal randomness so repeated runs build the same tree.
DTC = DecisionTreeClassifier(random_state=42)
DTC.fit(xv_train, y_train)

In [None]:
# Predict labels for the test set with the fitted decision tree.
pred_dtc = DTC.predict(xv_test)

In [None]:
# Score the decision tree on the held-out test data.
DTC.score(xv_test, y_test)

In [None]:
from sklearn.metrics import classification_report
# Build the classification report for the decision tree predictions, then print it.
report_dtc = classification_report(y_test, pred_dtc)
print(report_dtc)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# combines multiple decision trees to improve accuracy and reduce overfitting. It works by creating a "forest" of decision trees and averaging their predictions.


In [None]:
# Fit a random forest on the vectorised training data and predict the test set.
# random_state pins the forest's sampling so repeated runs give the same model.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(xv_train, y_train)
predict_rfc = rfc.predict(xv_test)

In [None]:
# Score the random forest on the held-out test data.
rfc.score(xv_test, y_test)

In [None]:
# Build the classification report for the random forest predictions, then print it.
report_rfc = classification_report(y_test, predict_rfc)
print(report_rfc)

In [None]:
#another algorithm
from sklearn.ensemble import GradientBoostingClassifier
#ensemble learning technique that builds multiple weak models (typically decision trees) and boosts their performance by focusing on hard-to-classify examples.

# Fit a gradient boosting classifier on the vectorised training data.
# random_state pins the model's internal randomness so repeated runs match.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(xv_train, y_train)

In [None]:
# Predict labels for the test set by calling the fitted estimator's predict method.
# The variable keeps the spelling `preed_gbc` because a later cell reads that exact name.
preed_gbc = gbc.predict(xv_test)
# Score the gradient boosting model on the held-out test data.
gbc.score(xv_test, y_test)

In [None]:
# Build the classification report for the gradient boosting predictions, then print it.
report_gbc = classification_report(y_test, preed_gbc)
print(report_gbc)

In [None]:
def output_label(n):
    """Translate a numeric class label into a human-readable verdict.

    Parameters
    ----------
    n
        Predicted label; 0 means fake, 1 means genuine.

    Returns
    -------
    str or None
        The verdict sentence, or None when ``n`` equals neither 0 nor 1.
    """
    verdicts = (
        (0, "It is a Fake News"),
        (1, "It is a Genuine News"),
    )
    for label, message in verdicts:
        if n == label:
            return message
    # Any other value has no verdict.
    return None

In [None]:
def manual_testing(news):
    """Classify one article with the LR, GBC and RFC models.

    The article is cleaned with ``wordrop``, vectorised with the
    already-fitted ``vectorization`` object, and passed to each model.

    Parameters
    ----------
    news : str
        Raw text of the article to classify.

    Returns
    -------
    str
        A multi-line summary with one verdict per model.
    """
    # transform() takes a collection of documents, so wrap the single cleaned article in a list.
    article_vector = vectorization.transform([wordrop(news)])
    # Model order matches the placeholders in the summary template: LR, GBC, RFC.
    verdicts = [output_label(model.predict(article_vector)[0]) for model in (LR, gbc, rfc)]
    return "\n\nLR Prediction: {}  \nGBC Prediction:  {}  \nRFC  Prediction:  {}".format(*verdicts)

In [None]:
# Ask the user for an article; input() already returns a str.
news_article = input("Enter news article")


In [None]:
# The summary string contains newline characters; print it so they render as
# line breaks instead of appearing as escaped "\n" in the cell's repr output.
print(manual_testing(news_article))