In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [3]:
true = pd.read_csv('True.csv')

In [5]:
fake = pd.read_csv('Fake.csv')

In [7]:
true['Lable'] = 1
fake['lable'] = 0

In [9]:
true.head()

Unnamed: 0,title,text,subject,date,Lable
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [11]:
news = pd.concat([fake, true], axis = 0)

In [13]:
news.tail()

Unnamed: 0,title,text,subject,date,lable,Lable
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",,1.0
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",,1.0
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",,1.0
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",,1.0
21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",,1.0


In [15]:
news.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44898 entries, 0 to 21416
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   title    44898 non-null  object 
 1   text     44898 non-null  object 
 2   subject  44898 non-null  object 
 3   date     44898 non-null  object 
 4   lable    23481 non-null  float64
 5   Lable    21417 non-null  float64
dtypes: float64(2), object(4)
memory usage: 2.4+ MB


In [17]:
news = news.drop(['title','subject','date','lable'],axis=1)

In [19]:
news.head()

Unnamed: 0,text,Lable
0,Donald Trump just couldn t wish all Americans ...,
1,House Intelligence Committee Chairman Devin Nu...,
2,"On Friday, it was revealed that former Milwauk...",
3,"On Christmas day, Donald Trump announced that ...",
4,Pope Francis used his annual Christmas Day mes...,


In [21]:
news = news.sample(frac=1)    #mixing the rows(Reshuffling)

In [23]:
news.reset_index(inplace = True)

In [25]:
news.head()

Unnamed: 0,index,text,Lable
0,1563,"WASHINGTON (Reuters) - Mitch McConnell, the U....",1.0
1,1815,There have been weeks of reports that White Ho...,
2,21249,This is a MUST watch from start to finish. The...,
3,15216,"BENGHAZI, Libya (Reuters) - Eastern Libyan for...",1.0
4,2743,CNN viewers will no longer be entertained by t...,


In [27]:
news.drop(['index'],axis = 1, inplace = True)

In [29]:
news.sample(15)

Unnamed: 0,text,Lable
20361,WASHINGTON (Reuters) - A Romanian hacker nickn...,1.0
41181,WASHINGTON (Reuters) - The White House will sa...,1.0
4829,WASHINGTON (Reuters) - U.S. Treasury Secretary...,1.0
36665,Reality show star turned president-elect Donal...,
33417,Donald Trump spent months on the campaign trai...,
26538,NEW YORK (Reuters) - New Jersey Governor Chris...,1.0
3940,"Things are not looking good for Donald Trump, ...",
40900,Rep. Markwayne Mullin (R-Okla.) held a town ha...,
39193,A Louisiana state representative introduced le...,
3396,Former U.S. Attorney Joseph diGenova slammed F...,


In [31]:
news['Lable'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  news['Lable'].fillna(0, inplace=True)


In [33]:
news

Unnamed: 0,text,Lable
0,"WASHINGTON (Reuters) - Mitch McConnell, the U....",1.0
1,There have been weeks of reports that White Ho...,0.0
2,This is a MUST watch from start to finish. The...,0.0
3,"BENGHAZI, Libya (Reuters) - Eastern Libyan for...",1.0
4,CNN viewers will no longer be entertained by t...,0.0
...,...,...
44893,William McGurn Wall Street JournalLet s get t...,0.0
44894,This is a big deal and should be investigated....,0.0
44895,Republican Congresswoman Marsha Blackburn (TN)...,0.0
44896,ANKARA (Reuters) - A spokesman for Turkish Pre...,1.0


In [35]:
def preprocessing(text):
    text = text.lower()    #converting to lowercase
    text = re.sub(r'https?://\S+|www\.\S+','',text)   #remove urls
    text = re.sub(r'<.*?>', '', text)  #Remove HTML tags 
    text = re.sub(r'[^\w\s]', '', text)  #Remove punctuation
    text = re.sub(r'\d','', text)  #Remove digits
    text = re.sub(r'\n','', text) #Remove newline characters
    return text


In [37]:
news['text']= news['text'].apply(preprocessing)

In [38]:
news['text']

0        washington reuters  mitch mcconnell the us sen...
1        there have been weeks of reports that white ho...
2        this is a must watch from start to finish ther...
3        benghazi libya reuters  eastern libyan forces ...
4        cnn viewers will no longer be entertained by t...
                               ...                        
44893     william mcgurn wall street journallet s get t...
44894    this is a big deal and should be investigated ...
44895    republican congresswoman marsha blackburn tn d...
44896    ankara reuters  a spokesman for turkish presid...
44897    budapest reuters  hungary is not planning to m...
Name: text, Length: 44898, dtype: object

In [39]:
X = news['text']
Y = news['Lable']

In [42]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.3)

In [45]:
Y_train.info()

<class 'pandas.core.series.Series'>
Index: 31428 entries, 7109 to 31786
Series name: Lable
Non-Null Count  Dtype  
--------------  -----  
31428 non-null  float64
dtypes: float64(1)
memory usage: 491.1 KB


In [47]:
X_train.shape

(31428,)

In [49]:
vectorization = TfidfVectorizer()

In [51]:
XV_train = vectorization.fit_transform(X_train)

In [52]:
XV_train

<31428x174712 sparse matrix of type '<class 'numpy.float64'>'
	with 6447025 stored elements in Compressed Sparse Row format>

In [53]:
XV_test = vectorization.transform(X_test)

In [54]:
XV_test

<13470x174712 sparse matrix of type '<class 'numpy.float64'>'
	with 2733475 stored elements in Compressed Sparse Row format>

In [55]:
LR = LogisticRegression()

In [56]:
LR.fit(XV_train, Y_train)

In [57]:
prediction = LR.predict(XV_test)

In [58]:
score = LR.score(XV_test,Y_test)

In [59]:
score

0.9887156644394952

In [60]:
print(classification_report(Y_test,prediction))

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      7000
         1.0       0.99      0.99      0.99      6470

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
DTC = DecisionTreeClassifier()

In [None]:
DTC.fit(XV_train, Y_train)

In [None]:
prediction_dtc = DTC.predict(XV_test)

In [None]:
DTC.score(XV_test, Y_test)

In [None]:
print(classification_report(Y_test, prediction_dtc))

In [97]:
def classif_nesw(n):
    if n ==0:
        return "It is a fake news"
    else:
        return "It is a good news"

In [105]:
def manual_testing (news):
    testing_news = {"text": [news]} # Corrected syntax for defining dictionary
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(preprocessing)
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test) # Assuming 'vectorization' is your vectorizer object pred_1r = LR.predict(new_xv_test)
    # pred_dtc = dtc.predict(new_xv_test)
    return "\n\nLR Prediction: {}".format( classif_nesw (prediction[0]))

In [107]:
new_article = "Sources within the government have revealed a secret plan to replace all physical currency with digital chips embedded in citizens' bodies. The initiative is claimed to be part of a larger agenda to control and monitor financial transactions more closely. Experts have raised alarms about privacy violations and the potential for government overreach."

In [109]:
manual_testing(new_article)

'\n\nLR Prediction: It is a good news'