# **Fake news detection**

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Loading the dataset
news=pd.read_csv('newspapers.csv')

In [3]:
news.shape

(72134, 4)

In [4]:
news.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


**Data pre-proccessing**

In [5]:
# replacing the null values in dataset empty string
new = news.fillna('')

In [6]:
#we will be working on titles of the news as reading entire article is too much data
news['input']=news['title']
print(news['input'])

0        LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1                                                      NaN
2        UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3        Bobby Jindal, raised Hindu, uses story of Chri...
4        SATAN 2: Russia unvelis an image of its terrif...
                               ...                        
72129    Russians steal research on Trump in hack of U....
72130     WATCH: Giuliani Demands That Democrats Apolog...
72131    Migrants Refuse To Leave Train At Refugee Camp...
72132    Trump tussle gives unpopular Mexican leader mu...
72133    Goldman Sachs Endorses Hillary Clinton For Pre...
Name: input, Length: 72134, dtype: object


In [7]:
# seperating data and label
X = news.drop(columns='label',axis=1)
Y = news['label']

In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Vamsi
[nltk_data]     M\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# printing the stopwords in English
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [10]:
# Stemming:converting word to its root word
port_stem = PorterStemmer()

In [11]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [12]:
news['input'] = news['input'].fillna('')
news['input'] = news['input'].apply(stemming)

In [13]:
print(news['input'])

0        law enforc high alert follow threat cop white ...
1                                                         
2        unbeliev obama attorney gener say charlott rio...
3        bobbi jindal rais hindu use stori christian co...
4        satan russia unv imag terrifi new supernuk wes...
                               ...                        
72129    russian steal research trump hack u democrat p...
72130    watch giuliani demand democrat apolog trump ra...
72131         migrant refus leav train refuge camp hungari
72132    trump tussl give unpopular mexican leader much...
72133           goldman sach endors hillari clinton presid
Name: input, Length: 72134, dtype: object


In [14]:
X = news['input'].values
Y = news['label'].values

In [15]:
print(X)

['law enforc high alert follow threat cop white blacklivesmatt fyf terrorist video'
 ''
 'unbeliev obama attorney gener say charlott rioter peac protest home state north carolina video'
 ... 'migrant refus leav train refuge camp hungari'
 'trump tussl give unpopular mexican leader much need shot arm'
 'goldman sach endors hillari clinton presid']


In [16]:
print(Y)

[1 1 1 ... 0 0 1]


In [17]:
Y.shape

(72134,)

In [18]:
# converting the textual data to numerical data
vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X = vectorizer.transform(X)

In [19]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 629110 stored elements and shape (72134, 19639)>
  Coords	Values
  (0, 407)	0.3190180925014663
  (0, 1802)	0.33473541566384035
  (0, 3679)	0.24871262252022117
  (0, 5509)	0.31820565801047196
  (0, 6425)	0.28932771754845743
  (0, 6730)	0.48553136502134386
  (0, 7887)	0.26746434949988324
  (0, 9699)	0.22829788917209384
  (0, 17260)	0.24871262252022117
  (0, 17363)	0.2542650376115143
  (0, 18648)	0.1297506867782943
  (0, 19106)	0.19134939529376566
  (2, 1049)	0.28404017886581956
  (2, 2673)	0.30809679188606154
  (2, 2919)	0.3639616996972358
  (2, 6880)	0.2652283770602196
  (2, 8020)	0.2692285294185893
  (2, 11864)	0.2231406266784195
  (2, 12011)	0.16878852994653004
  (2, 12744)	0.27904818164471595
  (2, 13591)	0.22687620695463123
  (2, 14591)	0.3580030298678158
  (2, 15094)	0.1609967301122813
  (2, 16446)	0.1999703023632961
  (2, 18034)	0.35962437110547785
  :	:
  (72130, 17778)	0.13227219506940732
  (72130, 18936)	0.2530249939

__Splitting data into train and test data__


In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

__Classification(logistc regression)__

In [21]:
model = LogisticRegression()

In [22]:
model.fit(X_train, Y_train)

Evaluation

In [23]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [24]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9193858630668723


In [25]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [26]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.900603035974215


__Making a predictive system__

In [27]:
X_new = X_test[3]

prediction = model.predict(X_new)
print('Prediction:',prediction)

if (prediction[0]==0):
  print('The news is Real')
else:
  print('The news is Fake')
print('Actual answer:',Y_test[3])

Prediction: [0]
The news is Real
Actual answer: 0


__Theoretical implementation__

In [28]:
#defining the sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [29]:
value = 0
print(f"sigmoid({value}) = {sigmoid(value)}")

sigmoid(0) = 0.5


In [30]:
# vectorised cost function
def compute_cost(X, y, w, b):
    m = X.shape[0]
    z = X.dot(w) + b
    y_hat = sigmoid(z)
    cost = - (y * np.log(y_hat + 1e-15) + (1 - y) * np.log(1 - y_hat + 1e-15))
    return np.mean(cost)

In [31]:
# Vectorized gradient
def compute_gradient(X, y, w, b):
    m = X.shape[0]
    z = X.dot(w) + b
    y_hat = sigmoid(z)
    error = y_hat - y
    dj_dw = (X.T @ error) / m
    dj_db = np.mean(error)
    return dj_db, dj_dw


In [32]:
m,n = X_train.shape
initial_w = np.zeros(n)
initial_b = 0.
cost = compute_cost(X_train, Y_train, initial_w, initial_b)
print('Cost at initial w and b (zeros): {:.3f}'.format(cost))

Cost at initial w and b (zeros): 0.693


In [33]:
def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters):
    cost_history = []
    for i in range(num_iters):
        dj_db, dj_dw = gradient_function(X, y, w_in, b_in)
        w_in -= alpha * dj_dw
        b_in -= alpha * dj_db
        if i % max(1, num_iters // 10) == 0 or i == num_iters - 1:
            cost = cost_function(X, y, w_in, b_in)
            cost_history.append(cost)
    return w_in, b_in, cost_history


In [34]:
# Prediction function
def predict(X, w, b):
    return (sigmoid(X.dot(w) + b) >= 0.5).astype(int)

__Model Training__

In [35]:
np.random.seed(1)
n = X_train.shape[1]
initial_w = 0.01 * (np.random.rand(n) - 0.5)
initial_b = -8

iterations = 10000
alpha = 0.05

w, b, cost_history = gradient_descent(X_train, Y_train, initial_w, initial_b, compute_cost, compute_gradient, alpha, iterations)


In [36]:
# Accuracy
train_acc = accuracy_score(Y_train, predict(X_train, w, b))
test_acc = accuracy_score(Y_test, predict(X_test, w, b))

print("Train Accuracy:", train_acc)
print("Test Accuracy:", test_acc)
print("Final Cost:", cost_history[-1])

Train Accuracy: 0.8347687455594642
Test Accuracy: 0.8310112982602066
Final Cost: 0.5520524273095686


In [37]:
# Choose a sample index
index = 10
sample = X[index]
sample_pred = predict(sample, w, b)

# Print the prediction result
print(f"Prediction for X[{index}]:", "The news is fake" if sample_pred[0] == 1 else "The news is real")
print(f"Actual label: {'Fake' if Y[index] == 1 else 'Real'}")

Prediction for X[10]: The news is fake
Actual label: Fake


In [59]:
# using other models

__Decision tree__

In [57]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(
    max_depth=40,
    random_state=1
)
dt_model.fit(X_train, Y_train)
train_preds_dt = dt_model.predict(X_train)
test_preds_dt = dt_model.predict(X_test)

# Accuracy
print("Decision Tree Train Accuracy:", accuracy_score(Y_train, train_preds_dt))
print("Decision Tree Test Accuracy:", accuracy_score(Y_test, test_preds_dt))


Decision Tree Train Accuracy: 0.8395688564645537
Decision Tree Test Accuracy: 0.8034241353018645


__XG Boost__

In [41]:
from xgboost import XGBClassifier

# Train XGBoost
xgb_model = XGBClassifier(eval_metric='logloss', random_state=0)
xgb_model.fit(X_train, Y_train)

# Predict
train_preds_xgb = xgb_model.predict(X_train)
test_preds_xgb = xgb_model.predict(X_test)

# Accuracy
print("XGBoost Train Accuracy:", accuracy_score(Y_train, train_preds_xgb))
print("XGBoost Test Accuracy:", accuracy_score(Y_test, test_preds_xgb))


XGBoost Train Accuracy: 0.8911570520040897
XGBoost Test Accuracy: 0.8694115200665419


__Random forest__

In [58]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(
    n_estimators=100,  
    max_depth=25,       
    random_state=1,
    n_jobs=-1
)
rf_model.fit(X_train, Y_train)
train_preds_rf = rf_model.predict(X_train)
test_preds_rf = rf_model.predict(X_test)

train_accuracy_rf= accuracy_score(Y_train, train_preds_rf)
test_accuracy_rf= accuracy_score(Y_test, test_preds_rf)
print("Random Forest Train Accuracy:", train_accuracy_rf)
print("Random Forest Test Accuracy:", test_accuracy_rf)


Random Forest Train Accuracy: 0.8478867381773442
Random Forest Test Accuracy: 0.8311499272197962
