In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file found below /kaggle/input.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the two source tables: True.csv into `true_df` and Fake.csv into `false_df`.
true_df = pd.read_csv("/kaggle/input/fake-and-real-news-dataset/True.csv")
false_df = pd.read_csv("/kaggle/input/fake-and-real-news-dataset/Fake.csv")

In [None]:
# Preview the first rows of the frame loaded from True.csv.
true_df.head()

In [None]:
# Preview the first rows of the frame loaded from Fake.csv.
false_df.head()

In [None]:
# Tag every row with its class: 1 for rows from True.csv, 0 for rows from Fake.csv.
for frame, class_id in ((true_df, 1), (false_df, 0)):
    frame['label'] = class_id

In [None]:
# Stack both classes into one frame; ignore_index=True renumbers the rows
# 0..n-1 so the two source indices do not collide.
df = pd.concat([true_df, false_df], ignore_index=True)
print(df.head())

In [None]:
# Spot-check a few random rows from both classes. The seed makes the same rows
# appear on every run, and a small sample fits in the output without truncation.
df.sample(10, random_state=42)

In [None]:
# EDA
# Print the (rows, columns) shape and the column index of the combined frame.
print(df.shape, df.columns)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many rows carry each label (0 = Fake.csv rows, 1 = True.csv rows).
sns.countplot(x=df['label'])
plt.title("Count Plot of Fake vs Real News")
plt.show()

In [None]:
# Class balance as fractions of the dataset (normalize=True) rather than raw counts.
print(df["label"].value_counts(normalize=True))


In [None]:
# Number of missing values in each column.
print(df.isnull().sum())



In [None]:
from wordcloud import WordCloud

# Concatenate every article body of each class into one long string.
real_text = " ".join(df.loc[df["label"] == 1, "text"])
fake_text = " ".join(df.loc[df["label"] == 0, "text"])

# Both clouds share the same canvas settings.
cloud_kwargs = dict(width=800, height=400, background_color="white")
wc_real = WordCloud(**cloud_kwargs).generate(real_text)
wc_fake = WordCloud(**cloud_kwargs).generate(fake_text)

# Draw the two clouds side by side, real news on the left.
plt.figure(figsize=(12,6))
panels = [(wc_real, "Real News WordCloud"), (wc_fake, "Fake News WordCloud")]
for position, (cloud, heading) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.imshow(cloud, interpolation="bilinear")
    plt.title(heading)
    plt.axis("off")
plt.show()


In [None]:
# Preprocessing

In [None]:
# Join headline and body into a single field, treating missing values as
# empty strings so the concatenation never yields NaN.
title_part = df["title"].fillna("")
body_part = df["text"].fillna("")
df["content"] = title_part + " " + body_part

In [None]:
# Preview the frame now that the combined `content` column exists.
df.head()

In [None]:
import nltk
import spacy
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the text-cleaning functions below.
# `punkt_tab` is what recent NLTK releases load inside `nltk.word_tokenize`;
# `punkt` is kept for older releases.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Built once at module level so the per-document functions can reuse them.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Lowercase, strip non-letters, tokenize, remove stopwords and lemmatize.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.
    """
    text = text.lower()
    # Replace every character that is not a lowercase letter or whitespace with
    # a space. The negated class [^...] is essential: a bare `a-z\s` pattern
    # would only match the literal text "a-z" followed by whitespace.
    text = re.sub(r'[^a-z\s]', " ", text)
    tokens = nltk.word_tokenize(text)
    # Drop stopwords first, then lemmatize what is left.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

df["Cleaned_text"] = df['content'].apply(preprocess_text)

In [None]:
# Preview the frame with the new `Cleaned_text` column.
df.head()

In [None]:
# Drop the raw text fields: `content` was built from `title` and `text`, and
# `Cleaned_text` was derived from `content`.
df= df.drop(columns = ['title', 'text', 'content'])


In [None]:
# Preview the frame after removing `title`, `text` and `content`.
df.head()

In [None]:
# Remove the `date` column. Rebinding `df` (rather than inplace=True) matches
# the other column drops in this notebook.
df = df.drop(columns=['date'])


In [None]:
# Preview the frame after removing `date`.
df.head()

In [None]:
# List the distinct values of the `subject` column.
df['subject'].unique()

In [None]:
def clean_text(text):
    """Reduce a document to lowercase, letters-only, stopword-free lemmas.

    Characters other than a-z and whitespace are deleted (not replaced by a
    space), the result is split on whitespace, English stopwords are removed
    and each remaining word is lemmatized.

    Returns the processed words joined by single spaces.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in stop_words
    ]
    return " ".join(lemmas)


# Second cleaning pass over the already preprocessed text; stored as `text`.
df["text"] = df["Cleaned_text"].apply(clean_text)

In [None]:
# Preview the frame with the `text` column produced by `clean_text`.
df.head()

In [None]:
# `Cleaned_text` was only an intermediate; its cleaned form now lives in `text`.
df = df.drop(columns=['Cleaned_text'])

In [None]:
# Preview the frame after removing `Cleaned_text`.
df.head()

In [None]:
# Remove `subject`; the model below is trained on `text` alone.
df = df.drop(columns=["subject"])

In [None]:
# Preview the frame that feeds the model; presumably only `text` and `label`
# remain at this point (assumes the CSVs had no other columns — verify).
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Features are the cleaned documents; the target is the 0/1 label.
X, y = df["text"], df["label"]

# Hold out 20% for testing. Stratifying on y keeps the class ratio the same in
# both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, capped at 5000 features.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
# Fit on the training split only; the test split is merely transformed so that
# it does not influence what the vectorizer learns.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [None]:
# Logistic regression

from sklearn.linear_model import LogisticRegression

# Build and fit in one expression; `fit` returns the estimator itself.
lr = LogisticRegression(max_iter=200).fit(X_train_vec, y_train)
lr  # last expression, so the cell still displays the fitted estimator


In [None]:
# svm

from sklearn.svm import LinearSVC

# Linear SVM on the same TF-IDF features. The seed (same value as the
# train/test split) keeps the fitted model reproducible between runs.
svm = LinearSVC(random_state=42)
svm.fit(X_train_vec, y_train)


In [None]:
# Evaluation

from sklearn.metrics import accuracy_score, f1_score, classification_report

def evaluate_model(heading, model, features, y_true):
    """Predict with `model` and print accuracy, F1 and a classification report.

    Parameters
    ----------
    heading : str
        Text printed above the metrics.
    model : fitted estimator exposing ``predict``
    features : feature matrix passed to ``model.predict``
    y_true : true labels for ``features``

    Returns
    -------
    The predictions returned by ``model.predict(features)``.
    """
    y_pred = model.predict(features)
    print(heading)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("F1 Score:", f1_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    return y_pred


# Score both classifiers on the held-out test split.
y_pred_lr = evaluate_model("Logistic Regression", lr, X_test_vec, y_test)
y_pred_svm = evaluate_model("\nSVM", svm, X_test_vec, y_test)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Rebuild the per-class corpora from `text`, which now holds the cleaned documents.
fake_text = " ".join(df.loc[df["label"] == 0, "text"])
real_text = " ".join(df.loc[df["label"] == 1, "text"])

# One square cloud per class, fake news on the left.
plt.figure(figsize=(10,5))
for slot, (corpus, heading) in enumerate(
    [(fake_text, "Fake News"), (real_text, "Real News")], start=1
):
    plt.subplot(1, 2, slot)
    plt.imshow(WordCloud(width=400, height=400).generate(corpus))
    plt.title(heading)
    plt.axis("off")
plt.show()
