In [None]:
import pandas as pd
from google.colab import files

In [None]:
# Load the news-article dataset (the CSV is expected in the working directory).
data = pd.read_csv('cleaned_news_articles.csv')

In [None]:
# First rows, for a quick look at the columns and their values.
data.head()

In [None]:
# Last rows.
data.tail()

In [None]:
# Per-column dtype, non-null count and memory usage.
data.info()

In [None]:
# (number of rows, number of columns)
data.shape

In [None]:
# Column labels.
data.columns

In [None]:
# dtype of every column.
data.dtypes

In [None]:
# Select the rows that contain at least one missing value. Printing the bare
# boolean mask would list True/False for every row instead of the affected rows.
missing_values_rows = data[data.isnull().any(axis=1)]
print("rows with missing values:")
print(missing_values_rows)

In [None]:
# Rows that repeat an earlier row exactly (with pandas' default, the first occurrence is not flagged).
duplicate_rows = data[data.duplicated()]
print("duplicate rows:")
print(duplicate_rows)

In [None]:
# Remove, in place, every row that has at least one missing value.
data.dropna(axis=0, inplace = True)

In [None]:
# Re-check after dropna: select the rows that still contain a missing value
# (expected to be empty). The bare mask would print one boolean per row instead.
missing_values_rows = data[data.isnull().any(axis=1)]
print("rows with missing values:")
print(missing_values_rows)

In [None]:
# Remove exact duplicate rows in place.
data.drop_duplicates(inplace=True)

In [None]:
# Re-check: this selection should now be empty.
duplicate_rows = data[data.duplicated()]
print("duplicate rows:")
print(duplicate_rows)

In [None]:
# Persist the cleaned frame without the index column.
# NOTE(review): this writes to the same path the data was read from, so the input file is overwritten.
data.to_csv('cleaned_news_articles.csv', index=False)

In [None]:
# Reload the file that was just written.
data = pd.read_csv('cleaned_news_articles.csv')

In [None]:
# Preview the reloaded data.
data.head()

In [None]:
source_counts = data.groupby(["site_url","label"]).size().unstack(fill_value=0)


In [None]:
source_counts["Percentage Real (%)"] = (source_counts["Real"]/(source_counts["Real"]+source_counts["Fake"])) * 100
source_counts["Percentage Fake (%)"] = (source_counts["Fake"]/(source_counts["Real"]+source_counts["Fake"])) * 100

In [None]:
sorted_sources = source_counts.sort_values(by="Percentage Real (%)", ascending=False)

In [None]:
# Print the fake-news share of the ten sites at each end of the ranking.
rankings = (
    ("Top 10 Most Credible News Sources:", sorted_sources.head(10)),
    ("Top 10 Least Credible News Sources:", sorted_sources.tail(10)),
)
for heading, subset in rankings:
    print(heading)
    for source, fake_share in subset["Percentage Fake (%)"].items():
        print(f"News {source}, fake news = {fake_share:.1f}%")

In [None]:
import matplotlib.pyplot as plt

# Ten sites at the top and at the bottom of the credibility ranking.
top_10_credible = sorted_sources.head(10)
top_10_least_credible = sorted_sources.tail(10)

# One figure with two side-by-side panels (axis labels and titles are in Turkish).
fig, (credible_ax, least_credible_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: most credible sources.
credible_ax.barh(top_10_credible.index, top_10_credible['Percentage Fake (%)'], color='green')
credible_ax.set_xlabel('Sahte Haber Yüzdesi')
credible_ax.set_title('En Güvenilir 10 Kaynak')

# Right panel: least credible sources.
least_credible_ax.barh(top_10_least_credible.index, top_10_least_credible['Percentage Fake (%)'], color='red')
least_credible_ax.set_xlabel('Sahte Haber Yüzdesi')
least_credible_ax.set_title('En Az Güvenilir 10 Kaynak')

# Avoid overlapping labels, then render.
fig.tight_layout()
plt.show()

In [None]:
from google.colab import files
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

In [None]:
import nltk
# Download the NLTK resources this notebook relies on: the stop-word lists
# (for stopwords.words) and the Punkt tokenizer data (for word_tokenize).
nltk.download("stopwords")
nltk.download("punkt")
nltk.download('punkt_tab')

In [None]:
from google.colab import files
import matplotlib.pyplot as plt

In [None]:
from google.colab import files
from scipy.stats import chi2_contingency

In [None]:
# English stop words as a set, so the membership tests in the tokenising loop are cheap.
stop_words = set(stopwords.words("english"))

In [None]:
# Word-frequency tallies filled by the tokenising loop: one for titles, one for article bodies.
title_counter = Counter()
text_counter = Counter()

In [None]:
# Count the alphabetic, non-stop-word tokens of fake articles only.
# Start from empty tallies so re-running this cell does not double the counts.
title_counter.clear()
text_counter.clear()

# Only "Fake" rows contribute to the counters, so skip tokenising the rest.
for index, row in data[data["label"] == "Fake"].iterrows():
  title_words = [
      word.lower() for word in word_tokenize(row["title"])
      if word.isalpha() and word.lower() not in stop_words
  ]
  # Body words must come from the body tokens; filtering the title tokens
  # again would make text_counter a copy of title_counter.
  text_words = [
      word.lower() for word in word_tokenize(row["text"])
      if word.isalpha() and word.lower() not in stop_words
  ]

  title_counter.update(title_words)
  text_counter.update(text_words)

In [None]:
top_keywords_title = title_counter.most_common(5)
top_keywords_text = text_counter.most_common(5)

In [None]:
print("Top 5 Keywords Associated with Fake News Titles:")
for keyword, count in top_keywords_title:
  print(f"{keyword}:{count} times")
print("Top 5 Keywords Associated with Fake News Texts:")
for keyword, count in top_keywords_text:
  print(f"{keyword}:{count} times")

In [None]:
# Character length of every title and every article body, stored as new columns.
for text_column in ("title", "text"):
    data[f"{text_column}_length"] = data[text_column].map(len)

In [None]:
# Split the articles by label.
real_news = data[data["label"]=="Real"]
fake_news = data[data["label"]=="Fake"]

In [None]:
# Mean title and body length (in characters) for each label.
avg_real_title_length = real_news["title_length"].mean()
avg_fake_title_length = fake_news["title_length"].mean()
avg_real_text_length = real_news["text_length"].mean()
avg_fake_text_length = fake_news["text_length"].mean()

In [None]:
# Report the four averages, rounded to two decimals.
print(f"Average Title Length for Real News: {avg_real_title_length:.2f} characters")
print(f"Average Title Length for Fake News: {avg_fake_title_length:.2f} characters")
print(f"Average Text Length for Real News: {avg_real_text_length:.2f} characters")
print(f"Average Text Length for Fake News: {avg_fake_text_length:.2f} characters")

In [None]:
# Bar labels, in the same order as the values in `lengths`.
# The second bar is the average *title* length of fake news, so it is labelled "Fake Title".
labels = ["Real Title","Fake Title","Real Text","Fake Text"]
lengths = [avg_real_title_length, avg_fake_title_length, avg_real_text_length,avg_fake_text_length ]

In [None]:
# Bar chart of the four average lengths; green marks real news, red marks fake news.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(labels, lengths, color=["green", "red", "green", "red"])
ax.set_title("Average Title & Text Lengths for Real & Fake News")
ax.set_ylabel("Average Length (characters)")
# Tilt the category labels so they do not collide.
ax.tick_params(axis="x", labelrotation=45)
plt.show()

In [None]:
def detect_sensationalism(text):
    """Return True if `text` contains any sensational keyword as a whole word.

    Matching is case-insensitive and uses word boundaries, so a keyword
    embedded in a longer word does not count.
    """
    sensational_keywords = ("shocking", "outrageous", "unbelievable", "mind-blowing", "explosive")
    return any(
        re.search(r'\b' + keyword + r'\b', text, re.IGNORECASE) is not None
        for keyword in sensational_keywords
    )

In [None]:
import re

In [None]:
# Flag every article whose body contains at least one sensational keyword.
data["Sensationalism"] = data["text"].apply(detect_sensationalism)

In [None]:
# Contingency table: sensationalism flag (rows) against label (columns).
contigency_table = pd.crosstab(data["Sensationalism"],data["label"])
print(contigency_table)

In [None]:
# Chi-square test of independence; the last two return values
# (degrees of freedom and expected frequencies) are discarded.
chi2,p,_,_ = chi2_contingency(contigency_table)

In [None]:
print(f"Chi-squared statistic: {chi2}")
print(f"P-value: {p}")

In [None]:
# Compare the p-value with a 5% significance level.
alpha = 0.05
if p < alpha:
  print("There is a significant association between sensationalism and credibility of the news")
else:
  print("There is not significant association between sensationalism and credibility of the news")

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# Download the lexicon that the VADER sentiment analyzer needs.
nltk.download('vader_lexicon')

In [None]:
# One shared analyzer instance, read as a global by analyze_sentiment.
analyzer = SentimentIntensityAnalyzer()

In [None]:
def analyze_sentiment(text):
  """Classify `text` as "Positive", "Negative" or "Neutral".

  Uses the "compound" score returned by the module-level `analyzer`:
  >= 0.05 is positive, <= -0.05 is negative, anything in between is neutral.
  """
  compound = analyzer.polarity_scores(text)["compound"]
  if compound >= 0.05:
    return "Positive"
  if compound <= -0.05:
    return "Negative"
  return "Neutral"

# Label every article body with its sentiment class.
data["Sentiment"] = data["text"].apply(analyze_sentiment)

In [None]:
# Preview the article text next to its sentiment label.
print(data[['text','Sentiment']].head())

In [None]:
from google.colab import files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
import numpy as np

# Ten most frequent non-stop-word terms across the bodies of fake articles.
fake_news_data = data[data['label'] == "Fake"]
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(fake_news_data["text"])
# Sum the columns on the sparse matrix itself. Calling .toarray() first would
# allocate a dense (documents x vocabulary) array only to reduce it to one row.
word_frequencies = np.asarray(X.sum(axis=0)).ravel()
feature_names = vectorizer.get_feature_names_out()
# argsort is ascending: take the last ten indices and reverse them for most-frequent-first.
keywords = [feature_names[i] for i in word_frequencies.argsort()[-10:][::-1]]
print(keywords)

In [None]:
# Fraction (0-1) of each site's articles that are labelled "Fake".
# Sites with no fake article are absent from fake_site_counts, so the
# index-aligned division yields NaN for them.
is_fake = data["label"] == "Fake"
site_counts = data["site_url"].value_counts()
fake_site_counts = data.loc[is_fake, "site_url"].value_counts()
fake_news_percentage = fake_site_counts.div(site_counts)

In [None]:
#NLP
def fakenewsprediction(title, news_source):
  """Rule-based guess: return "Fake News" or "Real News" for a title and its source.

  A title is called fake only when both hold: its lower-cased text contains
  one of the global `keywords` as a substring, and more than half of the
  source's articles are fake according to the global `fake_news_percentage`.
  Sources missing from `fake_news_percentage` are treated as 0.0.
  """
  lowered_title = title.lower()
  source_fake_percentage = (
      fake_news_percentage[news_source] if news_source in fake_news_percentage else 0.0
  )
  for keyword in keywords:
    if keyword in lowered_title:
      return "Fake News" if source_fake_percentage > 0.5 else "Real News"
  return "Real News"

In [None]:
# Try the rule-based predictor on a sample headline and source.
text_input = "Breaking: election week is over"
source_input = "der-postillon.com"
prediction = fakenewsprediction(text_input,source_input)
print(f"Prediction: {prediction}")

In [None]:
from google.colab import files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

In [None]:
#feature engineering
# Train a TF-IDF + logistic-regression classifier on the article bodies,
# but only if neither the text nor the label column has missing values.
missing_data = data[["text","label"]].isnull().any(axis=1)
if missing_data.any():
  print("Missing Values Found in the Dataset. Handle Missing Data Before Proceeding")
else:
  # LabelEncoder assigns integer codes in sorted class order, so with the
  # labels "Fake"/"Real" the codes are Fake -> 0 and Real -> 1.
  # NOTE: this overwrites data["label"] with the integer codes; reload the
  # data before re-running this cell, otherwise the original names are lost.
  le = LabelEncoder()
  data["label"] = le.fit_transform(data["label"])
  X = data["text"]
  y = data["label"]
  X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
  text_feature_extraction = TfidfVectorizer(max_features=5000,stop_words="english")
  model = LogisticRegression()
  # Keeping the vectorizer inside the pipeline means it is fitted on the training texts only.
  pipeline = Pipeline([
      ('tfidf',text_feature_extraction),
      ('model',model)
  ])
  pipeline.fit(X_train,y_train)
  y_pred = pipeline.predict(X_test)
  accuracy = accuracy_score(y_test,y_pred)
  print(f"Accuracy: {accuracy:.2f}")
  def fakenewsprediction(text):
    """Classify one article body with the fitted pipeline.

    Returns "Fake News" or "Real News". The predicted integer code is mapped
    back to its original label through the encoder rather than hard-coding
    which integer means what (code 0 is "Fake", not "Real").
    """
    predicted_code = pipeline.predict([text])
    predicted_label = le.inverse_transform(predicted_code)[0]
    return "Fake News" if predicted_label == "Fake" else "Real News"

In [None]:
# Try the text classifier on a sample article paragraph.
article_input = "Stocks rallied sharply after the Labor Department said nonfarm payrolls rose by 150,000 in October — 20,000 fewer than expected but a difference attributable pretty much completely to the auto strikes, which appear to be over."
prediction = fakenewsprediction(article_input)
print(f"Prediction: {prediction}")

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Compute precision, recall and F1 score on the held-out test split.
# NOTE(review): with default arguments these score the class encoded as 1 as the
# positive class — confirm that is the class of interest.
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

In [None]:
from sklearn.model_selection import cross_val_score

# Accuracy estimated with cross-validation; the whole pipeline (vectorizer + model) is refitted on each fold.
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')  # 5-fold cross-validation
print(f"Cross-Validation Accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")

In [None]:
from google.colab import files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [None]:
# Features are the article titles; the target is the label column.
X = data["title"]
y = data["label"]

In [None]:
# TF-IDF features from the titles: vocabulary capped at 5000 terms, English stop words removed.
tfidf_vectorizer = TfidfVectorizer(max_features=5000,stop_words="english")
X_tfidf = tfidf_vectorizer.fit_transform(X)

In [None]:
# Encode the target values as integer class codes.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [None]:
# Fit the forest on every row (no hold-out split in this section); the seed is fixed for reproducibility.
random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_classifier.fit(X_tfidf,y_encoded)

In [None]:
#random forest
def fakenewsprediction(title):
  """Predict the label of a single title with the fitted random forest.

  The title is vectorised with the global `tfidf_vectorizer`, classified by
  `random_forest_classifier`, and the predicted code is mapped back through
  `label_encoder`. Returns that decoded label.
  """
  features = tfidf_vectorizer.transform([title])
  predicted_codes = random_forest_classifier.predict(features)
  return label_encoder.inverse_transform(predicted_codes)[0]

In [None]:
# Try the title classifier on a sample headline.
title_input = "Few reasons for optimism after Antony Blinken's diplomatic dash"
prediction = fakenewsprediction(title_input)
print(f"Prediction: {prediction}")

In [None]:
from google.colab import files
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Features are the article titles; the target is the label column.
X = data["title"]
y = data["label"]

In [None]:
# TF-IDF features from the titles: vocabulary capped at 5000 terms, English stop words removed.
tfidf_vectorizer = TfidfVectorizer(max_features=5000,stop_words="english")
X_tfidf = tfidf_vectorizer.fit_transform(X)

In [None]:
# Encode the target values as integer class codes.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [None]:
# Split the raw titles first, then fit TF-IDF on the training part only.
# Fitting the vectorizer on every title before splitting lets test-set
# vocabulary and IDF statistics leak into training and inflates the scores.
X_train_titles, X_test_titles, y_train, y_test = train_test_split(X,y_encoded,test_size=0.2,random_state=42)
tfidf_vectorizer = TfidfVectorizer(max_features=5000,stop_words="english")
X_train = tfidf_vectorizer.fit_transform(X_train_titles)
X_test = tfidf_vectorizer.transform(X_test_titles)

In [None]:
# Fit the forest on the training split; the seed is fixed for reproducibility.
random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_classifier.fit(X_train,y_train)

In [None]:
# Predict the held-out titles.
y_pred = random_forest_classifier.predict(X_test)

In [None]:
# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test,y_pred)

In [None]:
print("Confusion Matrix:")
print(cm)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Accuracy
accuracy = accuracy_score(y_test, y_pred)

# Precision, recall and F1 score, averaged over the classes weighted by their support
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


In [None]:
pip install fairlearn

In [None]:
import pandas as pd
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from fairlearn.metrics import demographic_parity_difference, equalized_odds_difference
from fairlearn.postprocessing import ThresholdOptimizer
from fairlearn.reductions import DemographicParity, EqualizedOdds

In [None]:
# Title-based random forest used as input to the fairness metric below.
X = data["title"]
y = data["label"]
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
# Split the raw titles first and fit TF-IDF on the training part only, so no
# test-set vocabulary or IDF statistics leak into training. With the same
# random_state and row count the split selects the same rows as before.
X_train_titles, X_test_titles, y_train, y_test = train_test_split(X,y_encoded,test_size=0.2,random_state=42)
tfidf_vectorizer = TfidfVectorizer(max_features=5000,stop_words="english")
X_train = tfidf_vectorizer.fit_transform(X_train_titles)
X_test = tfidf_vectorizer.transform(X_test_titles)
random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_classifier.fit(X_train,y_train)
y_pred = random_forest_classifier.predict(X_test)

In [None]:
def demographic_parity_difference(y_true,y_pred):
  """Absolute gap in predicted-positive rate between the two true-label groups.

  Group 1 is the samples with y_true == 0, group 2 those with y_true == 1.
  For each group the share of samples predicted as 1 is computed and the
  absolute difference of the two shares is returned. A group with no samples
  contributes a rate of 0.0 instead of raising ZeroDivisionError.

  NOTE: this definition replaces the function of the same name imported from
  fairlearn.metrics, and it groups by the true label rather than by a
  separate sensitive feature.

  Args:
    y_true: sequence of true class codes (0 or 1).
    y_pred: sequence of predicted class codes, indexable like y_true.

  Returns:
    float in [0, 1].
  """
  group_sizes = {0: 0, 1: 0}
  group_positives = {0: 0, 1: 0}
  for i, true_label in enumerate(y_true):
    for group in (0, 1):
      if true_label == group:
        group_sizes[group] += 1
        if y_pred[i] == 1:
          group_positives[group] += 1

  def _rate(group):
    # An empty group has no predictions, so its positive rate is taken as 0.0.
    size = group_sizes[group]
    return group_positives[group] / size if size else 0.0

  dp_diff = abs(_rate(0) - _rate(1))
  return dp_diff

In [None]:
# Evaluate the metric on the held-out predictions and report it to four decimals.
dp_diff = demographic_parity_difference(y_test,y_pred)
print(f"Demographic Parity Difference: {dp_diff:.4f}")

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

# Reload the cleaned dataset so the labels are the values stored in the CSV.
data = pd.read_csv('cleaned_news_articles.csv')

texts = data['text'].tolist()
labels = data['label'].tolist()

# Map each distinct label to an integer id. The labels are sorted first so the
# mapping is the same on every run; iterating a bare set of strings can yield
# a different order between interpreter sessions.
label_map = {label: idx for idx, label in enumerate(sorted(set(labels)))}
labels = [label_map[label] for label in labels]

# 80/20 train/validation split with a fixed seed.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class NewsDataset(Dataset):
    """Map-style dataset that tokenises one article per item.

    Each item is a dict with 'input_ids' and 'attention_mask' (the tokenizer
    output with its leading batch dimension squeezed away) and 'label' (a
    torch.long scalar tensor).
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text, label = self.texts[idx], self.labels[idx]
        # Pad or truncate every sample to exactly max_len tokens.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        encoding = self.tokenizer(text, **tokenizer_options)
        item = {key: encoding[key].squeeze(0) for key in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(label, dtype=torch.long)
        return item

In [None]:
# Wrap both splits in datasets and batch them; only the training batches are shuffled.
train_dataset = NewsDataset(train_texts, train_labels, tokenizer)
val_dataset = NewsDataset(val_texts, val_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained BERT with a classification head sized to the number of labels.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_map),
).to(device)

loss_fn = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=3e-5)

def train_epoch(model, data_loader, loss_fn, optimizer, device):
    """Run one training pass over `data_loader`.

    The loss is taken from the model output (the labels are passed to the
    model); `loss_fn` is accepted but not used here.

    Returns:
        (mean loss per batch, fraction of samples classified correctly).
    """
    model.train()
    running_loss, n_correct = 0, 0
    for batch in data_loader:
        targets = batch['label'].to(device)
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=targets,
        )
        batch_loss = outputs.loss

        running_loss += batch_loss.item()
        n_correct += (outputs.logits.argmax(dim=1) == targets).sum().item()

        # Clear old gradients, back-propagate this batch, update the weights.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

    n_batches = len(data_loader)
    n_samples = len(data_loader.dataset)
    return running_loss / n_batches, n_correct / n_samples


In [None]:
def eval_model(model, data_loader, loss_fn, device):
    """Evaluate `model` on `data_loader` without computing gradients.

    The loss is taken from the model output; `loss_fn` is accepted but not
    used here. Class names for the report come from the global `label_map`.

    Returns:
        (mean loss per batch, fraction correct, classification report, accuracy score).
    """
    model.eval()
    running_loss, n_correct = 0, 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            targets = batch['label'].to(device)
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=targets,
            )
            predicted = outputs.logits.argmax(dim=1)

            running_loss += outputs.loss.item()
            n_correct += (predicted == targets).sum().item()

            # Collect per-sample results on the CPU for the sklearn metrics.
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    report = classification_report(all_labels, all_preds, target_names=label_map.keys())
    accuracy = accuracy_score(all_labels, all_preds)
    mean_loss = running_loss / len(data_loader)
    fraction_correct = n_correct / len(data_loader.dataset)
    return mean_loss, fraction_correct, report, accuracy

In [None]:
# Fine-tune for a fixed number of epochs, evaluating on the validation split after each one.
EPOCHS = 5
for epoch_number in range(1, EPOCHS + 1):
    print(f'Epoch {epoch_number}/{EPOCHS}')

    train_loss, train_acc = train_epoch(model, train_loader, loss_fn, optimizer, device)
    print(f'Train loss: {train_loss}, accuracy: {train_acc}')

    validation_results = eval_model(model, val_loader, loss_fn, device)
    val_loss, val_acc, val_report, val_accuracy = validation_results
    print(f'Validation loss: {val_loss}, accuracy: {val_acc}')
    print(f'Validation Accuracy: {val_accuracy}')
    print(val_report)

In [None]:
# Final evaluation of the trained model. This reuses the validation loader,
# so the reported "Test Accuracy" is measured on the validation split.
print("Testing the model on validation dataset...")
val_loss, val_acc, val_report, val_accuracy = eval_model(model, val_loader, loss_fn, device)
print(f"Test Accuracy: {val_accuracy}")
print(val_report)