# Naïve Bayes Classifier
### (a) Loading data and extracting features
First, we need to load the data and extract features. We will use CountVectorizer to convert text data into numerical features.

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

def _read_headlines(path):
    """Return the non-blank lines of the UTF-8 text file at ``path``, whitespace-stripped."""
    with open(path, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        # Skip blank lines so they do not become empty, labelled "headlines".
        return [line for line in stripped_lines if line]


def load_data(real_path='/content/real.txt', fake_path='/content/fake.txt'):
    """Load the real and fake headlines and build their class labels.

    Each file is read as UTF-8 with one headline per line. Surrounding
    whitespace (including the trailing newline) is stripped and blank lines
    are ignored.

    Parameters
    ----------
    real_path : str
        Path to the file of real headlines. Defaults to the location used
        by this notebook.
    fake_path : str
        Path to the file of fake headlines. Defaults to the location used
        by this notebook.

    Returns
    -------
    headlines : list of str
        All real headlines followed by all fake headlines.
    labels : list of int
        One label per headline, aligned with ``headlines``:
        1 for a real headline, 0 for a fake one.

    Raises
    ------
    OSError
        If either file cannot be opened.
    """
    real_headlines = _read_headlines(real_path)
    fake_headlines = _read_headlines(fake_path)

    headlines = real_headlines + fake_headlines
    labels = [1] * len(real_headlines) + [0] * len(fake_headlines)

    return headlines, labels



def extract_features(headlines):
    """Turn raw headline strings into a bag-of-words count matrix.

    A fresh ``CountVectorizer`` with default settings is fitted on
    ``headlines`` and used to transform them.

    Returns
    -------
    (X, vectorizer)
        ``X`` is the result of ``fit_transform`` on ``headlines``;
        ``vectorizer`` is the fitted ``CountVectorizer`` that produced it.
    """
    count_vectorizer = CountVectorizer()
    return count_vectorizer.fit_transform(headlines), count_vectorizer


# Load every headline with its label (1 = real, 0 = fake) and vectorize them.
# NOTE(review): the vectorizer is fitted on ALL headlines before the split
# below, so its vocabulary also contains words that occur only in test rows.
headlines, labels = load_data()
X, vectorizer = extract_features(headlines)


# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible. These four names are reassigned in section (b) once
# the vocabulary has been filtered.
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)

### (b) Filtering meaningless words
We can filter out words that appear in more than 70% of documents or in fewer than 0.5% of documents by setting the `max_df` and `min_df` parameters of `CountVectorizer`.

In [7]:
def filter_words(vectorizer, X, documents=None, max_df=0.7, min_df=0.005):
    """Refit ``vectorizer`` with document-frequency thresholds applied.

    Parameters
    ----------
    vectorizer : CountVectorizer
        Vectorizer to reconfigure. It is modified in place and refitted, so
        any vocabulary it learned earlier is replaced.
    X : object
        Previously extracted feature matrix. Not used by this function; it
        is accepted only so existing calls keep working.
    documents : iterable of str, optional
        Raw texts to refit on. When omitted, the notebook-level
        ``headlines`` variable is used, which is what the original
        two-argument call relied on.
    max_df : float or int, default 0.7
        Upper document-frequency threshold passed to the vectorizer.
    min_df : float or int, default 0.005
        Lower document-frequency threshold passed to the vectorizer.

    Returns
    -------
    (X_filtered, vectorizer)
        The count matrix produced by refitting on ``documents`` and the
        same (now refitted) vectorizer object.
    """
    if documents is None:
        # Backward-compatible fallback to the notebook global; pass
        # ``documents`` explicitly to avoid depending on hidden state.
        documents = headlines

    # set_params is the supported way to change estimator hyper-parameters.
    vectorizer.set_params(max_df=max_df, min_df=min_df)
    X_filtered = vectorizer.fit_transform(documents)
    return X_filtered, vectorizer

# Refit the vectorizer with the document-frequency thresholds; `vectorizer`
# is rebound to the same object, which has been refitted in place.
X_filtered, vectorizer = filter_words(vectorizer, X)
# Re-split using the same test_size and random_state as in section (a);
# this overwrites the earlier X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(X_filtered, labels, test_size=0.3, random_state=42)

### (c) Implementing the Naive Bayes Classifier
Next, we train a multinomial Naive Bayes classifier (scikit-learn's `MultinomialNB`) and calculate its accuracy on the test set. We will also generate a confusion matrix.

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np


def naive_bayes(X_train, X_test, y_train, y_test):
    """Train a multinomial Naive Bayes model and score it on the test set.

    A ``MultinomialNB`` with default settings is fitted on
    ``(X_train, y_train)`` and then used to predict labels for ``X_test``.

    Returns
    -------
    (accuracy, cm)
        ``accuracy`` is ``accuracy_score(y_test, predictions)`` and ``cm``
        is ``confusion_matrix(y_test, predictions)``.
    """
    # fit() returns the estimator itself, so construction and training chain.
    classifier = MultinomialNB().fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    return (
        accuracy_score(y_test, predictions),
        confusion_matrix(y_test, predictions),
    )

# Train and evaluate on the filtered features produced in section (b).
accuracy, cm = naive_bayes(X_train, X_test, y_train, y_test)
print(f"Test Accuracy: {accuracy}")
# Confusion-matrix rows are true labels and columns are predicted labels, in
# sorted label order: index 0 = fake (label 0), index 1 = real (label 1).
print(f"Confusion Matrix:\n{cm}")

Test Accuracy: 0.789795918367347
Confusion Matrix:
[[287  95]
 [111 487]]
