In [1]:
import pymc as pm
import numpy as np
import scipy.sparse as sp

# Number of clusters/components intended for the MBL model (tunable).
n_clusters = 100

# Read the Bag-of-Words count matrix, which is stored on disk in sparse .npz form.
X_text = sp.load_npz('C:/Users/Sanah Iftikhar/Downloads/archive/X_text.npz')

# Materialise the sparse matrix as a dense ndarray for the steps that follow.
X_text_dense = X_text.toarray()

# NX = number of rows (samples), DX = number of columns (features).
NX, DX = np.shape(X_text_dense)
print(f"Feature matrix dimensions: {NX} rows, {DX} columns")



Feature matrix dimensions: 38658 rows, 15000 columns


In [12]:
# Total count of every feature (column) across all rows.
feature_sums = X_text_dense.sum(axis=0)

# Features whose total count does not exceed this cutoff are dropped.
threshold = 100
high_freq_features = feature_sums > threshold

# Keep only the columns flagged as high-frequency.
X_text_preprocessed = X_text_dense[:, high_freq_features]

# Report the shape of the reduced matrix.
NX, DX_filtered = np.shape(X_text_preprocessed)
print(f"Filtered feature matrix dimensions: {NX} rows, {DX_filtered} columns")


Filtered feature matrix dimensions: 38658 rows, 9127 columns


In [13]:
# Persist the filtered dense matrix as a .npy file so later cells can reload it.
np.save(
    file='C:/Users/Sanah Iftikhar/Downloads/archive/X_text_preprocessed.npy',
    arr=X_text_preprocessed,
)

In [16]:
import pandas as pd

# Read the combined news table (text + label columns) from its CSV file.
news = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Sanah Iftikhar\Downloads\archive\news_combined.csv'
)

# Preview the first five rows to confirm the file was parsed as expected.
print(news.head(5))

                                                Text  label
0  Donald Trump just couldn t wish all Americans ...      0
1  House Intelligence Committee Chairman Devin Nu...      0
2  On Friday, it was revealed that former Milwauk...      0
3  On Christmas day, Donald Trump announced that ...      0
4  Pope Francis used his annual Christmas Day mes...      0


In [17]:
# Number of mixture components: one per class (fake = 0, true = 1).
K = 2

# Per-document class labels, taken directly from the 'label' column.
data_id = news['label'].values

# Reload the filtered count matrix from the same absolute location it was saved
# to. A bare relative filename would only resolve if the kernel's working
# directory happened to be the archive folder, and could otherwise fail or pick
# up a stale copy.
data = np.load('C:/Users/Sanah Iftikhar/Downloads/archive/X_text_preprocessed.npy')

# Rows of `data` and entries of `data_id` are matched by position, so their
# counts must agree before any per-class statistics are computed.
if data.shape[0] != data_id.shape[0]:
    raise ValueError(
        f"Feature matrix has {data.shape[0]} rows but there are "
        f"{data_id.shape[0]} labels"
    )

In [18]:
# Optional sanity check: how many rows carry each value of the 'label' column.
label_counts = news.loc[:, 'label'].value_counts()

# Show the per-label totals.
print(label_counts)

label
1    21192
0    17466
Name: count, dtype: int64


In [19]:
#
import numpy as np

def polya_moment_match(data):
    """Return initial alpha parameters derived from per-feature totals.

    Despite the name, no moment matching is performed here: the result is
    simply the column-wise sum of ``data`` shifted by a small positive
    constant (1e-5), so features that never occur still get a strictly
    positive value.

    Parameters:
    data (ndarray): n-by-d matrix; columns are summed over the rows.

    Returns:
    ndarray: length-d vector of column sums plus 1e-5.
    """
    offset = 1e-5  # keeps all-zero columns strictly positive
    return np.sum(data, axis=0) + offset

In [20]:
def MSDMinitial(data, data_id, K):
    """Build initial mixture parameters from rows grouped by class label.

    Parameters:
    data (ndarray): n-by-d matrix.
    data_id (ndarray): length-n array of class ids; rows with id ``j`` are
        assigned to component ``j`` for ``j`` in ``range(K)``.
    K (int): number of components.

    Returns:
    tuple: ``(Priors, alpha0, beta0, num_in_class)`` where
        Priors is a length-K array of class sizes divided by the row count,
        alpha0 is K-by-d, one ``polya_moment_match`` result per class,
        beta0 is a K-by-d array of ones,
        num_in_class is a length-K array with the number of rows per class.
    """
    n_rows, n_cols = data.shape
    class_sizes = np.zeros(K)
    alpha_init = np.zeros((K, n_cols))

    for k in range(K):
        # Positions of the rows labelled with class k.
        members = np.nonzero(data_id == k)[0]
        class_sizes[k] = len(members)
        # Alpha row for this class comes from that class's rows only.
        alpha_init[k, :] = polya_moment_match(data[members, :])

    beta_init = np.ones((K, n_cols))
    class_priors = class_sizes / n_rows

    return class_priors, alpha_init, beta_init, class_sizes

In [21]:
# Initialise the model: class priors, alpha/beta parameters and class sizes.
Priors, alpha0, beta0, num_in_class = MSDMinitial(data=data, data_id=data_id, K=K)

In [23]:
import numpy as np
from scipy.special import gammaln

def msdpdfln(X, alpha, beta):
    """
    Multinomial Scaled Dirichlet log probability density function (pdf)

    Parameters:
    X (ndarray): n-by-d count matrix (your data)
    alpha (ndarray): alpha parameters, either a length-d vector or a 2-D array
        with d columns. Because ``X + alpha`` is evaluated by broadcasting, a
        2-D alpha must have either 1 row or n rows.
    beta (ndarray): beta parameters, broadcastable against X (for example a
        length-d vector, 1-by-d or n-by-d).

    Returns:
    logl (ndarray): length-n vector of log probability densities, one per row of X
    """
    # Promote a 1-D alpha to a single row so every per-row reduction below can
    # use axis=1. Without this, a length-d alpha raises an AxisError.
    alpha = np.atleast_2d(alpha)

    n = np.sum(X, axis=1)  # total count of each row
    alpha_rowsums = np.sum(alpha, axis=1)

    # Multinomial coefficient: ln n! - sum_d ln x_d!
    logl = gammaln(n + 1) - np.sum(gammaln(X + 1), axis=1)
    # sum_d [ln Gamma(x_d + alpha_d) - ln Gamma(alpha_d)]
    logl += np.sum(gammaln(X + alpha), axis=1) - np.sum(gammaln(alpha), axis=1)
    # Scale term: sum_d x_d * ln beta_d
    logl -= np.sum(X * np.log(beta), axis=1)
    # ln Gamma(sum_d alpha_d) - ln Gamma(sum_d alpha_d + n)
    logl += gammaln(alpha_rowsums) - gammaln(alpha_rowsums + n)

    return logl
