In [None]:
# Step 1: Import libraries and load data
import pickle
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# Load TF-IDF features and labels
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts produced by this project's own pipeline.
# A context manager closes the handle deterministically instead of leaking it.
with open('../data/processed/tfidf_features.pkl', 'rb') as file:
    tfidf_features = pickle.load(file)
labels_df = pd.read_csv('../data/labels.csv')
history_df = pd.read_csv('../data/processed/cleaned_history.csv')

X = tfidf_features
y_true = labels_df['label'].values

# Ensure alignment: features, labels and history rows are matched purely by
# position further down (metrics and boolean-mask indexing), so fail fast here
# with a clear message instead of a confusing error in a later cell.
n_samples = X.shape[0] if hasattr(X, 'shape') else len(X)
if not (n_samples == len(y_true) == len(history_df)):
    raise ValueError(
        f'Row count mismatch: features={n_samples}, '
        f'labels={len(y_true)}, history={len(history_df)}'
    )

# Load the trained model
with open('../models/logistic_regression.pkl', 'rb') as file:
    model = pickle.load(file)

In [None]:
# Step 2: Test the Model on Existing Data
# Predictions are made on the same rows the labels were loaded for, so the
# scores below describe fit on existing data, not held-out performance.
y_pred = model.predict(X)

# Evaluate accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Detailed classification report
# `labels` is passed explicitly so the two target names always map to classes
# 0 and 1, even if one class is absent from both y_true and y_pred; without it
# the report infers the classes from the data and the name list no longer
# lines up. Assumes the 0 = Democrat / 1 = Republican encoding used in the
# plots of this notebook.
print('Classification Report:')
print(
    classification_report(
        y_true,
        y_pred,
        labels=[0, 1],
        target_names=['Democrat', 'Republican'],
    )
)

In [None]:
# Step 3: Visualize Label Distribution and Prediction Confidence
# (matplotlib.pyplot is imported as `plt` in the notebook's import cell.)

# Label distribution (true labels)
# Explicit bin edges centre one bar on each label value; `bins=2` would split
# the data range in half and draw the bars at 0.25 / 0.75, away from the
# 0 and 1 tick marks.
plt.figure(figsize=(8, 5))
plt.hist(y_true, bins=[-0.5, 0.5, 1.5], rwidth=0.8, edgecolor='black', color='#1E90FF')
plt.title('Distribution of True Labels')
plt.xlabel('Label (0: Democrat, 1: Republican)')
plt.ylabel('Count')
plt.xticks([0, 1])
plt.show()

# Prediction confidence (probability estimates)
# Column 1 is taken as the probability of class 1 (Republican).
# NOTE(review): this assumes model.classes_ is [0, 1] - confirm for the pickled model.
y_prob = model.predict_proba(X)[:, 1]
plt.figure(figsize=(8, 5))
# Pin the bins to the full [0, 1] probability range so each bar is a fixed
# 0.1-wide bucket regardless of the min/max probability actually observed.
plt.hist(y_prob, bins=10, range=(0, 1), edgecolor='black', color='#FF4500')
plt.title('Prediction Confidence for Republican Label')
plt.xlabel('Probability')
plt.ylabel('Count')
plt.show()

In [None]:
# Step 4: Iterate - Adjust Labels or Features
# Example: Refine labeling logic based on misclassifications

# Boolean mask over rows; history_df is matched to the predictions by position.
misclassified = (y_pred != y_true)
misclassified_df = history_df[misclassified].copy()
misclassified_df['predicted_label'] = y_pred[misclassified]
misclassified_df['true_label'] = y_true[misclassified]
print('Misclassified Entries:')
print(misclassified_df[['cleaned_title', 'predicted_label', 'true_label']])

# Suggest refining keywords based on misclassifications
if not misclassified_df.empty:
    print('\nConsider adding these keywords to generate_labels.py:')
    # Walk each row's title together with its own true label. Looking the label
    # up again by title would return the first match for duplicated titles
    # (possibly the wrong label) and fail outright on a missing (NaN) title.
    for title, true_label in zip(misclassified_df['cleaned_title'],
                                 misclassified_df['true_label']):
        party = 'Republican' if true_label == 1 else 'Democrat'
        print(f"- Potential {party} keywords in '{title}'")


In [None]:
# Step 5: Predict User Voting Preference

# Calculate the dominant predicted label
# Count predictions per class once; minlength=2 guarantees a slot for both
# classes even when only label 0 was ever predicted.
label_counts = np.bincount(y_pred, minlength=2)
# argmax returns the first maximum, so an exact tie resolves to label 0 ('left').
dominant_label = label_counts.argmax()
voting_preference = 'left' if dominant_label == 0 else 'right'
# "Confidence" here is simply the share of entries predicted as the dominant
# label, not a calibrated model probability.
confidence_percentage = (label_counts[dominant_label] / len(y_pred)) * 100

# Stamp the report with the actual run time (local timezone) rather than a
# hard-coded date, which would be wrong on every re-run. The format matches
# the previous literal, e.g. "07:47 PM +0545, July 20, 2025".
generated_at = datetime.now().astimezone().strftime('%I:%M %p %z, %B %d, %Y')

# Output the verdict
print(f"Voting Preference Prediction - As of {generated_at}")
print("--------------------------------------------------")
print(f"- **Predicted Preference**: {voting_preference} (likely {voting_preference} wing)")
print(f"- **Confidence**: {confidence_percentage:.1f}% (based on {len(y_pred)} entries)")
print(f"- **Insight**: This prediction is derived from the majority of web history entries leaning toward {'Democrat' if dominant_label == 0 else 'Republican'} content.")
