In [None]:
# step4_nlp.py
"""
Trains and evaluates a Naive Bayes text classifier on blogs.csv and saves the evaluation artifacts.
"""

In [None]:
import os, json, re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, confusion_matrix
)

In [None]:
# -------- File Paths --------
# Default dataset location. Set the BLOGS_CSV_PATH environment variable to
# point at another copy of the file without editing this cell.
INPUT_PATH = os.environ.get(
    "BLOGS_CSV_PATH",
    r"D:\DATA SCIENCE\ASSIGNMENTS\19 naive bayes and text mining\blogs.csv",
)
# Outputs are written next to the input file. dirname() yields "" when the
# path has no directory component, which os.makedirs() rejects, so fall back
# to the current directory in that case.
OUTPUT_PATH = os.path.dirname(INPUT_PATH) or "."
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [None]:
# -------- Preprocessing --------
def clean_text(text: str) -> str:
    """Lower-case `text` and reduce it to space-separated [a-z0-9] tokens.

    The input is coerced with str(). URLs, then every character that is not
    a lower-case letter, digit or whitespace, are replaced by a space; runs
    of whitespace are collapsed and the ends trimmed.
    """
    normalized = str(text).lower()
    # Order matters: URLs must go before punctuation is stripped.
    for pattern in (r"http\S+|www\.\S+", r"[^a-z0-9\s]", r"\s+"):
        normalized = re.sub(pattern, " ", normalized)
    return normalized.strip()

In [None]:
stopwords = set(ENGLISH_STOP_WORDS)
def remove_stopwords(t: str) -> str:
    """Return `t` with every whitespace-separated token found in `stopwords` removed."""
    kept_words = filter(lambda word: word not in stopwords, t.split())
    return " ".join(kept_words)

In [None]:
df = pd.read_csv(INPUT_PATH)
# Use the expected column names when present; otherwise fall back to the
# first column for text and the second for labels.
text_col = df.columns[0] if "Data" not in df.columns else "Data"
label_col = df.columns[1] if "Labels" not in df.columns else "Labels"
# Discard rows missing either field, then clean and strip stop words per row.
df = df.dropna(subset=[text_col, label_col]).reset_index(drop=True)
df["clean"] = df[text_col].apply(lambda raw: remove_stopwords(clean_text(raw)))

In [None]:
# Encode the label column as integer ids; `classes` keeps the names in id
# order for the plots and tables produced further down.
le = LabelEncoder()
le.fit(df[label_col])
y = le.transform(df[label_col])
classes = [*le.classes_]

In [None]:
# Stratified hold-out split of the cleaned text (20% test, fixed seed).
split_kwargs = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(df["clean"], y, **split_kwargs)

In [None]:
# -------- Baseline NB --------
tfidf_options = dict(max_features=5000, ngram_range=(1, 2), min_df=2, sublinear_tf=True)
vec = TfidfVectorizer(**tfidf_options)
# Vocabulary and IDF weights come from the training text only; the test text
# is projected with the already-fitted vectoriser.
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

In [None]:
# Baseline classifier: fit on the training matrix, predict the test matrix.
nb = MultinomialNB(alpha=1.0)
nb.fit(X=X_train_vec, y=y_train)
y_pred = nb.predict(X_test_vec)

In [None]:
acc = accuracy_score(y_test, y_pred)
# Each call returns (precision, recall, f-score, support); the trailing
# support value is not needed here.
macro_scores = precision_recall_fscore_support(y_test, y_pred, average="macro", zero_division=0)
weighted_scores = precision_recall_fscore_support(y_test, y_pred, average="weighted", zero_division=0)
prec_macro, rec_macro, f1_macro = macro_scores[:3]
prec_weight, rec_weight, f1_weight = weighted_scores[:3]

In [None]:
# Report the baseline scores computed on the hold-out split.
print(f"Baseline Accuracy: {acc:.4f}")
print(f"Macro F1: {f1_macro:.4f} | Weighted F1: {f1_weight:.4f}")

In [None]:
# -------- GridSearch Tuning --------
# Vectoriser and classifier are searched together as one pipeline; the search
# is fitted on the training text only and then used to predict the test text.
tfidf_step = ("tfidf", TfidfVectorizer(max_features=5000, min_df=2, sublinear_tf=True))
clf_step = ("clf", MultinomialNB())
pipeline = Pipeline(steps=[tfidf_step, clf_step])
params = {
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "clf__alpha": [0.1, 0.5, 1.0],
}
gs = GridSearchCV(estimator=pipeline, param_grid=params, scoring="accuracy", cv=4, n_jobs=-1)
gs.fit(X_train, y_train)
y_pred_gs = gs.predict(X_test)

In [None]:
acc_gs = accuracy_score(y_test, y_pred_gs)
# Element 2 of the returned tuple is the F-score for the requested average.
f1_macro_gs, f1_w_gs = (
    precision_recall_fscore_support(y_test, y_pred_gs, average=averaging, zero_division=0)[2]
    for averaging in ("macro", "weighted")
)

In [None]:
# Report the tuned pipeline's hold-out scores and the winning grid point.
print(f"Tuned Accuracy: {acc_gs:.4f} | Macro F1: {f1_macro_gs:.4f}")
print("Best Params:", gs.best_params_)

In [None]:
# -------- Save Results --------
# Collect both models' scores and persist them as JSON next to the input file.
baseline_scores = {"accuracy": acc, "f1_macro": f1_macro, "f1_weighted": f1_weight}
tuned_scores = {"accuracy": acc_gs, "f1_macro": f1_macro_gs, "f1_weighted": f1_w_gs, "best_params": gs.best_params_}
summary = {"baseline": baseline_scores, "tuned": tuned_scores}
results_file = os.path.join(OUTPUT_PATH, "nb_results.json")
with open(results_file, "w") as f:
    json.dump(summary, f, indent=2)
print("Results saved to nb_results.json")

In [None]:
# ----------------------------
# Visualization and Comparison
# ----------------------------
import matplotlib.pyplot as plt

In [None]:
# Confusion matrix from the **tuned** predictions (use baseline if you prefer)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_gs)

In [None]:
# --- Confusion Matrix Plot ---
tick_positions = range(len(classes))
confusion_path = os.path.join(OUTPUT_PATH, "nb_confusion_matrix.png")

# Heat map of `cm` with class names on both axes, written as a 300-dpi PNG.
plt.figure(figsize=(10, 8))
plt.imshow(cm, cmap="Blues", interpolation="nearest")
plt.title("Confusion Matrix — Naive Bayes")
plt.colorbar()
plt.ylabel("True")
plt.xlabel("Predicted")
plt.yticks(tick_positions, classes)
plt.xticks(tick_positions, classes, rotation=90)
plt.tight_layout()
plt.savefig(confusion_path, dpi=300)
plt.close()
print(f"Saved confusion matrix plot to: {confusion_path}")

In [None]:
# Also save CSV version of the confusion matrix (handy for the appendix)
# Rows are true classes, columns are predicted classes.
cm_table = pd.DataFrame(cm, index=classes, columns=classes)
cm_table.to_csv(os.path.join(OUTPUT_PATH, "nb_confusion_matrix.csv"), index=True)

In [None]:
# --- Baseline vs Tuned Comparison ---
# One flat dict of hold-out scores per model; both share the same keys so
# they can be stacked into a single comparison table.
baseline_metrics = {"accuracy": acc, "f1_macro": f1_macro, "f1_weighted": f1_weight}
tuned_metrics    = {"accuracy": acc_gs, "f1_macro": f1_macro_gs, "f1_weighted": f1_w_gs}

In [None]:
# One row per model: a "model" name column followed by its metric columns.
comparison_rows = []
for model_name, scores in (("baseline", baseline_metrics), ("tuned", tuned_metrics)):
    comparison_rows.append({"model": model_name, **scores})
comp_df = pd.DataFrame(comparison_rows)

In [None]:
# Write the baseline-vs-tuned table next to the input file and echo it.
comparison_path = os.path.join(OUTPUT_PATH, "baseline_vs_tuned_metrics.csv")
comp_df.to_csv(comparison_path, index=False)
print(f"Saved metric comparison to: {comparison_path}")
print("\nComparison Table:\n", comp_df)

In [None]:
import matplotlib.pyplot as plt
# NOTE(review): this redraws the confusion matrix with the default colormap
# and resolution and writes the PNG relative to the current working
# directory, not OUTPUT_PATH.
axis_ticks = range(len(classes))
plt.figure(figsize=(10,8))
plt.imshow(cm, interpolation="nearest")
plt.title("Confusion matrix — Naive Bayes")
plt.colorbar()
plt.ylabel("True")
plt.xlabel("Predicted")
plt.yticks(axis_ticks, classes)
plt.xticks(axis_ticks, classes, rotation=90)
plt.tight_layout()
plt.savefig("nb_confusion_matrix.png")
plt.close()

In [None]:
# baseline_metrics and tuned_metrics are dicts like the 'summary' above
# NOTE(review): rebuilds the comparison table and writes it relative to the
# current working directory, not OUTPUT_PATH.
labelled_metrics = (("baseline", baseline_metrics), ("tuned", tuned_metrics))
comp_df = pd.DataFrame([{"model": label, **scores} for label, scores in labelled_metrics])
comp_df.to_csv("baseline_vs_tuned_metrics.csv", index=False)

In [None]:
# naive_bayes_text_mining.py
"""
Text classification using Multinomial Naive Bayes + sentiment analysis (VADER)
Adapt this INPUT_PATH if needed.
Saves outputs to the same folder as INPUT_PATH.
"""

In [None]:
import os
import re
import json
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support

In [None]:
# Sentiment (VADER)
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

In [None]:
# -----------------------
# Config - change this path if needed
# -----------------------
# Default dataset location. Set the BLOGS_CSV_PATH environment variable to
# point at another copy of the file without editing this cell.
INPUT_PATH = os.environ.get(
    "BLOGS_CSV_PATH",
    r"D:\DATA SCIENCE\ASSIGNMENTS\19 naive bayes and text mining\blogs.csv",
)
# Outputs are written next to the input file. dirname() yields "" when the
# path has no directory component, which os.makedirs() rejects, so fall back
# to the current directory in that case.
OUTPUT_FOLDER = os.path.dirname(INPUT_PATH) or "."
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [None]:
# TF-IDF / model settings
MAX_FEATURES = 5000  # vocabulary cap passed to TfidfVectorizer(max_features=...)
RANDOM_STATE = 42    # seed passed to train_test_split
TEST_SIZE = 0.2      # test fraction passed to train_test_split
# NOTE(review): GRID is not referenced by the visible cells; the grid-search
# cell builds its own `grid` dict with the same values.
GRID = {
    # small grid to tune key hyperparameters quickly
    "tfidf__ngram_range": [(1,1), (1,2)],
    "clf__alpha": [0.1, 0.5, 1.0]
}
CV = 4  # `cv` argument passed to GridSearchCV

In [None]:
# -----------------------
# Helpers
# -----------------------
def clean_text(text: str) -> str:
    """Normalise raw text into lower-case, space-separated tokens.

    Non-string input yields "". Otherwise URLs, e-mail-like tokens,
    characters outside [a-z0-9] and whitespace, and standalone numbers are
    each replaced by a space; whitespace is then collapsed and trimmed.
    """
    if not isinstance(text, str):
        return ""
    # Applied in order: addresses must be removed before punctuation is.
    substitutions = (
        r"http\S+|www\.\S+",
        r"\S+@\S+",
        r"[^a-z0-9\s]",
        r"\b\d+\b",
        r"\s+",
    )
    cleaned = text.lower()
    for pattern in substitutions:
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.strip()

In [None]:
def remove_stopwords(text: str):
    """Drop every whitespace-separated token of `text` that is in ENGLISH_STOP_WORDS."""
    return " ".join(word for word in text.split() if word not in ENGLISH_STOP_WORDS)

In [None]:
def save_confusion_matrix(cm, labels, path_png, title="Confusion matrix"):
    """Draw `cm` as a heat map, label both axes with `labels`, and save it to `path_png`."""
    tick_positions = range(len(labels))
    plt.figure(figsize=(10,8))
    plt.imshow(cm, interpolation="nearest")
    plt.title(title)
    plt.colorbar()
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.yticks(tick_positions, labels)
    plt.xticks(tick_positions, labels, rotation=90)
    plt.tight_layout()
    plt.savefig(path_png)
    plt.close()  # release the figure once it is on disk

In [None]:
# -----------------------
# Load & preprocess
# -----------------------
# Announce the source path, then read the whole CSV into memory.
print("Loading:", INPUT_PATH)
df = pd.read_csv(INPUT_PATH)

In [None]:
# detect columns (flexible)
# First matching candidate name wins for each role; None when nothing matches.
text_col = next((name for name in ["Data","Text","data","text","Content"] if name in df.columns), None)
label_col = next((name for name in ["Labels","Label","labels","label","Category","category"] if name in df.columns), None)
if text_col is None or label_col is None:
    # Either name missing: fall back to the first two columns by position.
    if len(df.columns) < 2:
        raise ValueError("Could not find text/label columns in CSV.")
    text_col, label_col = df.columns[0], df.columns[1]

In [None]:
# Keep only the two detected columns under canonical names and drop rows
# where either is missing.
df = (
    df[[text_col, label_col]]
    .rename(columns={text_col: "Data", label_col: "Labels"})
    .dropna(subset=["Data", "Labels"])
    .reset_index(drop=True)
)
print("Rows after dropna:", len(df))
print("Labels distribution:\n", df["Labels"].value_counts().head(20))

In [None]:
# Clean text
raw_text = df["Data"].astype(str)
df["clean_text"] = raw_text.apply(clean_text)
df["clean_text_nostop"] = df["clean_text"].apply(remove_stopwords)
# Token count of the stop-word-free text.
df["clean_len_words"] = df["clean_text_nostop"].apply(lambda doc: len(doc.split()))

In [None]:
# Save the frame (original text/labels plus the cleaned columns) next to the
# input file, without the index column.
processed_csv = os.path.join(OUTPUT_FOLDER, "blogs_processed_naivebayes.csv")
df.to_csv(processed_csv, index=False)
print("Saved processed CSV:", processed_csv)

In [None]:
# -----------------------
# TF-IDF + train/test
# -----------------------
# Note: this fit sees every document, including the rows that a later cell
# assigns to the test split.
tfidf_options = {"max_features": MAX_FEATURES, "ngram_range": (1,2), "min_df": 2, "sublinear_tf": True}
vectorizer = TfidfVectorizer(**tfidf_options)
corpus = df["clean_text_nostop"].fillna("")
X = vectorizer.fit_transform(corpus)
vectorizer_file = os.path.join(OUTPUT_FOLDER, "tfidf_vectorizer.joblib")
joblib.dump(vectorizer, vectorizer_file)
print("TF-IDF shape:", X.shape)

In [None]:
# Integer-encode the labels and persist the encoder so saved predictions can
# be mapped back to class names.
le = LabelEncoder()
y = le.fit_transform(df["Labels"])
encoder_file = os.path.join(OUTPUT_FOLDER, "label_encoder.joblib")
joblib.dump(le, encoder_file)
print("Classes:", [*le.classes_])

In [None]:
# Split the cleaned *text* rather than the pre-computed matrix, so that the
# TF-IDF vocabulary and IDF weights are learned from the training documents
# only. Fitting them on every row lets test-set statistics leak into the
# features. The split arguments are unchanged, so the train/test partition
# (and therefore y_train / y_test) is the same as before.
train_text, test_text, y_train, y_test = train_test_split(
    df["clean_text_nostop"].fillna(""), y,
    test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y,
)
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)
# Re-save the vectoriser so the stored artifact matches the features the
# model is trained on.
joblib.dump(vectorizer, os.path.join(OUTPUT_FOLDER, "tfidf_vectorizer.joblib"))
print("Train/test sizes:", X_train.shape, X_test.shape)

In [None]:
# -----------------------
# Baseline MultinomialNB
# -----------------------
print("\nTraining baseline MultinomialNB...")
nb = MultinomialNB()
nb.fit(X=X_train, y=y_train)
y_pred = nb.predict(X_test)
# Class probabilities are kept when the estimator exposes them, else None.
y_prob = None
if hasattr(nb, "predict_proba"):
    y_prob = nb.predict_proba(X_test)

In [None]:
acc = accuracy_score(y_test, y_pred)
# Macro-averaged (precision, recall, f-score, support); support is discarded.
macro_scores = precision_recall_fscore_support(y_test, y_pred, average="macro", zero_division=0)
prec_macro, rec_macro, f1_macro = macro_scores[:3]
print(f"Baseline accuracy: {acc:.4f}, f1_macro: {f1_macro:.4f}")

In [None]:
# Persist the fitted baseline classifier next to the input file.
joblib.dump(nb, os.path.join(OUTPUT_FOLDER, "nb_baseline.joblib"))

In [None]:
# Save baseline metrics & reports
base_report = classification_report(y_test, y_pred, target_names=le.classes_, digits=4)
report_file = os.path.join(OUTPUT_FOLDER, "baseline_classification_report.txt")
with open(report_file, "w") as f:
    f.write(base_report)
# The same matrix feeds both the CSV and the PNG.
baseline_cm = confusion_matrix(y_test, y_pred)
baseline_cm_table = pd.DataFrame(baseline_cm, index=le.classes_, columns=le.classes_)
baseline_cm_table.to_csv(os.path.join(OUTPUT_FOLDER, "baseline_confusion_matrix.csv"))
save_confusion_matrix(baseline_cm, le.classes_, os.path.join(OUTPUT_FOLDER, "baseline_confusion_matrix.png"))

In [None]:
# Save baseline predictions
# Recover which rows of `df` form the test set by repeating the split on
# df.index with the same test_size / random_state / stratify arguments used
# for the feature split. (The earlier placeholder that indexed `df` with the
# sparse matrix's nonzero coordinates was unused and misaligned, so it has
# been removed.)
_, X_test_idx = train_test_split(df.index, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y)
predictions_df = pd.DataFrame({
    "true_label": le.inverse_transform(y_test),
    "pred_label": le.inverse_transform(y_pred),
    # highest class probability per row, when probabilities are available
    "pred_confidence": y_prob.max(axis=1) if y_prob is not None else None,
    "text": df.loc[X_test_idx, "Data"].values
})
predictions_df.to_csv(os.path.join(OUTPUT_FOLDER, "baseline_predictions.csv"), index=False)

In [None]:
# -----------------------
# Quick GridSearch (pipeline) to tune alpha + ngram_range
# -----------------------
print("\nStarting small GridSearch over alpha / ngram_range...")
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=MAX_FEATURES, min_df=2, sublinear_tf=True)),
    ("clf", MultinomialNB())
])
grid = {
    "tfidf__ngram_range": [(1,1), (1,2)],
    "clf__alpha": [0.1, 0.5, 1.0]
}
gs = GridSearchCV(pipeline, grid, cv=CV, n_jobs=-1, verbose=1, scoring="accuracy")
# Tune on the training partition only. Fitting the search on every row would
# refit the best pipeline on the test documents as well, so the "held-out"
# accuracy reported afterwards would be measured on data the model had
# already seen. The split arguments match the feature split, so this is the
# same partition.
gs_train_text, _, gs_train_labels, _ = train_test_split(
    df["clean_text_nostop"], y,
    test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y,
)
# Fit on raw cleaned text (pipeline will vectorize)
gs.fit(gs_train_text, gs_train_labels)
print("GridSearch best:", gs.best_params_, "best_score:", gs.best_score_)

In [None]:
# Evaluate best estimator on held-out test set
best_model = gs.best_estimator_
# Score the refitted best pipeline on the cleaned text of the test rows.
held_out_text = df.loc[X_test_idx, "clean_text_nostop"]
y_pred_gs = best_model.predict(held_out_text)
acc_gs = accuracy_score(y_test, y_pred_gs)
print(f"Tuned model accuracy on test set: {acc_gs:.4f}")

In [None]:
# Persist the best pipeline (vectoriser + classifier) found by the grid search.
joblib.dump(best_model, os.path.join(OUTPUT_FOLDER, "nb_tuned_pipeline.joblib"))

In [None]:
# Save tuned reports
tuned_report = classification_report(y_test, y_pred_gs, target_names=le.classes_, digits=4)
with open(os.path.join(OUTPUT_FOLDER, "tuned_classification_report.txt"), "w") as f:
    f.write(tuned_report)
# The same matrix feeds both the CSV and the PNG.
tuned_cm = confusion_matrix(y_test, y_pred_gs)
tuned_cm_table = pd.DataFrame(tuned_cm, index=le.classes_, columns=le.classes_)
tuned_cm_table.to_csv(os.path.join(OUTPUT_FOLDER, "tuned_confusion_matrix.csv"))
save_confusion_matrix(tuned_cm, le.classes_, os.path.join(OUTPUT_FOLDER, "tuned_confusion_matrix.png"))

In [None]:
# -----------------------
# Sentiment analysis using VADER
# -----------------------
print("\nRunning VADER sentiment analysis...")
# Download the VADER lexicon only when NLTK cannot find a local copy.
try:
    nltk.data.find("sentiment/vader_lexicon.zip")
except LookupError:
    nltk.download("vader_lexicon")
sia = SentimentIntensityAnalyzer()

In [None]:
# compute sentiment scores
def sentiment_label(c):
    """Bucket a compound score: >= 0.05 positive, <= -0.05 negative, else neutral."""
    if c >= 0.05:
        return "positive"
    if c <= -0.05:
        return "negative"
    return "neutral"

# Score the original (uncleaned) text and keep only the compound value.
sent_scores = df["Data"].astype(str).apply(lambda t: sia.polarity_scores(t)["compound"])
df["sentiment_score"] = sent_scores
df["sentiment_label"] = df["sentiment_score"].apply(sentiment_label)

In [None]:
# Save CSV with sentiment + predictions (merge predictions_df on text)
# Attach baseline predictions where possible by index:
# We already saved predictions_df for the test subset; let's save overall sentiment + labels for full data.
sentiment_csv = os.path.join(OUTPUT_FOLDER, "blogs_with_sentiment.csv")
df.to_csv(sentiment_csv, index=False)
print("Saved sentiment-annotated CSV to:", sentiment_csv)

In [None]:
# -----------------------
# Summary JSON for assignment
# -----------------------
class_names = list(le.classes_)
summary_file = os.path.join(OUTPUT_FOLDER, "nb_summary.json")
# Values are cast to built-in int/float before being written as JSON.
summary = {
    "n_documents": int(len(df)),
    "n_classes": int(len(class_names)),
    "classes": class_names,
    "baseline_accuracy": float(acc),
    "tuned_grid_best": gs.best_params_,
    "tuned_cv_score": float(gs.best_score_),
    "tuned_test_accuracy": float(acc_gs)
}
with open(summary_file, "w") as f:
    json.dump(summary, f, indent=2)

In [None]:
print("\nAll done. Outputs saved to:", OUTPUT_FOLDER)
print("Key files:")
# One line per artifact written by the cells above.
for artifact_line in (
    " - baseline_classification_report.txt",
    " - baseline_confusion_matrix.csv/png",
    " - baseline_predictions.csv",
    " - nb_baseline.joblib",
    " - nb_tuned_pipeline.joblib (GridSearch best)",
    " - tuned_classification_report.txt",
    " - blogs_with_sentiment.csv",
    " - nb_summary.json",
):
    print(artifact_line)

In [None]:
# nb_train.py
"""
Naive Bayes text classifier (Task 2)
- Change INPUT_PATH if needed.
- Saves outputs (model, vectorizer, reports) to the same folder as INPUT_PATH.
Requirements:
    pip install numpy pandas scikit-learn matplotlib joblib
"""

In [None]:
import os
import re
import json
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_recall_fscore_support
)

In [None]:
# -------- CONFIG --------
# Default dataset location. Set the BLOGS_CSV_PATH environment variable to
# point at another copy of the file without editing this cell.
INPUT_PATH = os.environ.get(
    "BLOGS_CSV_PATH",
    r"D:\DATA SCIENCE\ASSIGNMENTS\19 naive bayes and text mining\blogs.csv",
)
# Outputs are written next to the input file. dirname() yields "" when the
# path has no directory component, which os.makedirs() rejects, so fall back
# to the current directory in that case.
OUTPUT_FOLDER = os.path.dirname(INPUT_PATH) or "."
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [None]:
RANDOM_STATE = 42     # seed passed to train_test_split in main()
TEST_SIZE = 0.20      # test fraction passed to train_test_split in main()
MAX_FEATURES = 5000   # change if you want fewer/more features
NGRAM_RANGE = (1,2)   # unigrams + bigrams
MIN_DF = 2            # `min_df` passed to TfidfVectorizer in main()

In [None]:
# -------- helpers --------
def clean_text(text: str) -> str:
    """Normalise raw text into lower-case, space-separated tokens.

    Returns "" for non-string input. Each rule below replaces its matches
    with a single space, in order; the result is trimmed at both ends.
    """
    if not isinstance(text, str):
        return ""
    rules = (
        r"http\S+|www\.\S+",   # urls
        r"\S+@\S+",            # emails
        r"[^a-z0-9\s]",        # punctuation / anything outside a-z, 0-9, whitespace
        r"\b\d+\b",            # standalone digits
        r"\s+",                # whitespace runs
    )
    result = text.lower()
    for rule in rules:
        result = re.sub(rule, " ", result)
    return result.strip()

In [None]:
def remove_stopwords(text: str) -> str:
    """Return `text` without the whitespace-separated tokens found in ENGLISH_STOP_WORDS."""
    return " ".join(filter(lambda token: token not in ENGLISH_STOP_WORDS, text.split()))

In [None]:
def save_confusion_matrix(cm, labels, png_path, title="Confusion matrix"):
    """Draw `cm` as a heat map, label both axes with `labels`, and save it to `png_path`."""
    positions = range(len(labels))
    plt.figure(figsize=(10,8))
    plt.imshow(cm, interpolation="nearest")
    plt.title(title)
    plt.colorbar()
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.yticks(positions, labels)
    plt.xticks(positions, labels, rotation=90)
    plt.tight_layout()
    plt.savefig(png_path)
    plt.close()  # release the figure once it is on disk

In [None]:
# -------- main --------
def main():
    """Train and evaluate a TF-IDF + MultinomialNB classifier on INPUT_PATH.

    Loads the CSV, detects the text/label columns, cleans the text, makes a
    stratified train/test split, fits TF-IDF on the training text only,
    trains the classifier, and writes the vectorizer, label encoder, model,
    classification report, confusion matrix (CSV and PNG), test predictions
    and a JSON summary to OUTPUT_FOLDER.

    Raises:
        FileNotFoundError: if INPUT_PATH does not exist.
        ValueError: if the text/label columns cannot be detected and the CSV
            has fewer than two columns.
    """
    # load dataset
    if not os.path.exists(INPUT_PATH):
        raise FileNotFoundError(f"Input file not found: {INPUT_PATH}")
    print("Loading:", INPUT_PATH)
    df = pd.read_csv(INPUT_PATH)

    # detect likely columns
    text_col = None
    label_col = None
    for c in ["Data","Text","data","text","Content","content"]:
        if c in df.columns:
            text_col = c
            break
    for c in ["Labels","Label","labels","label","Category","category"]:
        if c in df.columns:
            label_col = c
            break
    if text_col is None or label_col is None:
        # Either name missing: fall back to the first two columns by position.
        if len(df.columns) >= 2:
            text_col, label_col = df.columns[0], df.columns[1]
        else:
            raise ValueError("Couldn't auto-detect text/label columns in CSV. Ensure it has two columns.")

    df = df[[text_col, label_col]].rename(columns={text_col: "Data", label_col: "Labels"})
    df = df.dropna(subset=["Data", "Labels"]).reset_index(drop=True)
    print(f"Rows after dropna: {len(df)}")
    print("Label distribution (top 10):\n", df["Labels"].value_counts().head(10).to_string())

    # Preprocess text (clean + remove stopwords)
    print("Cleaning text (lowercase, remove urls/emails/punct, drop stopwords)...")
    df["clean"] = df["Data"].astype(str).apply(clean_text).apply(remove_stopwords)
    df["clean_len"] = df["clean"].apply(lambda t: len(t.split()))

    # Encode labels
    le = LabelEncoder()
    y = le.fit_transform(df["Labels"])
    classes = list(le.classes_)
    print("Classes detected:", classes)

    # Train-test split (stratified)
    X_train_text, X_test_text, y_train, y_test = train_test_split(
        df["clean"], y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )
    print("Train size:", len(X_train_text), "Test size:", len(X_test_text))

    # Vectorize: fit TF-IDF on train only
    print("Fitting TF-IDF on training data...")
    vectorizer = TfidfVectorizer(max_features=MAX_FEATURES, ngram_range=NGRAM_RANGE, min_df=MIN_DF, sublinear_tf=True)
    X_train = vectorizer.fit_transform(X_train_text)
    X_test = vectorizer.transform(X_test_text)
    print("TF-IDF shapes:", X_train.shape, X_test.shape)

    # Save vectorizer
    vec_path = os.path.join(OUTPUT_FOLDER, "tfidf_vectorizer.joblib")
    joblib.dump(vectorizer, vec_path)
    joblib.dump(le, os.path.join(OUTPUT_FOLDER, "label_encoder.joblib"))
    print("Saved vectorizer and label encoder to output folder.")

    # Train Multinomial Naive Bayes
    print("Training MultinomialNB...")
    nb = MultinomialNB()
    nb.fit(X_train, y_train)

    # Predict on test
    y_pred = nb.predict(X_test)
    y_prob = nb.predict_proba(X_test) if hasattr(nb, "predict_proba") else None

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    prec_macro, rec_macro, f1_macro, _ = precision_recall_fscore_support(y_test, y_pred, average="macro", zero_division=0)
    prec_weight, rec_weight, f1_weight, _ = precision_recall_fscore_support(y_test, y_pred, average="weighted", zero_division=0)

    print(f"\nTest Accuracy: {acc:.4f}")
    print(f"Macro F1: {f1_macro:.4f} | Weighted F1: {f1_weight:.4f}")

    # Classification report & confusion matrix
    report = classification_report(y_test, y_pred, target_names=classes, digits=4)
    cm = confusion_matrix(y_test, y_pred)

    # Save artifacts
    with open(os.path.join(OUTPUT_FOLDER, "nb_classification_report.txt"), "w") as f:
        f.write("Test Accuracy: {:.6f}\n\n".format(acc))
        f.write(report)
    pd.DataFrame(cm, index=classes, columns=classes).to_csv(os.path.join(OUTPUT_FOLDER, "nb_confusion_matrix.csv"))
    save_confusion_matrix(cm, classes, os.path.join(OUTPUT_FOLDER, "nb_confusion_matrix.png"))

    # Save model & predictions
    joblib.dump(nb, os.path.join(OUTPUT_FOLDER, "nb_model.joblib"))
    # X_test_text, y_test and y_pred all come from the same split and are in
    # the same row order. Resetting the Series index to 0..n-1 makes the text
    # column line up positionally with the label arrays.
    preds_df = pd.DataFrame({
        "text": X_test_text.reset_index(drop=True),
        "true_label": le.inverse_transform(y_test),
        "pred_label": le.inverse_transform(y_pred),
        "pred_confidence": (y_prob.max(axis=1) if y_prob is not None else None)
    })
    preds_df.to_csv(os.path.join(OUTPUT_FOLDER, "nb_test_predictions.csv"), index=False)

    # Summary JSON (values cast to built-in int/float before writing)
    summary = {
        "n_documents": int(len(df)),
        "n_classes": int(len(classes)),
        "classes": classes,
        "test_size": int(len(X_test_text)),
        "accuracy": float(acc),
        "precision_macro": float(prec_macro),
        "recall_macro": float(rec_macro),
        "f1_macro": float(f1_macro),
        "precision_weighted": float(prec_weight),
        "recall_weighted": float(rec_weight),
        "f1_weighted": float(f1_weight),
    }
    with open(os.path.join(OUTPUT_FOLDER, "nb_summary.json"), "w") as f:
        json.dump(summary, f, indent=2)

    print("\nSaved outputs to:", OUTPUT_FOLDER)
    print(" - nb_model.joblib")
    print(" - tfidf_vectorizer.joblib")
    print(" - nb_classification_report.txt")
    print(" - nb_confusion_matrix.csv/png")
    print(" - nb_test_predictions.csv")
    print(" - nb_summary.json")

In [None]:
# Run the training pipeline when this code executes as the top-level module.
if __name__ == "__main__":
    main()

In [None]:
# naive_bayes_text_mining.py
"""
Text classification using Multinomial Naive Bayes + sentiment analysis (VADER)
Adapt this INPUT_PATH if needed.
Saves outputs to the same folder as INPUT_PATH.
"""

In [None]:
import os
import re
import json
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support

In [None]:
# Sentiment (VADER)
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

In [None]:
# -----------------------
# Config - change this path if needed
# -----------------------
# Default dataset location. Set the BLOGS_CSV_PATH environment variable to
# point at another copy of the file without editing this cell.
INPUT_PATH = os.environ.get(
    "BLOGS_CSV_PATH",
    r"D:\DATA SCIENCE\ASSIGNMENTS\19 naive bayes and text mining\blogs.csv",
)
# Outputs are written next to the input file. dirname() yields "" when the
# path has no directory component, which os.makedirs() rejects, so fall back
# to the current directory in that case.
OUTPUT_FOLDER = os.path.dirname(INPUT_PATH) or "."
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [None]:
# TF-IDF / model settings
MAX_FEATURES = 5000  # vocabulary cap passed to TfidfVectorizer(max_features=...)
RANDOM_STATE = 42    # seed passed to train_test_split
TEST_SIZE = 0.2      # test fraction passed to train_test_split
# NOTE(review): GRID is not referenced by the visible cells; the grid-search
# cell builds its own `grid` dict with the same values.
GRID = {
    # small grid to tune key hyperparameters quickly
    "tfidf__ngram_range": [(1,1), (1,2)],
    "clf__alpha": [0.1, 0.5, 1.0]
}
CV = 4  # `cv` argument passed to GridSearchCV

In [None]:
# -----------------------
# Helpers
# -----------------------
def clean_text(text: str) -> str:
    """Normalise raw text into lower-case, space-separated tokens.

    Non-string input yields "". Otherwise URLs, e-mail-like tokens,
    characters outside [a-z0-9] and whitespace, and standalone numbers are
    each replaced by a space; whitespace is then collapsed and trimmed.
    """
    if not isinstance(text, str):
        return ""
    # Applied in order: addresses must be removed before punctuation is.
    substitutions = (
        r"http\S+|www\.\S+",
        r"\S+@\S+",
        r"[^a-z0-9\s]",
        r"\b\d+\b",
        r"\s+",
    )
    cleaned = text.lower()
    for pattern in substitutions:
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.strip()

In [None]:
def remove_stopwords(text: str):
    """Drop every whitespace-separated token of `text` that is in ENGLISH_STOP_WORDS."""
    return " ".join(word for word in text.split() if word not in ENGLISH_STOP_WORDS)

In [None]:
def save_confusion_matrix(cm, labels, path_png, title="Confusion matrix"):
    """Draw `cm` as a heat map, label both axes with `labels`, and save it to `path_png`."""
    tick_positions = range(len(labels))
    plt.figure(figsize=(10,8))
    plt.imshow(cm, interpolation="nearest")
    plt.title(title)
    plt.colorbar()
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.yticks(tick_positions, labels)
    plt.xticks(tick_positions, labels, rotation=90)
    plt.tight_layout()
    plt.savefig(path_png)
    plt.close()  # release the figure once it is on disk

In [None]:
# -----------------------
# Load & preprocess
# -----------------------
# Announce the source path, then read the whole CSV into memory.
print("Loading:", INPUT_PATH)
df = pd.read_csv(INPUT_PATH)

In [None]:
# detect columns (flexible)
# First matching candidate name wins for each role; None when nothing matches.
text_col = next((name for name in ["Data","Text","data","text","Content"] if name in df.columns), None)
label_col = next((name for name in ["Labels","Label","labels","label","Category","category"] if name in df.columns), None)
if text_col is None or label_col is None:
    # Either name missing: fall back to the first two columns by position.
    if len(df.columns) < 2:
        raise ValueError("Could not find text/label columns in CSV.")
    text_col, label_col = df.columns[0], df.columns[1]

In [None]:
# Keep only the two detected columns under canonical names and drop rows
# where either is missing.
df = (
    df[[text_col, label_col]]
    .rename(columns={text_col: "Data", label_col: "Labels"})
    .dropna(subset=["Data", "Labels"])
    .reset_index(drop=True)
)
print("Rows after dropna:", len(df))
print("Labels distribution:\n", df["Labels"].value_counts().head(20))

In [None]:
# Clean text
raw_text = df["Data"].astype(str)
df["clean_text"] = raw_text.apply(clean_text)
df["clean_text_nostop"] = df["clean_text"].apply(remove_stopwords)
# Token count of the stop-word-free text.
df["clean_len_words"] = df["clean_text_nostop"].apply(lambda doc: len(doc.split()))

In [None]:
# Save the frame (original text/labels plus the cleaned columns) next to the
# input file, without the index column.
processed_csv = os.path.join(OUTPUT_FOLDER, "blogs_processed_naivebayes.csv")
df.to_csv(processed_csv, index=False)
print("Saved processed CSV:", processed_csv)

In [None]:
# -----------------------
# TF-IDF + train/test
# -----------------------
# Note: this fit sees every document, including the rows that a later cell
# assigns to the test split.
tfidf_options = {"max_features": MAX_FEATURES, "ngram_range": (1,2), "min_df": 2, "sublinear_tf": True}
vectorizer = TfidfVectorizer(**tfidf_options)
corpus = df["clean_text_nostop"].fillna("")
X = vectorizer.fit_transform(corpus)
vectorizer_file = os.path.join(OUTPUT_FOLDER, "tfidf_vectorizer.joblib")
joblib.dump(vectorizer, vectorizer_file)
print("TF-IDF shape:", X.shape)

In [None]:
# Integer-encode the labels and persist the encoder so saved predictions can
# be mapped back to class names.
le = LabelEncoder()
y = le.fit_transform(df["Labels"])
encoder_file = os.path.join(OUTPUT_FOLDER, "label_encoder.joblib")
joblib.dump(le, encoder_file)
print("Classes:", [*le.classes_])

In [None]:
# Split the cleaned *text* rather than the pre-computed matrix, so that the
# TF-IDF vocabulary and IDF weights are learned from the training documents
# only. Fitting them on every row lets test-set statistics leak into the
# features. The split arguments are unchanged, so the train/test partition
# (and therefore y_train / y_test) is the same as before.
train_text, test_text, y_train, y_test = train_test_split(
    df["clean_text_nostop"].fillna(""), y,
    test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y,
)
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)
# Re-save the vectoriser so the stored artifact matches the features the
# model is trained on.
joblib.dump(vectorizer, os.path.join(OUTPUT_FOLDER, "tfidf_vectorizer.joblib"))
print("Train/test sizes:", X_train.shape, X_test.shape)

In [None]:
# -----------------------
# Baseline MultinomialNB
# -----------------------
print("\nTraining baseline MultinomialNB...")
nb = MultinomialNB()
nb.fit(X=X_train, y=y_train)
y_pred = nb.predict(X_test)
# Class probabilities are kept when the estimator exposes them, else None.
y_prob = None
if hasattr(nb, "predict_proba"):
    y_prob = nb.predict_proba(X_test)

In [None]:
acc = accuracy_score(y_test, y_pred)
# Macro-averaged (precision, recall, f-score, support); support is discarded.
macro_scores = precision_recall_fscore_support(y_test, y_pred, average="macro", zero_division=0)
prec_macro, rec_macro, f1_macro = macro_scores[:3]
print(f"Baseline accuracy: {acc:.4f}, f1_macro: {f1_macro:.4f}")

In [None]:
# Persist the fitted baseline classifier next to the input file.
joblib.dump(nb, os.path.join(OUTPUT_FOLDER, "nb_baseline.joblib"))

In [None]:
# Save baseline metrics & reports
base_report = classification_report(y_test, y_pred, target_names=le.classes_, digits=4)
report_file = os.path.join(OUTPUT_FOLDER, "baseline_classification_report.txt")
with open(report_file, "w") as f:
    f.write(base_report)
# The same matrix feeds both the CSV and the PNG.
baseline_cm = confusion_matrix(y_test, y_pred)
baseline_cm_table = pd.DataFrame(baseline_cm, index=le.classes_, columns=le.classes_)
baseline_cm_table.to_csv(os.path.join(OUTPUT_FOLDER, "baseline_confusion_matrix.csv"))
save_confusion_matrix(baseline_cm, le.classes_, os.path.join(OUTPUT_FOLDER, "baseline_confusion_matrix.png"))

In [None]:
# Save baseline predictions
# Recover which rows of `df` form the test set by repeating the split on
# df.index with the same test_size / random_state / stratify arguments used
# for the feature split. (The earlier placeholder that indexed `df` with the
# sparse matrix's nonzero coordinates was unused and misaligned, so it has
# been removed.)
_, X_test_idx = train_test_split(df.index, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y)
predictions_df = pd.DataFrame({
    "true_label": le.inverse_transform(y_test),
    "pred_label": le.inverse_transform(y_pred),
    # highest class probability per row, when probabilities are available
    "pred_confidence": y_prob.max(axis=1) if y_prob is not None else None,
    "text": df.loc[X_test_idx, "Data"].values
})
predictions_df.to_csv(os.path.join(OUTPUT_FOLDER, "baseline_predictions.csv"), index=False)

In [None]:
# -----------------------
# Quick GridSearch (pipeline) to tune alpha + ngram_range
# -----------------------
print("\nStarting small GridSearch over alpha / ngram_range...")
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=MAX_FEATURES, min_df=2, sublinear_tf=True)),
    ("clf", MultinomialNB())
])
grid = {
    "tfidf__ngram_range": [(1,1), (1,2)],
    "clf__alpha": [0.1, 0.5, 1.0]
}
gs = GridSearchCV(pipeline, grid, cv=CV, n_jobs=-1, verbose=1, scoring="accuracy")
# Tune on the training partition only. Fitting the search on every row would
# refit the best pipeline on the test documents as well, so the "held-out"
# accuracy reported afterwards would be measured on data the model had
# already seen. The split arguments match the feature split, so this is the
# same partition.
gs_train_text, _, gs_train_labels, _ = train_test_split(
    df["clean_text_nostop"], y,
    test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y,
)
# Fit on raw cleaned text (pipeline will vectorize)
gs.fit(gs_train_text, gs_train_labels)
print("GridSearch best:", gs.best_params_, "best_score:", gs.best_score_)

In [None]:
# Evaluate best estimator on held-out test set
best_model = gs.best_estimator_
# Score the refitted best pipeline on the cleaned text of the test rows.
held_out_text = df.loc[X_test_idx, "clean_text_nostop"]
y_pred_gs = best_model.predict(held_out_text)
acc_gs = accuracy_score(y_test, y_pred_gs)
print(f"Tuned model accuracy on test set: {acc_gs:.4f}")

In [None]:
# Persist the best pipeline (vectoriser + classifier) found by the grid search.
joblib.dump(best_model, os.path.join(OUTPUT_FOLDER, "nb_tuned_pipeline.joblib"))

In [None]:
# Save tuned reports
tuned_report = classification_report(y_test, y_pred_gs, target_names=le.classes_, digits=4)
with open(os.path.join(OUTPUT_FOLDER, "tuned_classification_report.txt"), "w") as f:
    f.write(tuned_report)
# The same matrix feeds both the CSV and the PNG.
tuned_cm = confusion_matrix(y_test, y_pred_gs)
tuned_cm_table = pd.DataFrame(tuned_cm, index=le.classes_, columns=le.classes_)
tuned_cm_table.to_csv(os.path.join(OUTPUT_FOLDER, "tuned_confusion_matrix.csv"))
save_confusion_matrix(tuned_cm, le.classes_, os.path.join(OUTPUT_FOLDER, "tuned_confusion_matrix.png"))

In [None]:
# -----------------------
# Sentiment analysis using VADER
# -----------------------
print("\nRunning VADER sentiment analysis...")
# Download the VADER lexicon only when NLTK cannot find a local copy.
try:
    nltk.data.find("sentiment/vader_lexicon.zip")
except LookupError:
    nltk.download("vader_lexicon")
sia = SentimentIntensityAnalyzer()

In [None]:
# compute sentiment scores
def sentiment_label(c):
    """Bucket a compound score: >= 0.05 positive, <= -0.05 negative, else neutral."""
    if c >= 0.05:
        return "positive"
    if c <= -0.05:
        return "negative"
    return "neutral"

# Score the original (uncleaned) text and keep only the compound value.
sent_scores = df["Data"].astype(str).apply(lambda t: sia.polarity_scores(t)["compound"])
df["sentiment_score"] = sent_scores
df["sentiment_label"] = df["sentiment_score"].apply(sentiment_label)

In [None]:
# Save CSV with sentiment + predictions (merge predictions_df on text)
# Attach baseline predictions where possible by index:
# We already saved predictions_df for the test subset; let's save overall sentiment + labels for full data.
sentiment_csv = os.path.join(OUTPUT_FOLDER, "blogs_with_sentiment.csv")
df.to_csv(sentiment_csv, index=False)
print("Saved sentiment-annotated CSV to:", sentiment_csv)

In [None]:
# -----------------------
# Summary JSON for assignment
# -----------------------
class_names = list(le.classes_)
summary_file = os.path.join(OUTPUT_FOLDER, "nb_summary.json")
# Values are cast to built-in int/float before being written as JSON.
summary = {
    "n_documents": int(len(df)),
    "n_classes": int(len(class_names)),
    "classes": class_names,
    "baseline_accuracy": float(acc),
    "tuned_grid_best": gs.best_params_,
    "tuned_cv_score": float(gs.best_score_),
    "tuned_test_accuracy": float(acc_gs)
}
with open(summary_file, "w") as f:
    json.dump(summary, f, indent=2)

In [None]:
print("\nAll done. Outputs saved to:", OUTPUT_FOLDER)
print("Key files:")
# One line per artifact written by the cells above.
for artifact_line in (
    " - baseline_classification_report.txt",
    " - baseline_confusion_matrix.csv/png",
    " - baseline_predictions.csv",
    " - nb_baseline.joblib",
    " - nb_tuned_pipeline.joblib (GridSearch best)",
    " - tuned_classification_report.txt",
    " - blogs_with_sentiment.csv",
    " - nb_summary.json",
):
    print(artifact_line)