# Bibliotekos

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from nltk.chunk.named_entity import shape
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
from collections import Counter
import spacy
from sklearn.ensemble import StackingClassifier

# Failo skaitymas

In [None]:

# Read the raw dataset, decoding the file as latin-1.
df = pd.read_csv("spam.csv", encoding="latin-1")

# Keep only the first two columns and give them fixed names.
df = df.iloc[:, :2]
df.columns = ['label', 'text']
# Lower-case the labels so differently-cased values compare equal.
df['label'] = df['label'].str.lower()

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would emit an extra "None" line after the summary.
df.info()


In [None]:
# Encode the textual class labels as integers: spam -> 1, ham -> 0.
label_codes = {'spam': 1, 'ham': 0}
df['label'] = df['label'].map(label_codes)

# Quick look at the first rows after encoding.
print(df.head())

In [None]:
def _alpha_word_count(message):
    """Return how many whitespace-separated tokens of *message* are purely alphabetic."""
    return sum(1 for token in str(message).split() if token.isalpha())


def _non_space_char_count(message):
    """Return the length of *message* after every space character is removed."""
    return len(str(message).replace(" ", ""))


# Per-message length features: alphabetic word count and space-free character count.
df['word_count'] = df['text'].apply(_alpha_word_count)
df['char_count'] = df['text'].apply(_non_space_char_count)

def plot_histograms(df, suffix=""):
    """Plot per-class histograms of message length in words and in characters.

    Two figures are shown one after the other: one over the 'word_count'
    column and one over the 'char_count' column, each split by 'label'.

    Args:
        df: DataFrame with 'word_count', 'char_count' and 'label' columns.
            The colour palette is keyed on the label values 0 and 1.
        suffix: Text appended to both plot titles.
    """
    # (column to plot, x-axis label, unit shown in the title) for each figure.
    panels = [
        ('word_count', "Žodžių skaičius", "žodžiai"),
        ('char_count', "Simbolių skaičius (be tarpų)", "simboliai"),
    ]
    for column, x_label, unit in panels:
        plt.figure(figsize=(10, 5))
        sns.histplot(
            df,
            x=column,
            hue='label',
            bins=30,
            kde=False,
            palette={0: 'skyblue', 1: 'orange'},
        )
        plt.xlabel(x_label)
        plt.ylabel("Žinučių skaičius")
        plt.title(f"Žinučių ilgiai ({unit}) {suffix}")
        plt.show()

# Length histograms for the unfiltered messages.
plot_histograms(df, suffix="(original)")

from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter

def plot_wordcloud(df, label, title):
    """Show a word cloud built from raw token frequencies of one class.

    Args:
        df: DataFrame with 'label' and 'text' columns.
        label: Class value to select; 1 is drawn with the 'Reds' colormap,
            any other value with 'Blues'.
        title: Figure title.
    """
    # Drop missing messages and coerce the rest to str so that join() cannot
    # raise TypeError on NaN or other non-string cells.
    messages = df.loc[df['label'] == label, 'text'].dropna().astype(str)

    # Frequencies are counted on whitespace-separated tokens, exactly as typed.
    word_freq = Counter(" ".join(messages).split())

    wc = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap='Reds' if label == 1 else 'Blues',
    ).generate_from_frequencies(word_freq)

    plt.figure(figsize=(10, 6))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=16)
    plt.show()


# Word clouds for the unfiltered messages of each class (titles restored from
# mis-encoded text to proper Lithuanian).
plot_wordcloud(df, 1, "Dažniausi žodžiai SPAM")
plot_wordcloud(df, 0, "Dažniausi žodžiai HAM")

def get_top_words(df, label, n=20):
    """Return the *n* most frequent whitespace-separated tokens of one class.

    Args:
        df: DataFrame with 'label' and 'text' columns.
        label: Class value whose messages are counted.
        n: Maximum number of (token, count) pairs to return.

    Returns:
        A list of (token, count) tuples ordered by descending count; empty
        when no message matches *label*.
    """
    # Missing messages are skipped and the rest coerced to str, otherwise
    # join() raises TypeError on NaN or other non-string cells.
    messages = df.loc[df['label'] == label, 'text'].dropna().astype(str)
    return Counter(" ".join(messages).split()).most_common(n)

# Most frequent tokens per class (1 = spam, 0 = ham) in the unfiltered text.
top_spam, top_ham = (get_top_words(df, class_id) for class_id in (1, 0))
print(f"Top SPAM: {top_spam}")
print(f"Top HAM: {top_ham}")

In [None]:
# Hand-picked stop-word list, written as one whitespace-separated string.
my_stop_words = set(
    "the a an and or in on of to for is are "
    "was were be been it this that with as by at "
    "from about into up out so if then but you your have "
    "u".split()
)


def remove_stopwords(text):
    """Lower-case *text* and drop every token listed in ``my_stop_words``.

    The input is converted with ``str()`` first, split on whitespace, and the
    surviving tokens are re-joined with single spaces.
    """
    kept = []
    for token in str(text).lower().split():
        if token in my_stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

# Work on a copy so the original frame keeps its unfiltered text.
df_sw = df.copy()
df_sw['text'] = df_sw['text'].apply(remove_stopwords)

# The copied word_count/char_count columns still describe the unfiltered text,
# so recompute them; otherwise the "(be stopwords)" histograms would be
# identical to the original ones.
df_sw['word_count'] = df_sw['text'].apply(
    lambda x: len([w for w in str(x).split() if w.isalpha()])
)
df_sw['char_count'] = df_sw['text'].apply(lambda x: len(str(x).replace(" ", "")))


In [None]:
# Repeat the exploratory plots on the stop-word-filtered copy.
plot_histograms(df_sw, "(be stopwords)")
for class_id, cloud_title in ((1, "SPAM (be stopwords)"), (0, "HAM (be stopwords)")):
    plot_wordcloud(df_sw, class_id, cloud_title)

# Most frequent remaining tokens per class.
top_spam_sw = get_top_words(df_sw, 1)
top_ham_sw = get_top_words(df_sw, 0)
print(f"Top SPAM (be stopwords): {top_spam_sw}")
print(f"Top HAM (be stopwords): {top_ham_sw}")


# Vektorizavimas/mokymas


In [None]:
# Classifiers compared in the experiments below. Every estimator that accepts
# a seed gets random_state=5, matching the seeds used for the ensemble's base
# models, so repeated runs produce the same accuracies.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=5),
    "Linear SVM": LinearSVC(random_state=5),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=5),
    "KNN": KNeighborsClassifier(n_neighbors=5),
}

# spaCy pipeline whose document vectors back the "spacy_w2v" method.
nlp = spacy.load("en_core_web_md")

def vectorize_text(df, method="bow", vectorizer=None):
    """Turn the 'text' column of *df* into a feature matrix.

    Args:
        df: DataFrame with a 'text' column; values are converted to str.
        method: "bow" (CountVectorizer), "tfidf" (TfidfVectorizer) or
            "spacy_w2v" (document vectors from the module-level ``nlp``).
        vectorizer: For "bow"/"tfidf": None fits a new vectorizer on the
            texts; a supplied (already fitted) vectorizer is used only to
            transform. It is used as given -- its type is not checked against
            *method*. Ignored for "spacy_w2v".

    Returns:
        (X, vectorizer): the feature matrix and the vectorizer that produced
        it; the vectorizer is None for "spacy_w2v".

    Raises:
        ValueError: If *method* is not one of the three supported names.
    """
    texts = df['text'].astype(str).tolist()

    if method == "spacy_w2v":
        # nlp.pipe processes the texts in batches instead of one call per text.
        X = np.array([doc.vector for doc in nlp.pipe(texts)])
        return X, None

    vectorizer_classes = {"bow": CountVectorizer, "tfidf": TfidfVectorizer}
    if method not in vectorizer_classes:
        raise ValueError("Nežinomas metodas")

    if vectorizer is None:
        # Fit: learn the vocabulary (and idf weights for tfidf) from the texts.
        vectorizer = vectorizer_classes[method]()
        X = vectorizer.fit_transform(texts)
    else:
        # Reuse the existing vocabulary.
        X = vectorizer.transform(texts)

    return X, vectorizer

def train_and_evaluate(models, X_train, X_test, y_train, y_test, title=""):
    """Fit every model, print its test metrics and collect the accuracies.

    Each estimator in *models* is fitted in place on the training data, so
    the objects in the dict are left fitted after the call.

    Args:
        models: Mapping of display name -> estimator with fit/predict.
        X_train, X_test: Feature matrices for training and testing.
        y_train, y_test: Matching label arrays.
        title: Heading printed before the per-model output.

    Returns:
        DataFrame with columns "Model" and "Accuracy", sorted by accuracy in
        descending order.
    """
    results = []
    print(f"\n=== {title} ===")
    for name, model in models.items():
        print(f"🔹 Treniruojamas: {name}")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        results.append((name, acc))
        print(f"✅ Tikslumas: {acc:.4f}")
        print(classification_report(y_test, y_pred, digits=3))
        print("-" * 50)

    summary = pd.DataFrame(results, columns=["Model", "Accuracy"])
    return summary.sort_values(by="Accuracy", ascending=False)

def plot_results(df, title=""):
    """Draw a horizontal bar chart of model accuracies.

    Args:
        df: DataFrame with "Model" and "Accuracy" columns.
        title: Figure title.
    """
    # Ascending sort so the most accurate model ends up as the top bar.
    ordered = df.sort_values(by="Accuracy", ascending=True)
    model_names = ordered["Model"]
    accuracies = ordered["Accuracy"]

    plt.figure(figsize=(10, 6))
    plt.barh(model_names, accuracies, color='skyblue')

    # Write each value just to the right of the end of its bar.
    for row_pos, accuracy in enumerate(accuracies):
        plt.text(accuracy + 0.002, row_pos, f"{accuracy:.3f}", va='center')

    plt.xlabel("Tikslumas")
    plt.title(title)
    plt.grid(axis='x', linestyle='--', alpha=0.6)
    plt.show()

# Train/Test padalinimas bei mokymas

In [None]:
# Run every vectorisation method on both text variants and keep each result
# table under the key "<method>_<variant>".
y = df['label'].values
results_all = {}
datasets = [(df, "original"), (df_sw, "no_stopwords")]

for method in ("bow", "tfidf", "spacy_w2v"):
    print(f"\n\n### Vektorizacija: {method.upper()} ###")
    for data, label in datasets:
        run_title = f"{method.upper()} - {label}"

        # NOTE: the vectorizer is fitted on the full frame before the split.
        X, vectorizer = vectorize_text(data, method)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=5
        )

        results = train_and_evaluate(
            models, X_train, X_test, y_train, y_test, title=run_title
        )
        results_all[f"{method}_{label}"] = results
        plot_results(results, title=run_title)

In [None]:
# Build one wide table: a row per model and one "Accuracy_<experiment>" column
# per entry of results_all. Each result column is renamed before merging, so
# the names never collide and no positional renaming is needed afterwards.
comparison_df = pd.DataFrame({"Model": list(models.keys())})
for key, df_res in results_all.items():
    comparison_df = comparison_df.merge(
        df_res.rename(columns={"Accuracy": f"Accuracy_{key}"}),
        on="Model",
        how="left",
    )

plt.figure(figsize=(15, 7))

# Colour palette, cycled if there are more experiments than colours.
colors = ["#4C72B0", "#55A868", "#C44E52", "#8172B2", "#CCB974", "#64B5CD"]

n_models = len(comparison_df)
n_methods = len(comparison_df.columns) - 1
x = np.arange(n_models)
# All bars of one model share 80% of the slot between neighbouring ticks.
width = 0.8 / n_methods

for i, col in enumerate(comparison_df.columns[1:]):
    plt.bar(
        # Shift each experiment's bars so the group is centred on the tick.
        x + i * width - (width * (n_methods - 1) / 2),
        comparison_df[col],
        width,
        label=col.replace("Accuracy_", "").upper(),
        alpha=0.9,
        edgecolor="black",
        linewidth=0.7,
        color=colors[i % len(colors)],
    )


plt.xticks(x, comparison_df["Model"], rotation=45, ha='right')
plt.ylabel("Tikslumas", fontsize=13)
plt.title("Modelių palyginimas pagal vektorizacijos metodus", fontsize=15, pad=10)
plt.ylim(0, 1.05)

plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.legend(title="Vektorizacija", title_fontsize=12, fontsize=10)

plt.tight_layout()
plt.show()

# Ensemble

In [None]:
# First-level estimators of the stacking ensemble, all seeded with 5.
logreg_base = LogisticRegression(max_iter=1000, random_state=5)
svm_base = LinearSVC(random_state=5)
forest_base = RandomForestClassifier(n_estimators=200, random_state=5)

base_models = [
    ('Logistic Regression', logreg_base),
    ('Linear SVM', svm_base),
    ('Random Forest', forest_base),
]

In [None]:
# Bag-of-words features of the unfiltered text, split 80/20 with a fixed seed.
X, vectorizer = vectorize_text(df, method='bow')
y = df['label'].values
split_parts = train_test_split(X, y, test_size=0.2, random_state=5)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Second-level (meta) estimator: a logistic regression over the base models'
# cross-validated outputs.
meta_model_svc = LogisticRegression(max_iter=1000, random_state=5)

stack_svc = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model_svc,
    passthrough=False,  # the meta model sees base-model outputs only
    cv=5,
)

stack_svc.fit(X_train, y_train)
y_pred = stack_svc.predict(X_test)

# Status marker restored from mis-encoded text to the intended check mark.
print("✅ Stacking tikslumas:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))


# Testavimas su https://archive.ics.uci.edu/dataset/228/sms+spam+collection

In [None]:
# Load the tab-separated external collection; the file has no header row.
# NOTE(review): default CSV quoting is applied here -- if messages contain
# unbalanced double quotes, rows may be merged; confirm the row count.
column_names = ["label", "text"]
data = pd.read_csv('SMSSpamCollection', sep="\t", header=None)
data.columns = column_names

# Same integer encoding as the training labels: spam -> 1, ham -> 0.
data['label'] = data['label'].map({'spam': 1, 'ham': 0})

In [None]:
# Fit a fresh TF-IDF vectorizer on the training frame. The `vectorizer`
# variable left over from the ensemble section is a fitted CountVectorizer;
# passing it here would silently yield bag-of-words counts even though
# 'tfidf' is requested, because a supplied vectorizer is used as-is.
X_df, vectorizer = vectorize_text(df, 'tfidf')

# Transform the external data with the vocabulary and idf weights learned above.
X_data, vectorizer = vectorize_text(data, 'tfidf', vectorizer=vectorizer)

In [None]:
# Refit the stacking ensemble on every row of the training frame, then score
# it on the external collection.
# NOTE(review): if spam.csv is an export of this same SMS collection, the
# figure below is training-set accuracy rather than a held-out score -- confirm.
stack_svc.fit(X_df, df['label'])

y_pred_data = stack_svc.predict(X_data)
external_accuracy = accuracy_score(data['label'], y_pred_data)

print(external_accuracy)