<a href="https://colab.research.google.com/github/Sujatha2108/Email-SMS-Spam-Detector/blob/main/Email_Spam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets transformers torch scikit_learn flask pyngrok huggingface_hub

In [None]:
from google.colab import userdata
from huggingface_hub import login

# Read the Hugging Face access token from Colab's secret store so that it
# never appears in the notebook source.
token = userdata.get('HF_TOKEN')

# Authenticate this session against the Hugging Face Hub.
login(token=token)

In [None]:
from huggingface_hub import whoami

# Confirm the login worked by showing only the account name. The full whoami()
# payload can carry further account details (presumably e-mail / token
# metadata - verify), which should not end up in saved notebook output.
print("Logged in as:", whoami()["name"])

In [None]:
import pandas as pd
from datasets import load_dataset

# Hub identifier of the dataset used throughout this notebook.
DATASET_ID = "sms_spam"
dataset = load_dataset(DATASET_ID)

In [None]:
 # Quick look at the loaded dataset: its keys, the train size and the first
 # three messages with their labels.
 train_split = dataset['train']
 print("Dataset Keys : ", dataset.keys())
 print("Number of examples : ", len(train_split))
 print("\n Sample data points:")
 for idx in range(3):
     example = train_split[idx]
     print(f"Message {idx + 1} :")
     print("Text: ", example["sms"])
     print("Label:", "spam" if example['label'] == 1 else "Ham")
     print("-" * 40)

In [None]:
# Convert the train split to a DataFrame and add a readable label column
# (label 1 is shown as 'spam', anything else as 'ham').
df = pd.DataFrame(dataset['train']).assign(
    label_name=lambda frame: frame['label'].eq(1).map({True: 'spam', False: 'ham'})
)
print("\n DataFrame head: ")
df.head()

In [None]:
# Peek at the last rows of the DataFrame.
df.tail()


In [None]:
# Summary statistics of the DataFrame (pandas' default column selection).
df.describe()


In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Show a few random rows; a fixed random_state keeps the displayed rows the
# same on every Restart & Run All.
df.sample(5, random_state=42)

In [None]:
# Count how many messages carry each label name.
class_counts = df['label_name'].value_counts()
print("\nClass distribution:")
print(class_counts)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the number of messages per class. The plot is drawn on an
# explicit Axes and is no longer wrapped in print(), which only echoed the
# Axes object's repr above the figure.
class_counts = df['label_name'].value_counts()
print("\nClass distribution:")
fig, ax = plt.subplots()
class_counts.plot(kind='bar', ax=ax)
ax.set(title="Class distribution", xlabel="Class", ylabel="Number of messages")
plt.show()

In [None]:
# Box plot over the per-class message counts (drawn without print(), which
# only echoed the Axes repr).
# NOTE(review): label_name has two values, so this summarises just two numbers
# and conveys little - a box plot of message length per class may be what was
# intended; confirm.
fig, ax = plt.subplots()
df['label_name'].value_counts().plot(kind='box', ax=ax)
ax.set(title="Spread of per-class message counts", ylabel="Number of messages")
plt.show()

In [None]:
# Pie chart of the class shares; autopct prints each slice's percentage.
# Drawn without print() (which only echoed the Axes repr) and finished with
# plt.show() like the other plotting cells.
fig, ax = plt.subplots()
df['label_name'].value_counts().plot(kind='pie', ax=ax, autopct='%1.1f%%')
ax.set(title="Class share", ylabel="")
plt.show()

In [None]:
# Hold out 20% of the data for evaluation, stratified on the label so both
# splits keep the same spam/ham ratio. The fixed seed makes the split (and so
# every metric computed on it) reproducible across runs.
train_test = dataset['train'].train_test_split(
    test_size=0.2,
    stratify_by_column='label',
    seed=42,
)
train_dataset = train_test['train']
test_dataset = train_test['test']

In [None]:
# Report how many examples ended up in each split.
print("Train/Test split sizes:")
for split_name, split in (("Train", train_dataset), ("Test", test_dataset)):
    print(f"{split_name}: {len(split)}")

In [None]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is loaded here.
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def tokenize_batch(batch):
    """Tokenize the ``sms`` texts of a batch, padded/truncated to 128 tokens."""
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(batch["sms"], **encode_options)


# Run the tokenizer over both splits in batches.
train_dataset, test_dataset = (
    split.map(tokenize_batch, batched=True) for split in (train_dataset, test_dataset)
)


In [None]:

# Rename the target column to "labels" and expose only the model inputs and
# the labels as torch tensors.
TORCH_COLUMNS = ["input_ids", "attention_mask", "labels"]

train_dataset = train_dataset.rename_column("label", "labels")
test_dataset = test_dataset.rename_column("label", "labels")

for split in (train_dataset, test_dataset):
    split.set_format(type="torch", columns=TORCH_COLUMNS)

In [None]:
!pip install evaluate

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

# Load the classifier from the same checkpoint variable the tokenizer was
# loaded from (`model_checkpoint`), so changing the checkpoint in one place
# cannot leave tokenizer and model mismatched. num_labels=2: binary task.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute the evaluation metric for a batch of predictions.

    Args:
        eval_pred: a ``(logits, labels)`` pair; the index of the largest logit
            along the last axis is taken as the predicted class.

    Returns:
        Whatever ``metric.compute`` returns for those predictions and labels.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:


# Training configuration, kept compatible with older transformers releases
# (evaluation_strategy is deliberately not passed).
training_config = dict(
    output_dir="./results",
    logging_dir="./logs",
    save_steps=500,       # checkpoint every 500 steps
    logging_steps=100,    # log every 100 steps
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to=[],         # no reporting integrations
)
training_args = TrainingArguments(**training_config)

# The Trainer ties together model, data, tokenizer and the metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model.
trainer.train()

# Evaluation is run once, explicitly, after training has finished.
metrics = trainer.evaluate()
print("Final evaluation metrics:", metrics)

In [None]:
# Upload the model and its tokenizer to the same Hub repository.
HUB_REPO_NAME = "email-spam-detector-distilbert"
model.push_to_hub(HUB_REPO_NAME)
tokenizer.push_to_hub(HUB_REPO_NAME)

In [None]:
%%writefile app.py
from flask import Flask, render_template, request
from transformers import pipeline

app = Flask(__name__)

# Load pipeline from your fine-tuned model on Hugging Face
spam_clf = pipeline("text-classification", model="SujathaAlugoju/email-spam-detector-distilbert")

@app.route("/", methods=["GET", "POST"])
def home():
    result = ""
    message = ""
    if request.method == "POST":
        message = request.form.get("message", "").strip()
        if message:
            pred = spam_clf(message)[0]
            label = "🚨 Spam ❌" if pred['label'] == "LABEL_1" else "✅ Not Spam"
            result = f"{label} (Confidence: {pred['score']:.2f})"
    return render_template("index.html", result=result, message=message)

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8000)


In [None]:
# Create the templates/ directory (-p: no error if it already exists).
!mkdir -p templates

In [None]:
# Create the static/ directory (-p: no error if it already exists).
!mkdir -p static

In [None]:
%%writefile templates/index.html
<!DOCTYPE html>
<html lang="en">
<head>
    {# Template variables: `message` (text to re-display in the form) and
       `result` (verdict string; the result box is hidden when it is empty). #}
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>📧 Spam Detector</title>
    <link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}">
</head>
<body>
    <div class="container">
        <h1>📧 Email / SMS Spam Detector</h1>
        <form method="post">
            {# aria-label gives the field an accessible name; the placeholder alone is not one. #}
            <textarea name="message" rows="5" aria-label="Message to check" placeholder="Enter your message here...">{{ message }}</textarea>
            <button type="submit">Check 🚀</button>
        </form>

        {% if result %}
        <div class="result">
            <h2>{{ result }}</h2>
        </div>
        {% endif %}
    </div>
</body>
</html>


In [None]:
%%writefile static/style.css
/* Stylesheet for the spam-detector page (linked from templates/index.html). */

/* Page: dark gradient background with the card centred on both axes. */
body {
    font-family: 'Segoe UI', sans-serif;
    background: linear-gradient(135deg, #141E30, #243B55);
    color: #fff;
    display: flex;
    justify-content: center;
    align-items: center;
    /* min-height (not height) lets the page grow and scroll when the card is
       taller than the viewport instead of cutting off its top. */
    min-height: 100vh;
    margin: 0;
}

/* Card that holds the heading, the form and the result. */
.container {
    text-align: center;
    width: 50%;
    background: rgba(255, 255, 255, 0.1);
    padding: 30px;
    border-radius: 16px;
    box-shadow: 0 4px 20px rgba(0,0,0,0.5);
}

h1 {
    margin-bottom: 20px;
    color: #FFD700;
}

/* Message input. */
textarea {
    width: 90%;
    padding: 12px;
    border-radius: 10px;
    border: none;
    outline: none;
    font-size: 16px;
    margin-bottom: 15px;
}

/* The default outline is removed above, so keyboard users get a visible
   focus ring here instead. */
textarea:focus {
    box-shadow: 0 0 0 3px #FFD700;
}

/* Submit button. */
button {
    background: #FFD700;
    color: #000;
    font-weight: bold;
    padding: 12px 20px;
    border-radius: 8px;
    border: none;
    cursor: pointer;
    transition: background 0.3s ease-in-out;
}

button:hover {
    background: #FFA500;
}

/* Verdict box shown after a submission. */
.result {
    margin-top: 20px;
    font-size: 20px;
    font-weight: bold;
    padding: 15px;
    border-radius: 12px;
    background: rgba(0, 0, 0, 0.4);
}


In [None]:
# ✅ Kill any running Flask/ngrok processes
!pkill -f flask || echo "No flask running"
!pkill -f ngrok || echo "No ngrok running"

In [None]:
# ✅ Check if port 8000 is occupied: lsof lists the processes using the port;
# when it exits non-zero, the fallback message is printed instead.
!lsof -i :8000 || echo "Port 8000 is free"

In [None]:
# (Optional) If any PID shows up in the above output, kill it:
!kill -9 23863

In [None]:
# ✅ Run Flask in the background and log output:
# nohup plus the trailing & start app.py as a background process, and both
# stdout and stderr are redirected into flask.log.
!nohup python app.py > flask.log 2>&1 &

In [None]:
# Show the last 50 lines of the Flask log.
!tail -n 50 flask.log


In [67]:
# ✅ Start ngrok tunnel
from pyngrok import ngrok, conf
conf.get_default().auth_token = "NGROK_TOKEN"

public_url = ngrok.connect(8000)
print("🌍 Public URL:", public_url)

# ✅ Check Flask logs (useful if error happens)
!sleep 3 && tail -n 20 flask.log

🌍 Public URL: NgrokTunnel: "https://rex-hottish-noncontrollablely.ngrok-free.dev" -> "http://localhost:8000"

--- Last 20 lines of Flask Log ---
