In [5]:
import pandas as pd
import requests
import io  # Corrected: io is a top-level module
import os

# Local storage location for the dataset.
# DATA_DIR is an absolute Windows path (forward slashes) to a folder on one
# specific user's desktop; DATA_PATH is the CSV inside it that
# download_and_save() writes.
# NOTE(review): the path is hard-coded to this machine — adjust before running elsewhere.
DATA_DIR = "C:/Users/AYUSH/Desktop/CT08/"
DATA_PATH = os.path.join(DATA_DIR, "spam_data.csv")

def download_and_save():
    """Download the SMS spam CSV and save a two-column copy to ``DATA_PATH``.

    The file is fetched over HTTP (10 second timeout), parsed with pandas,
    reduced to a label column and a message column (renamed to ``label`` and
    ``message``), and written to ``DATA_PATH`` without the index.
    ``DATA_DIR`` is created first if it does not exist.

    Returns:
        None. Progress and the final status are reported with ``print``.

    Raises:
        Nothing. Any exception (network, parsing, filesystem) is caught and
        printed as ``--- FAILED ---`` followed by the error text.
    """
    # GitHub-hosted copy of the SMS Spam Collection CSV
    url = "https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv"

    try:
        # exist_ok=True creates the folder only when needed and avoids the
        # check-then-create race of a separate os.path.exists() test.
        os.makedirs(DATA_DIR, exist_ok=True)

        print("Connecting to server...")
        # A timeout makes a stalled connection fail instead of hanging the kernel.
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # response.content is bytes, so it is wrapped in BytesIO for read_csv.
        # The file is decoded as latin-1 because it contains special characters.
        print("Processing data...")
        raw = pd.read_csv(io.BytesIO(response.content), encoding='latin-1')

        # The Kaggle layout names the columns 'v1'/'v2' and may carry extra
        # empty 'Unnamed' columns. If a mirror uses different headers, fall
        # back to the first two columns instead of failing with a KeyError.
        if 'v1' in raw.columns and 'v2' in raw.columns:
            df = raw[['v1', 'v2']]
        else:
            df = raw.iloc[:, :2]
        df.columns = ['label', 'message']

        # Save locally to the project folder
        df.to_csv(DATA_PATH, index=False)
        print("--- SUCCESS ---")
        print(f"File created at: {DATA_PATH}")
        print(f"Total rows downloaded: {len(df)}")

    except Exception as e:
        # Best-effort cell: report the problem rather than raising.
        print("--- FAILED ---")
        print(f"Error Details: {e}")

# In a notebook cell __name__ is "__main__", so the download runs whenever
# this cell is executed.
if __name__ == "__main__":
    download_and_save()

Connecting to server...
Processing data...
--- SUCCESS ---
File created at: C:/Users/AYUSH/Desktop/CT08/spam_data.csv
Total rows downloaded: 5572


In [10]:
import pandas as pd
import requests
import io
import os

# Local storage location for the standardized dataset.
# FULL_PATH is the CSV that initialize_dataset() writes inside DATA_DIR.
# NOTE(review): absolute path tied to one user's desktop — adjust before running elsewhere.
DATA_DIR = "C:/Users/AYUSH/Desktop/CT08/"
FULL_PATH = os.path.join(DATA_DIR, "spam_data.csv")

def initialize_dataset():
    """Download the SMS spam CSV, standardize its columns, and save it to ``FULL_PATH``.

    The file is fetched over HTTP (10 second timeout) and parsed with pandas.
    The columns ``v1``/``v2`` are used when present; otherwise the first two
    columns are taken regardless of their names. The two columns are renamed
    to ``label`` and ``message``, rows with missing values are dropped, and
    the result is written to ``FULL_PATH`` without the index. ``DATA_DIR`` is
    created if it does not exist.

    Returns:
        None. The detected columns and the final status are printed.

    Raises:
        Nothing. Any exception is caught and printed as ``Error: <details>``.
    """
    url = "https://raw.githubusercontent.com/amankharwal/SMS-Spam-Detection/master/spam.csv"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        # response.content is bytes, hence the BytesIO wrapper.
        df = pd.read_csv(io.BytesIO(response.content), encoding='latin-1')

        # --- ROBUST COLUMN DETECTION ---
        print(f"Original columns found: {df.columns.tolist()}")

        # Fail with a clear message instead of the opaque "Length mismatch"
        # that the rename below would raise on a one-column file.
        if df.shape[1] < 2:
            raise ValueError(
                f"Expected at least 2 columns (label, message), found {df.shape[1]}"
            )

        # Prefer the Kaggle header names; otherwise select by position.
        if 'v1' in df.columns and 'v2' in df.columns:
            df = df[['v1', 'v2']]
        else:
            # Take the first two columns regardless of their names
            df = df.iloc[:, :2]

        df.columns = ['label', 'message']

        # Remove rows where either the label or the message is missing
        df = df.dropna()

        # exist_ok=True avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(DATA_DIR, exist_ok=True)

        df.to_csv(FULL_PATH, index=False)
        print(f"SUCCESS: Dataset standardized and saved to {FULL_PATH}")

    except Exception as e:
        # Best-effort cell: report the problem rather than raising.
        print(f"Error: {e}")

# In a notebook cell __name__ is "__main__", so this runs whenever the cell
# is executed.
if __name__ == "__main__":
    initialize_dataset()

Original columns found: ['class', 'message', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
SUCCESS: Dataset standardized and saved to C:/Users/AYUSH/Desktop/CT08/spam_data.csv


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# 1. LOAD FROM LOCAL DISK
# Reads the CSV written by the dataset cells above; it is expected to contain
# a 'label' column and a 'message' column.
path = "C:/Users/AYUSH/Desktop/CT08/spam_data.csv"
df = pd.read_csv(path)

# 2. PRE-PROCESSING
# Encode the text labels as integers: ham -> 0, spam -> 1.
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

# Series.map leaves NaN for any value missing from the mapping. Catch that
# here with a clear message instead of letting it surface later inside fit().
unmapped_labels = df.loc[df['label_num'].isna(), 'label']
if not unmapped_labels.empty:
    raise ValueError(
        f"Unexpected label values in {path}: "
        f"{sorted(set(unmapped_labels.astype(str)))}; expected 'ham' or 'spam'"
    )

# 3. SPLIT DATA
# 80/20 train/test split; random_state fixes the shuffle so results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label_num'],
    test_size=0.2,
    random_state=42
)

# 4. BUILD THE ML PIPELINE
# The pipeline chains TF-IDF vectorization (English stop words removed) with a
# multinomial Naive Bayes classifier, so fit/predict operate on raw text.
model = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('classifier', MultinomialNB())
])

# 5. TRAIN AND EVALUATE
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Per-class precision/recall/F1 on the held-out test set (class 0 = ham, 1 = spam).
print("--- MODEL PERFORMANCE ---")
print(classification_report(y_test, predictions))

--- MODEL PERFORMANCE ---
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

