In [1]:
import io
import requests
import zipfile
import pandas as pd

# Zip archives hosted by the UCI Machine Learning Repository.
# YouTube comment spam collection.
ytb_spam_df_url = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    '00380/YouTube-Spam-Collection-v1.zip'
)
# SMS spam collection.
sms_spam_df_url = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    '00228/smsspamcollection.zip'
)

### EXTRACTING SPAM DATA FROM UCI ARCHIVES

Sources:
- youtube
- sms 
- emails

In [2]:
def get_df_data_by_url(url: str, timeout: float = 30.0) -> pd.DataFrame:
    """Download a zip archive and load its tabular content into one DataFrame.

    Member selection, in order:

    * exactly one ``.csv`` member: parsed with ``pd.read_csv`` defaults;
    * no ``.csv`` member: the first member whose name contains no ``.`` is
      parsed as a header-less, tab-separated file;
    * several ``.csv`` members: each one is decoded as latin-1, parsed, and
      the frames are concatenated (every file keeps its own row labels).

    Args:
        url: Address of the zip archive.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the notebook forever.

    Returns:
        The loaded DataFrame.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the archive contains neither a ``.csv`` member nor a
            member without an extension.
    """
    response = requests.get(url, timeout=timeout)
    # Surface HTTP failures here rather than as a confusing BadZipFile below.
    response.raise_for_status()

    # The context managers close the archive and its members once parsed.
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
        member_names = zip_file.namelist()
        csv_files = [name for name in member_names if name.endswith(".csv")]

        if len(csv_files) == 1:
            with zip_file.open(csv_files[0]) as member:
                return pd.read_csv(member)

        if not csv_files:
            data_file = [name for name in member_names if '.' not in name]

            # One extension-less member is enough; only an empty list is an error.
            if not data_file:
                raise ValueError("No CSV file found and no data file without extension found.")

            with zip_file.open(data_file[0]) as member:
                return pd.read_csv(member, sep='\t', header=None)

        # Multiple CSV files: decode them while the archive is still open.
        csv_data = [zip_file.read(name).decode("latin-1") for name in csv_files]

    return pd.concat([pd.read_csv(io.StringIO(data)) for data in csv_data])

In [3]:
# Each call performs a fresh HTTP download through get_df_data_by_url; nothing
# is cached, so re-running this cell hits the network again.
ytb_df = get_df_data_by_url(ytb_spam_df_url)
sms_df = get_df_data_by_url(sms_spam_df_url)

### TRANSFORMING THE DATASETS
1. Remove unused labels
2. Transform categorical data to binary
3. Join the DataSets
4. Transform Dataframe to Vector

In [4]:
# Text label -> binary target used throughout the notebook.
mapping = {'spam': 1, 'ham': 0}

# Rebuild the frame from its first two columns (label, message text) instead
# of renaming in place. On a second run the frame already carries `source`,
# and assigning a two-name column list to it would fail with a length mismatch.
sms_df = sms_df.iloc[:, :2].copy()
sms_df.columns = ['class', 'body']
sms_df['source'] = 'sms'
# replace() leaves values that are already 0/1 untouched, so re-runs are no-ops.
sms_df['class'] = sms_df['class'].replace(mapping)

sms_df.head()

Unnamed: 0,class,body,source
0,0,"Go until jurong point, crazy.. Available only ...",sms
1,0,Ok lar... Joking wif u oni...,sms
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,sms
3,0,U dun say so early hor... U c already then say...,sms
4,0,"Nah I don't think he goes to usf, he lives aro...",sms


In [5]:
# Keep only the comment text and its label, rename them to the shared
# `body` / `class` schema and tag every row with its origin.
ytb_df = (
    ytb_df
    .drop(columns=['COMMENT_ID', 'AUTHOR', 'DATE', 'Unnamed: 0'])
    .rename(columns={'CLASS': 'class', 'CONTENT': 'body'})
    .assign(source='youtube')
)

In [6]:
# Stack both sources into one frame. ignore_index=True gives every row a
# unique 0..n-1 label; without it each input keeps its own labels, so the same
# label occurs several times and label-based lookups become ambiguous.
spam_df = pd.concat([sms_df, ytb_df], ignore_index=True)
spam_df.head()

Unnamed: 0,class,body,source
0,0.0,"Go until jurong point, crazy.. Available only ...",sms
1,0.0,Ok lar... Joking wif u oni...,sms
2,1.0,Free entry in 2 a wkly comp to win FA Cup fina...,sms
3,0.0,U dun say so early hor... U c already then say...,sms
4,0.0,"Nah I don't think he goes to usf, he lives aro...",sms


#### Converting the labels to Vectors

In [14]:
# Pull the message bodies and their labels out as plain, position-aligned lists.
texts = spam_df['body'].to_list()
classes = spam_df['class'].to_list()

# Spot-check one label/text pair.
(classes[120], texts[120])

(1.0,
 'PRIVATE! Your 2004 Account Statement for 07742676969 shows 786 unredeemed Bonus Points. To claim call 08719180248 Identifier Code: 45239 Expires')

In [18]:
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Passed to Tokenizer(num_words=...); exported later under the "max_words" key.
MAX_NUM_WORDS = 280
# Passed to pad_sequences(maxlen=...); exported later under "max_seq_length".
MAX_SEQ_LENGTH = 300

# NOTE(review): the tokenizer is fitted on every text, i.e. before the
# train/test split that happens further down, so texts that end up in the test
# split also contribute to the vocabulary. Confirm this is intended.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)

# Must run after fit_on_texts: converts each text with the fitted tokenizer.
sequences = tokenizer.texts_to_sequences(texts)
# Not referenced again in the cells visible here; kept for inspection.
word_index = tokenizer.word_index

# X: the sequences passed through pad_sequences with maxlen=MAX_SEQ_LENGTH.
X = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH)
# y: labels passed through to_categorical (one-hot encoding per the Keras docs;
# presumably two columns for ham/spam, confirm on the actual array).
y = to_categorical(np.asarray(classes))


### LOADING DATASET AND METADATA

#### Split & Export Vectors

In [None]:
import pickle
from sklearn.model_selection import train_test_split

# train_test_split returns (X_first, X_second, y_first, y_second).
# Step 1: half of the data for training, the other half held out.
X_train, X_t2, y_train, y_t2 = train_test_split(X, y, test_size=0.5, random_state=42)
# Step 2: divide the held-out half into test (67%) and validation (33%).
# The label outputs go to y_test / y_valid so that y_train keeps the training
# labels and every X_* array stays aligned with its y_* counterpart.
X_test, X_valid, y_test, y_valid = train_test_split(X_t2, y_t2, test_size=0.33, random_state=42)

In [21]:
import pathlib

# All artifacts go to an "exports" folder located in the parent of the current
# working directory; the folder is created on demand.
BASE_DIR = pathlib.Path().resolve().parent
EXPORT_DIR = BASE_DIR / "exports"
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

SPAM_DATASET_PATH = EXPORT_DIR / "spam-dataset.csv"
METADATA_EXPORT_PATH = EXPORT_DIR / 'spam-metadata.pkl'
TOKENIZER_EXPORT_PATH = EXPORT_DIR / 'spam-tokenizer.json'

# 1) The merged, labelled dataset as CSV.
spam_df.to_csv(SPAM_DATASET_PATH)

# Everything a training notebook needs besides the tokenizer itself.
training_data = {
    "X_train": X_train,
    "X_test": X_test,
    "y_train": y_train,
    "y_test": y_test,
    "max_words": MAX_NUM_WORDS,
    "max_seq_length": MAX_SEQ_LENGTH,
    "label_legend": mapping,
}

# 2) The fitted tokenizer, serialised to JSON.
tokenizer_json = tokenizer.to_json()
TOKENIZER_EXPORT_PATH.write_text(tokenizer_json)

# 3) The vectors and metadata, pickled.
with METADATA_EXPORT_PATH.open('wb') as metadata_file:
    pickle.dump(training_data, metadata_file)