# Messages Monitoring Model

## Phase 1: Preprocessing the Data

### Load in the Required Libraries

In [68]:
# for dealing with data
import pandas as pd
import numpy as np
from datasets import load_dataset

# for data visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# for processing data
from sklearn.model_selection import train_test_split

# for evaluation metrics
from sklearn.metrics import accuracy_score

### Load in the Datasets

In [5]:
# download the kaggle datasets
# uncomment the following lines to download the datasets
# !kaggle datasets download -d uciml/sms-spam-collection-dataset
# !kaggle datasets download -d team-ai/spam-text-message-classification
# !wget https://raw.githubusercontent.com/DeshDSingh/SMS-SPAM-Detection/master/sms_spam.csv
# !unzip sms-spam-collection-dataset.zip
# !unzip spam-text-message-classification.zip
# !rm sms-spam-collection-dataset.zip
# !rm spam-text-message-classification.zip
# !mv sms_spam.csv ./../data

Downloading sms-spam-collection-dataset.zip to /Users/necro_kudo/Developer/misk-dsi/capstone-project
100%|█████████████████████████████████████████| 211k/211k [00:00<00:00, 680kB/s]
100%|█████████████████████████████████████████| 211k/211k [00:00<00:00, 678kB/s]
Downloading spam-text-message-classification.zip to /Users/necro_kudo/Developer/misk-dsi/capstone-project
100%|█████████████████████████████████████████| 208k/208k [00:00<00:00, 472kB/s]
100%|█████████████████████████████████████████| 208k/208k [00:00<00:00, 471kB/s]
Archive:  sms-spam-collection-dataset.zip
  inflating: spam.csv                
Archive:  spam-text-message-classification.zip
  inflating: SPAM text message 20170820 - Data.csv  


In [81]:
# read in the datasets
sms_data1 = pd.read_csv("../data/SPAM text message 20170820 - Data.csv")
sms_data2 = pd.read_csv("../data/spam.csv", encoding='latin-1')
sms_data3 = pd.DataFrame((load_dataset("sms_spam")['train']))
sms_data4 = pd.read_csv("./../data/sms_spam.csv")

Reusing dataset sms_spam (/Users/necro_kudo/.cache/huggingface/datasets/sms_spam/plain_text/1.0.0/53f051d3b5f62d99d61792c91acefe4f1577ad3e4c216fb0ad39e30b9f20019c)
100%|██████████| 1/1 [00:00<00:00, 304.95it/s]


In [82]:
# unify the features names
sms_data1.rename({"Category": "label", "Message": "message"}, axis=1, inplace=True)
sms_data1 = sms_data1[["message", "label"]]

# unify the features names & drop unwanted features
sms_data2.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1, inplace=True)
sms_data2.rename({"v1" : "label", "v2" : "message"}, axis=1, inplace=True)
sms_data2 = sms_data2[["message", "label"]]

# unify the features names
sms_data3.rename({"sms" : "message"}, axis=1, inplace=True)
sms_data3['label'] = sms_data3['label'].apply(lambda x: "spam" if x == 1 else "ham")

# unify the features names
sms_data4.rename({"type" : "label", "text" : "message"}, axis=1, inplace=True)
sms_data4 = sms_data4[["message", "label"]]

In [83]:
# concatenate the results to form a bigger dataframe of all the previous ones
sms_big = pd.concat([sms_data1, sms_data2, sms_data3, sms_data4], axis=0)
sms_big.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22277 entries, 0 to 5558
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   message  22277 non-null  object
 1   label    22277 non-null  object
dtypes: object(2)
memory usage: 522.1+ KB


Great! Since we concatenated the dataframes together, we need to check for duplicates if present, and drop them.

In [84]:
# check for duplicates
print(f"Number of observations before dropping duplicates: {len(sms_big)}")
print(f"Number of duplicated observations: {sms_big.duplicated().sum()}")
sms_big.drop_duplicates(inplace=True)
print(f"Number of observations after dropping the duplicates: {len(sms_big)}")


Number of observations before dropping duplicates: 22277
Number of duplicated observations: 10579
Number of observations after dropping the duplicates: 11698


Even though the number of duplicates is alarming (almost 1/2 of the concatenated data), luckily, dropping them will leave us with still a large dataset of over 10K observations

In [85]:
# examine the number of observations for each class
sms_big.label.value_counts()

ham     10124
spam     1574
Name: label, dtype: int64