## Download Datasets

In [13]:
import pathlib

USE_PROJECT_ROOT = True
BASE_DIR = pathlib.Path(".").resolve()
if USE_PROJECT_ROOT:
    BASE_DIR = BASE_DIR.parent
DATASET_DIR = BASE_DIR / "datasets"
ZIPS_DIR = DATASET_DIR / 'zips'
EXPORT_DIR = DATASET_DIR / "exports"
SMS_SPAM_DIR = DATASET_DIR / 'imports' / 'sms-spam'
YOUTUBE_SPAM_DIR = DATASET_DIR / 'imports' / 'youtube-spam'

In [24]:
BASE_DIR

PosixPath('/home/cyrilng')

In [14]:
ZIPS_DIR.mkdir(exist_ok=True, parents=True)

EXPORT_DIR.mkdir(exist_ok=True, parents=True)

SMS_SPAM_DIR.mkdir(exist_ok=True, parents=True)

YOUTUBE_SPAM_DIR.mkdir(exist_ok=True, parents=True)

In [15]:
!curl https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip -o $ZIPS_DIR/uci-sms-spam.zip
!unzip -o $ZIPS_DIR/uci-sms-spam.zip -d $SMS_SPAM_DIR

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  198k  100  198k    0     0  87798      0  0:00:02  0:00:02 --:--:-- 87830
Archive:  /home/cyrilng/datasets/zips/uci-sms-spam.zip
  inflating: /home/cyrilng/datasets/imports/sms-spam/SMSSpamCollection  
  inflating: /home/cyrilng/datasets/imports/sms-spam/readme  


In [16]:
!curl https://archive.ics.uci.edu/ml/machine-learning-databases/00380/YouTube-Spam-Collection-v1.zip -o $ZIPS_DIR/uci-youtube-spam.zip
!unzip -o $ZIPS_DIR/uci-youtube-spam.zip -d $YOUTUBE_SPAM_DIR

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  159k  100  159k    0     0    97k      0  0:00:01  0:00:01 --:--:--   97k
Archive:  /home/cyrilng/datasets/zips/uci-youtube-spam.zip
  inflating: /home/cyrilng/datasets/imports/youtube-spam/Youtube01-Psy.csv  
  inflating: /home/cyrilng/datasets/imports/youtube-spam/__MACOSX/._Youtube01-Psy.csv  
  inflating: /home/cyrilng/datasets/imports/youtube-spam/Youtube02-KatyPerry.csv  
  inflating: /home/cyrilng/datasets/imports/youtube-spam/__MACOSX/._Youtube02-KatyPerry.csv  
  inflating: /home/cyrilng/datasets/imports/youtube-spam/Youtube03-LMFAO.csv  
  inflating: /home/cyrilng/datasets/imports/youtube-spam/__MACOSX/._Youtube03-LMFAO.csv  
  inflating: /home/cyrilng/datasets/imports/youtube-spam/Youtube04-Eminem.csv  
  inflating: /home/cyrilng/datasets/imports/youtube-spam/__MACOSX/._Youtube04-Eminem.csv  
  inflating: /home/cyr

## Load datasets into Pandas DataFrame

In [17]:
import pandas as pd

In [18]:
sms_path = SMS_SPAM_DIR / 'SMSSpamCollection'
sms_df = pd.read_csv(str(sms_path), sep='\t', header=None)

In [19]:
sms_df.columns = ['label', 'text']
sms_df['source'] = 'uci-spam-sms'

In [20]:
location = YOUTUBE_SPAM_DIR
csvs = list(location.glob("*.csv"))

In [21]:
new_dfs = []
for csv in csvs:
    csv_df = pd.read_csv(str(csvs[0]), usecols=['CLASS', 'CONTENT'])
    csv_df.rename(columns={'CLASS': 'class', "CONTENT": 'text'}, inplace=True)
    csv_df['label'] = csv_df['class'].apply(lambda x: "spam" if str(x) == "1" else "ham")
    sub_df = csv_df.copy()[['label', 'text']] 
    new_dfs.append(sub_df)

yt_df = pd.concat(new_dfs)
yt_df['source'] = 'uci-youtube-spam'

In [22]:
df = pd.concat([sms_df, yt_df])
df

Unnamed: 0,label,text,source
0,ham,"Go until jurong point, crazy.. Available only ...",uci-spam-sms
1,ham,Ok lar... Joking wif u oni...,uci-spam-sms
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,uci-spam-sms
3,ham,U dun say so early hor... U c already then say...,uci-spam-sms
4,ham,"Nah I don't think he goes to usf, he lives aro...",uci-spam-sms
...,...,...,...
433,spam,Like this comment for no reason﻿,uci-youtube-spam
434,ham,love this song﻿,uci-youtube-spam
435,spam,this song is awesome. these guys are the best....,uci-youtube-spam
436,spam,HOW MANY THUMBS UP FOR LOUIS SAVING THE DAY!?!?﻿,uci-youtube-spam


## Export complete dataset

In [23]:
df.to_csv(EXPORT_DIR / 'spam-dataset.csv', index=False)