In [13]:
import os
import os.path
import json
import pandas as pd
import glob

In [None]:
# at colab we need to unzip
# !unzip dataset.zip -d .

In [10]:
# read and load a file
# . represents the relative path, change it to 'contents' in colab notebook: 'contents/dataset/
spam_sample = pd.read_json('dataset/SPAM/sample_01.json', typ='series')

In [14]:
print(spam_sample)

data    Hola, Torneos:\r\n\r\nExplorar más grupos\r\nh...
dtype: object


In [21]:
# read and load a set of files
path_to_files = './dataset/NO_SPAM/'
json_pattern = os.path.join(path_to_files,'*.json')
file_list = glob.glob(json_pattern)

dfs = [] # an empty list to store the data frames

for file in file_list:
    data = pd.read_json(file, typ='series') # read data frame from json file
    dfs.append(data) # append the data frame to the list

no_spam = pd.concat(dfs, ignore_index=True) # concatenate all the data frames in the list.

In [29]:
no_spam.name

In [23]:
# exploring data
no_spam.describe()

count                                                    65
unique                                                   65
top       [image: Google]\r\nA new sign-in on Windows\r\...
freq                                                      1
dtype: object

In [None]:
# install nltk in colab notebook
# !pip install nltk

## Exploring and cleaning our dataset

In [33]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import string, re
import nltk
nltk.download('omw-1.4')

def filter_words(raw_comments):
    """ Returns a filtered list
        Removes numbers and symbols from each comment
    """
    table = str.maketrans('', '', string.punctuation + string.digits)
    stripped = [comment.translate(table) for comment in raw_comments]
    return stripped

def lowercase_all(comments):
    """ Returns a list of lowercased words
    """
    return [comment.lower() for comment in comments]

def split_words(comments):
    """ Splits each comment as a list of strings, where each string is a word
    """
    return [re.split(r'\W+', comment) for comment in comments]

def flatten_words(comments):
    """ Returns a list of strings, where each string is a word
    """
    return [word for comment in comments for word in comment]

def remove_stopwords(words, language='english'):
    """ Returns a list filter out stop words
    """
    stop_words = set(stopwords.words(language))
    return [word for word in words if not word in stop_words]

def lemmatize(words):
    """ Lemma conversion
    """
    lem = WordNetLemmatizer()
    return [lem.lemmatize(word) for word in words]

def stemming(words):
    """ Linguistic normalization
    """
    ps = PorterStemmer()
    return [ps.stem(word) for word in words]

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ruben\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\omw-1.4.zip.


In [30]:
def create_vocabulary(data):
    """Pipeline to produce a word vocabulary

    Args:
        data (pandas dataframe): Dataframe containing your text data

    Returns:
        list: a word vocabulary
    """
    
    # strip reviews and place them in a single list
    raw_comments = list(data)

    # Create vocabulary
    filtered = filter_words(raw_comments)
    splitted = split_words(filtered)
    flattened = flatten_words(splitted)
    words = lowercase_all(flattened)
    words = remove_stopwords(words)
    words = lemmatize(words)
    words = stemming(words)
    return sorted(list(set(words)))

In [34]:
create_vocabulary(no_spam)

['',
 'aadrel',
 'abandon',
 'abil',
 'abl',
 'aboard',
 'abou',
 'abril',
 'abrog',
 'academ',
 'academi',
 'academia',
 'academiaedu',
 'accept',
 'acceso',
 'access',
 'accordingli',
 'account',
 'accountcopi',
 'aceptar',
 'acerca',
 'achiev',
 'acid',
 'across',
 'action',
 'activ',
 'activar',
 'actualizacion',
 'actualización',
 'acumul',
 'ad',
 'add',
 'addit',
 'address',
 'addresshead',
 'ademá',
 'adfre',
 'adjuntar',
 'adjunto',
 'adob',
 'advantag',
 'adventur',
 'advic',
 'advis',
 'advisori',
 'aeromexico',
 'affect',
 'affili',
 'age',
 'agenda',
 'agendar',
 'agent',
 'agil',
 'agre',
 'agregado',
 'agregar',
 'agregu',
 'ahora',
 'ai',
 'al',
 'alan',
 'album',
 'alex',
 'alguien',
 'alguna',
 'alguno',
 'aliado',
 'aligncent',
 'alignleft',
 'alignright',
 'allaround',
 'allow',
 'alreadi',
 'also',
 'alt',
 'alta',
 'altandroid',
 'altcustom',
 'altemail',
 'altfacebook',
 'althttpssimagesiherbcomsnssnsmjpg',
 'altiherb',
 'altinstagram',
 'altitun',
 'altlabrada',