<a href="https://colab.research.google.com/github/ValdazoAmerico/data-augmentation/blob/main/data_augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import string
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import word_tokenize

from sklearn.utils import shuffle
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
df = pd.read_csv("/content/SMSSpamCollection.txt", sep="\t", header=None)
print(df.shape)
# renaming the columns
df.rename(columns={0: 'label', 1: 'text'}, inplace=True)
df.head()

(5572, 2)


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:

df.isnull().sum()

label    0
text     0
dtype: int64

In [None]:
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [None]:
# calculating length of each record in our df
df['length'] = df['text'].apply(lambda x: len(x))
df.head()

Unnamed: 0,label,text,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [None]:
# some text cleaning functions
def convert_to_lower(text):
    return text.lower()

def remove_numbers(text):
    number_pattern = r'\d+'
    without_number = re.sub(pattern=number_pattern, repl=" ", string=text)
    return without_number

def lemmatizing(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    for i in range(len(tokens)):
        lemma_word = lemmatizer.lemmatize(tokens[i])
        tokens[i] = lemma_word
    return " ".join(tokens)

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_stopwords(text):
    removed = []
    stop_words = list(stopwords.words("english"))
    tokens = word_tokenize(text)
    for i in range(len(tokens)):
        if tokens[i] not in stop_words:
            removed.append(tokens[i])
    return " ".join(removed)

def remove_extra_white_spaces(text):
    single_char_pattern = r'\s+[a-zA-Z]\s+'
    without_sc = re.sub(pattern=single_char_pattern, repl=" ", string=text)
    return without_sc

In [None]:
df['text'] = df['text'].apply(lambda x: convert_to_lower(x))
df['text'] = df['text'].apply(lambda x: remove_numbers(x))
df['text'] = df['text'].apply(lambda x: remove_punctuation(x))
df['text'] = df['text'].apply(lambda x: remove_stopwords(x))
df['text'] = df['text'].apply(lambda x: remove_extra_white_spaces(x))
df['text'] = df['text'].apply(lambda x: lemmatizing(x))

In [None]:

# calculating length of each record after performing text cleaning
df['length_after_cleaning'] = df['text'].apply(lambda x: len(x))
df.head()

Unnamed: 0,label,text,length,length_after_cleaning
0,ham,go jurong point crazy available bugis great wo...,111,78
1,ham,ok lar joking wif oni,29,21
2,spam,free entry wkly comp win fa cup final tkts st ...,155,101
3,ham,u dun say early hor c already say,49,33
4,ham,nah dont think go usf life around though,61,40


In [None]:

# converting string labels to int labels

label_map = {
    'ham': 0,
    'spam': 1,
}

df['label'] = df['label'].map(label_map)
df.head()

Unnamed: 0,label,text,length,length_after_cleaning
0,0,go jurong point crazy available bugis great wo...,111,78
1,0,ok lar joking wif oni,29,21
2,1,free entry wkly comp win fa cup final tkts st ...,155,101
3,0,u dun say early hor c already say,49,33
4,0,nah dont think go usf life around though,61,40


In [None]:
df['label'].value_counts()

0    4825
1     747
Name: label, dtype: int64

In [None]:
tf_without_balancing = TfidfVectorizer()
X_tf_wob = tf_without_balancing.fit_transform(df['text'])
X_tf_wob = X_tf_wob.toarray()

In [None]:
print(X_tf_wob.shape)
print(X_tf_wob)

(5572, 7906)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
# splitting dataset
X_train_tf_wob, X_test_tf_wob, y_train_tf_wob, y_test_tf_wob = train_test_split(X_tf_wob, df['label'].values, test_size=0.2)

In [None]:
# initializing model
naiveBayes_wob = GaussianNB()

In [None]:
naiveBayes_wob.fit(X_train_tf_wob, y_train_tf_wob)

GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
y_pred_tf_wob = naiveBayes_wob.predict(X_test_tf_wob)

In [None]:
print(accuracy_score(y_test_tf_wob, y_pred_tf_wob))

0.8645739910313901


In [None]:
print(classification_report(y_test_tf_wob, y_pred_tf_wob))

              precision    recall  f1-score   support

           0       0.97      0.87      0.92       963
           1       0.50      0.86      0.63       152

    accuracy                           0.86      1115
   macro avg       0.74      0.86      0.77      1115
weighted avg       0.91      0.86      0.88      1115



# WITH AUGMENTATION

In [None]:
!pip install transformers
!pip install nlpaug

Collecting transformers
  Downloading transformers-4.12.2-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.3 MB/s 
[?25hCollecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.6 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 36.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 32.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 27.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempti

In [None]:
import nlpaug.augmenter.word.context_word_embs as aug

In [None]:
sample_text = df['text'].iloc[101]

In [None]:
sample_text

'okay name ur price long legal wen pick ave am xx'

In [None]:
augmenter = aug.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
augmenter.augment("hola buenas tardes")

  cpuset_checked))


'hola of buenas tardes'

In [None]:
augmented_sample_text

'okay name s ur price long for legal wen pick ave she am xx'

In [None]:
for i in range(5):
    print(augmenter.augment(sample_text))

  cpuset_checked))


okay good name ur price long legal term wen pick ave am, xx
okay if name ur price long legal wen pick the ave ii am xx
okay name ur price long c legal wen pick ave & am d xx
okay name ur price long'legal wen pick yer ave no am xx
okay call name ur : price long legal wen pick'ave am xx


In [None]:
df['label'].value_counts()

0    4825
1     747
Name: label, dtype: int64

In [None]:
def augmentMyData(df, augmenter, repetitions=1, samples=200):
    augmented_texts = []
    # select only the minority class samples
    spam_df = df[df['label'] == 1].reset_index(drop=True) # removes unecessary index column
    for i in tqdm(np.random.randint(0, len(spam_df), samples)):
        # generating 'n_samples' augmented texts
        for _ in range(repetitions):
            augmented_text = augmenter.augment(spam_df['text'].iloc[i])
            augmented_texts.append(augmented_text)
    
    data = {
        'label': 1,
        'text': augmented_texts
    }
    aug_df = pd.DataFrame(data)
    df = shuffle(df.append(aug_df).reset_index(drop=True))
    return df

In [None]:
new_df = df.drop(labels=['length', 'length_after_cleaning'], axis=1)
new_df.head()

Unnamed: 0,label,text
0,0,go jurong point crazy available bugis great wo...
1,0,ok lar joking wif oni
2,1,free entry wkly comp win fa cup final tkts st ...
3,0,u dun say early hor c already say
4,0,nah dont think go usf life around though


In [None]:
aug_df = augmentMyData(new_df, augmenter, samples=200)

  0%|          | 0/200 [00:00<?, ?it/s]

  cpuset_checked))


In [None]:
X_train, _, y_train, _ = train_test_split(aug_df['text'], aug_df['label'].values, test_size=0.1)
_, X_test, _, y_test = train_test_split(df['text'], df['label'].values, test_size=0.5)

In [None]:
tf_with_aug = TfidfVectorizer()
X_train_tf = tf_with_aug.fit_transform(X_train)
X_train_tf = X_train_tf.toarray()

In [None]:
nb = GaussianNB()
nb.fit(X_train_tf, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
X_test_tf = tf_with_aug.transform(X_test)
X_test_tf = X_test_tf.toarray()

In [None]:
X_train_tf.shape, X_test_tf.shape

((5194, 7638), (2786, 7638))

In [None]:
y_preds = nb.predict(X_test_tf)

In [None]:
print(confusion_matrix(y_test, y_preds))
print(accuracy_score(y_test, y_preds))
print(classification_report(y_test, y_preds))

[[2185  230]
 [   2  369]]
0.9167264895908112
              precision    recall  f1-score   support

           0       1.00      0.90      0.95      2415
           1       0.62      0.99      0.76       371

    accuracy                           0.92      2786
   macro avg       0.81      0.95      0.86      2786
weighted avg       0.95      0.92      0.92      2786



# RandomOverSampler

In [None]:
!pip install imbalanced-learn



In [None]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler



In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'].values, test_size=0.25)

In [None]:
Counter(y_train)

Counter({0: 3593, 1: 586})

In [None]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [None]:
X_train_tf = vectorizer.transform(X_train)

In [None]:
X_train_tf = X_train_tf.toarray()
X_train_tf.shape

(4179, 6731)

In [None]:
X_test_tf = vectorizer.transform(X_test)
X_test_tf = X_test_tf.toarray()

In [None]:
overSampler = RandomOverSampler(0.5)
X_train_os, y_train_os = overSampler.fit_sample(X_train_tf, y_train)



In [None]:
Counter(y_train_os)

Counter({0: 3593, 1: 1796})

In [None]:
nb = GaussianNB()
nb.fit(X_train_os, y_train_os)

GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
y_preds = nb.predict(X_test_tf)

In [None]:
print(confusion_matrix(y_test, y_preds))
print(accuracy_score(y_test, y_preds))
print(classification_report(y_test, y_preds))

[[1089  143]
 [  26  135]]
0.8786791098348887
              precision    recall  f1-score   support

           0       0.98      0.88      0.93      1232
           1       0.49      0.84      0.62       161

    accuracy                           0.88      1393
   macro avg       0.73      0.86      0.77      1393
weighted avg       0.92      0.88      0.89      1393

