# ML Part 2: Text Data Cleaning, Feature Engineering, and Exploratory Data Analysis

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the input workbook into a DataFrame; per the summary printed below it has
# the columns ASIN, Description, Price and Verified Subcategory.
info = pd.read_excel("Numeric Cleaned Data.xlsx")
# info() prints its summary directly, so it shows up even though it is not the last line.
info.info()
# NOTE(review): only the last expression of a cell is rendered, so the head()
# result on the next line is computed and then discarded; only tail() is shown.
info.Description.head()
info.Description.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1185 entries, 0 to 1184
Data columns (total 4 columns):
ASIN                    1185 non-null object
Description             1185 non-null object
Price                   1185 non-null float64
Verified Subcategory    1185 non-null object
dtypes: float64(1), object(3)
memory usage: 37.2+ KB


1180    【Fast Charging & USB 3.0 Speed】Charge up your ...
1181    📍Updated Charging & 10 Gbps Data Transmission ...
1182    📱 [USB-C (Male) to Micro USB (Female)]: The US...
1183    【USB On-The-Go】: Plug in and use computer peri...
1184    Conver USB A Devices --- Use the adapter to co...
Name: Description, dtype: object

In [3]:
'''
    We want to return words to their root form, take out stop words and special chars
'''
import spacy
import string 
# Load spaCy's "en_core_web_sm" English pipeline; later cells call `nlp` to
# tokenize the descriptions and to obtain lemmas.
nlp=spacy.load("en_core_web_sm")
# spaCy's built-in English stop-word collection.
# NOTE(review): no later cell shown here references `spacy_stopwords`, so stop
# words do not appear to be removed anywhere — confirm whether that is intended.
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

# NOTE(review): the string literal below is disabled code, not documentation.
# Because it is the last expression of the cell, its repr is echoed as the cell
# output. It calls a `no_stop` helper that is not defined in the cells shown here.
'''

def all_alpha(tokens):
    return [token for token in tokens if token.isalpha()]

def cleaning(paragraph):
    lem = lemmatizing(paragraph)
    ns = no_stop(lem)
    al = all_alpha(ns)
    return " ".join(al)
'''

'\n\ndef all_alpha(tokens):\n    return [token for token in tokens if token.isalpha()]\n\ndef cleaning(paragraph):\n    lem = lemmatizing(paragraph)\n    ns = no_stop(lem)\n    al = all_alpha(ns)\n    return " ".join(al)\n'

In [4]:
'''
    First, tokenize the description
'''
# Run every description through the spaCy pipeline; each entry of the column
# becomes a parsed spaCy object (a sequence of tokens) instead of a plain string.
# NOTE(review): this overwrites the raw text column in place, so re-running the
# cell would pass already-parsed objects back into `nlp` — confirm, or consider
# writing the result to a separate column.
info['Description']=info['Description'].apply(nlp)
# Only the final expression of a cell is rendered, so the head() preview is discarded.
info['Description'].head()
info['Description'].tail()

1180    (【, Fast, Charging, &, USB, 3.0, Speed】Charge,...
1181    (📍, Updated, Charging, &, 10, Gbps, Data, Tran...
1182    (📱, [, USB, -, C, (, Male, ), to, Micro, USB, ...
1183    (【, USB, On, -, The, -, Go, 】, :, Plug, in, an...
1184    (Conver, USB, A, Devices, ---, Use, the, adapt...
Name: Description, dtype: object

In [5]:
'''
    Then we will try to remove punctuation
'''
# ASCII punctuation minus the hyphen: hyphens stay in the token stream so that
# hyphenated terms (tokenized as e.g. "usb", "-", "c") survive this step.
new_puncts = string.punctuation.replace('-', '')
print(new_puncts)

def no_punct(tokens):
    """Drop tokens that consist solely of punctuation characters.

    Args:
        tokens: iterable of tokens (spaCy tokens or plain strings); each one is
            compared via its ``str()`` text.

    Returns:
        list: the original token objects, in order, minus every token whose
        text is made up only of characters from ``new_puncts``. Tokens that
        contain a hyphen (not part of ``new_puncts``) or any letter/digit are
        kept unchanged.
    """
    # A plain `str(token) not in new_puncts` is a *substring* test against the
    # punctuation string, so multi-character tokens such as "..." or "!!" were
    # never removed. Checking character by character handles them uniformly.
    # (An empty text is still dropped, as before: all() of nothing is True.)
    return [
        token for token in tokens
        if not all(char in new_puncts for char in str(token))
    ]

# Store the punctuation-filtered token lists in a new column, leaving the
# tokenized 'Description' column as it is.
info['Description_New']  = info['Description'].apply(no_punct)
# Only the final expression of a cell is rendered, so the head() preview is discarded.
info['Description_New'].head()
info['Description_New'].tail()


!"#$%&'()*+,./:;<=>?@[\]^_`{|}~


1180    [【, Fast, Charging, USB, 3.0, Speed】Charge, up...
1181    [📍, Updated, Charging, 10, Gbps, Data, Transmi...
1182    [📱, USB, -, C, Male, to, Micro, USB, Female, T...
1183    [【, USB, On, -, The, -, Go, 】, Plug, in, and, ...
1184    [Conver, USB, A, Devices, ---, Use, the, adapt...
Name: Description_New, dtype: object

In [6]:
def lower_case(tokens):
    """Lower-case the ASCII tokens and re-parse the result with spaCy.

    Tokens whose ``is_ascii`` flag is false are dropped entirely. The
    lower-cased texts of the remaining tokens are joined with single spaces
    and passed through ``nlp`` again, so the return value is a freshly parsed
    spaCy object rather than a list.
    """
    ascii_words = []
    for tok in tokens:
        if tok.is_ascii:
            ascii_words.append(tok.lower_)
    lowered_text = " ".join(ascii_words)
    return nlp(lowered_text)

# Replace the filtered token lists with lower-cased, ASCII-only, re-parsed text.
# The column is overwritten with this step's output.
info['Description_New'] = info['Description_New'].apply(lower_case)

# Only the final expression of a cell is rendered, so the head() preview is discarded.
info['Description_New'].head()
info['Description_New'].tail()

1180    (fast, charging, usb, 3.0, up, your, phone, ip...
1181    (updated, charging, 10, gbps, data, transmissi...
1182    (usb, -, c, male, to, micro, usb, female, the,...
1183    (usb, on, -, the, -, go, plug, in, and, use, c...
1184    (conver, usb, a, devices, ---, use, the, adapt...
Name: Description_New, dtype: object

In [7]:
def lemmatizing(tokens):
    """Return the ``lemma_`` string of every token, in order, as a list."""
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_)
    return lemmas

# Replace each parsed description with the list of its lemma strings.
# NOTE(review): the column is overwritten with plain strings, which have no
# `lemma_` attribute, so this cell cannot be re-run without first re-running
# the cells that rebuild the column.
info['Description_New'] = info['Description_New'].apply(lemmatizing)

# Only the final expression of a cell is rendered, so the head() preview is discarded.
info['Description_New'].head()
info['Description_New'].tail()


1180    [fast, charge, usb, 3.0, up, -PRON-, phone, ip...
1181    [update, charge, 10, gbps, datum, transmission...
1182    [usb, -, c, male, to, micro, usb, female, the,...
1183    [usb, on, -, the, -, go, plug, in, and, use, c...
1184    [conver, usb, a, device, ---, use, the, adapte...
Name: Description_New, dtype: object

In [8]:
def join_words(tokens):
    """Join lemma strings into one cleaned text.

    Args:
        tokens: iterable of lemma strings.

    Returns:
        str: the lemmas joined by single spaces, with every "-PRON-"
        placeholder lemma removed and free-standing hyphens glued back onto
        their neighbours ("usb - c" becomes "usb-c").
    """
    # Remove the placeholder as a whole token *before* joining. Stripping the
    # substring ' -PRON-' afterwards required a leading space, so a placeholder
    # at the very start of the text (or one glued to a hyphen by the " - "
    # replacement) was left in the output.
    kept = [token for token in tokens if token != "-PRON-"]
    return " ".join(kept).replace(" - ", "-")

# Collapse each list of lemmas into a single cleaned string.
# NOTE(review): the column is overwritten with strings, so running this cell a
# second time would join the individual characters of each string instead of lemmas.
info['Description_New'] = info['Description_New'].apply(join_words)

# Only the final expression of a cell is rendered, so the head() preview is discarded.
info['Description_New'].head()
info['Description_New'].tail()

1180    fast charge usb 3.0 up phone ipad or any other...
1181    update charge 10 gbps datum transmission speed...
1182    usb-c male to micro usb female the usb c to us...
1183    usb on-the-go plug in and use computer periphe...
1184    conver usb a device --- use the adapter to con...
Name: Description_New, dtype: object

In [14]:
# Write the whole frame to an Excel workbook, omitting the row index.
# NOTE(review): 'Description' was overwritten with parsed spaCy objects in the
# tokenize cell, so that column no longer holds the raw strings when exported —
# confirm this is intended.
info.to_excel("Final Cleaned Data.xlsx", index=False)