##### Social Media Analytics
### Introduction to Text Mining
## Text Annotation
 Best Buy by Eva

### Initial setup

In [1]:
# Import packages
import csv
import pandas as pd
import numpy as np
import nltk 
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize

In [2]:
# Column dtypes applied while reading the review workbook
dtypes = dict(
    device='category',
    user='category',
    rating='float64',
    text='string',
    data='category',
    ownership_length='category',
)

# Load the extracted Best Buy reviews into a DataFrame
ds = pd.read_excel("ExtractedReviewsDataCollection_bestbuy.xlsx", dtype=dtypes)

### Functions

In [3]:
# Text preprocessing
def textPreProcess(
    rawText,
    removeHTML=True,
    charsToRemove=r'\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-',
    removeNumbers=True,
    removeLineBreaks=False,
    specialCharsToRemove=r'[^\x00-\xfd]',
    convertToLower=True,
    removeConsecutiveSpaces=True,
):
    """Clean a raw text string through a sequence of optional steps.

    The steps run in this order: HTML removal, punctuation removal, number
    removal, line-break removal, special-character removal, lower-casing and
    collapsing of repeated spaces. Removed content is replaced by a single
    space (carriage returns are dropped), so words never get glued together.

    Args:
        rawText: Text to clean. Any value that is not a string is returned
            unchanged.
        removeHTML: Extract the plain text with BeautifulSoup's 'html.parser'.
        charsToRemove: Regular expression whose matches are replaced by a
            space. Pass '' or None to skip this step.
        removeNumbers: Replace every run of digits with a space.
        removeLineBreaks: Replace newlines with a space and drop carriage
            returns.
        specialCharsToRemove: Regular expression whose matches are replaced by
            a space. Pass '' or None to skip this step.
        convertToLower: Lower-case the text.
        removeConsecutiveSpaces: Collapse runs of spaces into one space.
            Leading/trailing spaces are not stripped.

    Returns:
        The cleaned string, or rawText itself when it is not a string.
    """
    # isinstance (instead of an exact type comparison) also accepts str
    # subclasses such as numpy.str_, which would otherwise be skipped silently.
    if not isinstance(rawText, str):
        return rawText
    procText = rawText

    # Remove HTML
    if removeHTML:
        procText = BeautifulSoup(procText, 'html.parser').get_text()

    # Remove punctuation and other special characters.
    # Truthiness check: both '' and None disable the step without raising.
    if charsToRemove:
        procText = re.sub(charsToRemove, ' ', procText)

    # Remove numbers
    if removeNumbers:
        procText = re.sub(r'\d+', ' ', procText)

    # Remove line breaks
    if removeLineBreaks:
        procText = procText.replace('\n', ' ').replace('\r', '')

    # Remove special characters
    if specialCharsToRemove:
        procText = re.sub(specialCharsToRemove, ' ', procText)

    # Normalize to lower case
    if convertToLower:
        procText = procText.lower()

    # Replace multiple consecutive spaces with just one space
    if removeConsecutiveSpaces:
        procText = re.sub(' +', ' ', procText)

    return procText

In [4]:
# Tokenize words
def tokenize_words(words):
    """Split a text string into word tokens with nltk's word_tokenize.

    Args:
        words: Text to tokenize.

    Returns:
        The list of tokens, or np.nan when `words` is not a string or when
        tokenization yields no tokens (e.g. empty or whitespace-only text).
    """
    if not isinstance(words, str):
        return np.nan

    # Tokenize once and reuse the result. The tokenizer returns a list, so the
    # "nothing found" case has to be detected on the list itself; comparing
    # the list with '' can never be true.
    tokens = word_tokenize(words)
    return tokens if tokens else np.nan

### Analysis

In [5]:
# Build a single-column DataFrame of the preprocessed review text,
# keeping the row index of the source data
cleaned_text = ds.text.apply(textPreProcess)
processedReviews = pd.DataFrame(
    data=cleaned_text.values,
    index=ds.index,
    columns=['PreProcessedText'],
)



In [6]:
# Tokenize each preprocessed review and store the token lists in a new column
word_lists = processedReviews.PreProcessedText.apply(tokenize_words)
processedReviews['Words'] = word_lists

#### English

In [7]:
# Display the row index of the processed reviews
processedReviews.index

RangeIndex(start=0, stop=374, step=1)

In [8]:
# Number of rows in the tokenized 'Words' column
review_count = len(processedReviews['Words'])
print(review_count)

374


In [9]:
# Download the 'averaged_perceptron_tagger' resource for part-of-speech tagging.
# nltk is already imported in the setup cell, so it is not re-imported here.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Adventure.4.Eva.r\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [10]:
# Part-of-speech tag one example review (row position 37)
sample_tokens = processedReviews['Words'].iloc[37]
tags = nltk.pos_tag(sample_tokens)
print(tags)

[('phone', 'NN'), ('is', 'VBZ'), ('fine', 'JJ'), ('purchase', 'NN'), ('process', 'NN'), ('was', 'VBD'), ('a', 'DT'), ('disaster', 'NN'), ('salesman', 'NN'), ('was', 'VBD'), ('distracted', 'VBN'), ('interrupted', 'VBN'), ('by', 'IN'), ('former', 'JJ'), ('clients', 'NNS'), ('and', 'CC'), ('other', 'JJ'), ('sales', 'NNS'), ('people', 'NNS'), ('never', 'RB'), ('had', 'VBD'), ('his', 'PRP$'), ('full', 'JJ'), ('attention', 'NN'), ('he', 'PRP'), ('apparently', 'RB'), ('entered', 'VBD'), ('erroneous', 'JJ'), ('information', 'NN'), ('now', 'RB'), ('leading', 'VBG'), ('to', 'TO'), ('a', 'DT'), ('follow', 'JJ'), ('up', 'RP'), ('visit', 'NN'), ('at', 'IN'), ('a', 'DT'), ('different', 'JJ'), ('store', 'NN'), ('never', 'RB'), ('again', 'RB')]


In [11]:
# Keep only the nouns: words whose tag starts with "N" (e.g. NN, NNS)
nouns = [word for word, pos in tags if pos[0] == "N"]
print(nouns)

['phone', 'purchase', 'process', 'disaster', 'salesman', 'clients', 'sales', 'people', 'attention', 'information', 'visit', 'store']


#### Adjectives

In [12]:
# Keep only the adjectives: words whose tag starts with "J" (e.g. JJ)
adjectives = [word for word, pos in tags if pos[0] == "J"]
print(adjectives)

['fine', 'former', 'other', 'full', 'erroneous', 'follow', 'different']


#### Verbs

In [13]:
# Keep only the verbs: words whose tag starts with "V" (e.g. VBZ, VBD, VBN, VBG)
verbs = [word for word, pos in tags if pos[0] == "V"]
print(verbs)

['is', 'was', 'was', 'distracted', 'interrupted', 'had', 'entered', 'leading']
