In [1]:
#Handling the dataset
import pandas as pd
import pickle
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF 

#Natural language processing + kmeans clustering + dimensionality reduction functionalities
import nltk
import re
import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim # for solid LDA implementation

%matplotlib inline

### Load and deal with issues

In [2]:
# Load the dataset 
scrapping_data = pd.read_csv('data_scraping_V2.csv', sep=',',engine="python")
scrapping_data.head(10)

Unnamed: 0,source,text
0,amazon,Love my S8! Awesome screen and takes great pic...
1,reddit,"I mean, while I don't think that's an especial..."
2,bestbuy,"It would appear that this ""open box"" like new ..."
3,youtube,During all ur review u just talked about getti...
4,youtube,I really want to trade my 7 plus for the S8 ?
5,twitter,And not sold on iPhone X camera performance in...
6,twitter,܀�܉_܀�܉�܀_܀�܀�܁����܁�܁�܉���܉�܀_܀�܀�܉�܀_܀�܉�_�_...
7,bestbuy,Though iPhones are beginning to feel like the ...
8,bestbuy,I love my Samsung S8 edge. It is sleek and eas...
9,bestbuy,"Fast, smooth as silk internet browsing. Best i..."


### Text Cleaning

In [3]:
# Define a stopwords dictionnary :

stopwords = nltk.corpus.stopwords.words('english')

# We keep the negative adverbs
stopwords.remove('no')
stopwords.remove('not')

print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'nor', 'only', 'own', 'same', 'so', 'than', 'too', 'very',

In [4]:
# We want to keep the negative indicators (e.g. wouldn't --> keep not). 
# So we need to expand common English contractions
# To do so, we use a bit of code from StackOverFlow



# this code is not mine! i shamelessly copied it from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
# all credits go to alko and arturomp @ stack overflow.
# basically, it's a big find/replace.

cList = {
  "ain't": "am not",
  "aren't": "are not",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "he'd've": "he would have",
  "he'll": "he will",
  "he'll've": "he will have",
  "he's": "he is",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "I'd": "I would",
  "I'd've": "I would have",
  "I'll": "I will",
  "I'll've": "I will have",
  "I'm": "I am",
  "I've": "I have",
  "isn't": "is not",
  "it'd": "it had",
  "it'd've": "it would have",
  "it'll": "it will",
  "it'll've": "it will have",
  "it's": "it is",
  "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd": "she would",
  "she'd've": "she would have",
  "she'll": "she will",
  "she'll've": "she will have",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "shouldn't've": "should not have",
  "so've": "so have",
  "so's": "so is",
  "that'd": "that would",
  "that'd've": "that would have",
  "that's": "that is",
  "there'd": "there had",
  "there'd've": "there would have",
  "there's": "there is",
  "they'd": "they would",
  "they'd've": "they would have",
  "they'll": "they will",
  "they'll've": "they will have",
  "they're": "they are",
  "they've": "they have",
  "to've": "to have",
  "wasn't": "was not",
  "we'd": "we had",
  "we'd've": "we would have",
  "we'll": "we will",
  "we'll've": "we will have",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where's": "where is",
  "where've": "where have",
  "who'll": "who will",
  "who'll've": "who will have",
  "who's": "who is",
  "who've": "who have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't": "will not",
  "won't've": "will not have",
  "would've": "would have",
  "wouldn't": "would not",
  "wouldn't've": "would not have",
  "y'all": "you all",
  "y'alls": "you alls",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd": "you had",
  "you'd've": "you would have",
  "you'll": "you you will",
  "you'll've": "you you will have",
  "you're": "you are",
  "you've": "you have"
}

c_re = re.compile('(%s)' % '|'.join(cList.keys()))

def expandContractions(text, c_re=c_re):
    def replace(match):
        return cList[match.group(0)]
    return c_re.sub(replace, text.lower())

# examples
print (expandContractions('Don\'t you get it?'))
print (expandContractions('I ain\'t got time for y\'alls foolishness'))
print (expandContractions('You won\'t live to see tomorrow.'))
print (expandContractions('You\'ve got serious cojones coming in here like that.'))
print (expandContractions('I hadn\'t\'ve enough'))

do not you get it?
i am not got time for you alls foolishness
you will not live to see tomorrow.
you have got serious cojones coming in here like that.
i had not've enough


In [5]:
def preprocessing(string):
    string = str(string)
    # lower_case
    string = string.lower()
    # remove accents
    string = unidecode.unidecode(string)
    # expand English contractions
    string = expandContractions(string)
    # remove stopwords
    pattern = re.compile(r'\b(' + r'|'.join(stopwords) + r')\b\s*')
    string = pattern.sub('', string)
    # remove iphone
    pattern = re.compile(r'iphone')
    string = pattern.sub('', string)
    # remove apple
    pattern = re.compile(r'apple')
    string = pattern.sub('', string)
    # remove samsung
    pattern = re.compile(r'samsung')
    string = pattern.sub('', string)
    # remove galaxy
    pattern = re.compile(r'galaxy')
    string = pattern.sub('', string)
    # remove \n
    string = string.replace('\n', ' ')
    # remove special caracters like "" and punctuation
    string = re.sub('[^A-Za-z0-9 ]','', string)
    # lematize
    string = nltk.stem.wordnet.WordNetLemmatizer().lemmatize(string,"v")
    string = nltk.stem.wordnet.WordNetLemmatizer().lemmatize(string,"a")
    string = nltk.stem.wordnet.WordNetLemmatizer().lemmatize(string)
    return(string)

In [6]:
### Some sanity check

print("************* A regular example *************")
print("Before: "+scrapping_data.iloc[0].text)
print('----------------------------------------------')
print("After: "+preprocessing(scrapping_data.iloc[0].text))
print("\n")

print("************* An encoding error example *************")
print("Before: "+scrapping_data.iloc[6].text)
print('----------------------------------------------')
print("After: "+preprocessing(scrapping_data.iloc[6].text))

************* A regular example *************
Before: Love my S8! Awesome screen and takes great pictures!; Love my S8! Awesome screen and takes great pictures!
----------------------------------------------
After: love s8 awesome screen takes great pictures love s8 awesome screen takes great pictures


************* An encoding error example *************
Before: ܀�܉_܀�܉�܀_܀�܀�܁����܁�܁�܉���܉�܀_܀�܀�܉�܀_܀�܉�_�_܉�܉�܉���_��¬�_�__�܁����܉�܁�܁�܉�GALAXY�__�_�܉�_�_܉�܉�܉�܁���
܉�܀_܀�܀�܉�܀_܀�܁��_�___�_�_��_�����܁_܀�܉_܀_܀�΍�����ˇ܁�
----------------------------------------------
After:  v


### Compute TFIDF

In [7]:
TFIDF_mono = TfidfVectorizer(
      input='content',
      lowercase=False,
      preprocessor=preprocessing)

# Compute the TFIDF matrix (+create a dictionnary ...)
tfidf = TFIDF_mono.fit_transform(scrapping_data.text)

In [177]:
np.save('TDFIDF_matrix.npy', tfidf)
tfidf

<46806x40690 sparse matrix of type '<class 'numpy.float64'>'
	with 603545 stored elements in Compressed Sparse Row format>

In [178]:
# Have acces to the vocabulary mapping : words >>> numbers

### Uncomment the following line
# TFIDF_mono.vocabulary_

In [179]:
# Have acces to the vocabulary mapping : numbers >>> words

def getWord(search_column) :
    for word, value in TFIDF_mono.vocabulary_.items():
        if value == search_column:
            search_word = word
            break
    return(search_word)
print(getWord(10000))

getWords = np.vectorize(getWord)
print(getWords([10000,10001,10002]))

dailyreal
['dailyreal' 'dair' 'dak']


### Dimensionnality reduction

#### Non-negative matrix factorization (NMF)

<img src="https://upload.wikimedia.org/wikipedia/commons/f/f9/NMF.png"/ width="500">

In [184]:
model = NMF(n_components=20, init='random') 
W = model.fit_transform(tfidf) 
H = model.components_

In [185]:
print(np.shape(scrapping_data))
print(np.shape(tfidf))
print(np.shape(W))
print(np.shape(H))

(46806, 2)
(46806, 40690)
(46806, 20)
(20, 40690)


In [186]:
def nArgMax(array,n):
    arraybis = array.copy()
    arg = []
    for i in range(n):
        argmaxi = np.argmax(arraybis)
        arg += [argmaxi]
        arraybis[argmaxi] = -np.Inf
    return(arg)

In [187]:
topic_words=[]
for n_topic in range(np.shape(W)[1]):
    topic_words += [getWords(nArgMax(H[n_topic],10))]
    
    print("******************** topic n°"+str(n_topic)+"********************")
    print(topic_words[n_topic])
    print("\n")

******************** topic n°0********************
['phone' 'loves' 'everything' 'got' 'bought' 'happy' 'old' 'fast' 'far'
 'perfect']


******************** topic n°1********************
['face' 'get' 'button' 'id' 'home' 'used' 'works' 'getting' 'recognition'
 'touch']


******************** topic n°2********************
['s8' 's7' 'note' 'watching' 'android' 'oreo' 'edge' 's6' 'plus' 'got']


******************** topic n°3********************
['best' 'ever' 'buy' 'far' 'yet' 'one' 'owned' 'market' 'phones' 'made']


******************** topic n°4********************
['screen' 'plus' 'size' 'bigger' 'smaller' 'big' 'larger' 'perfect' '6s'
 'edge']


******************** topic n°5********************
['no' 'one' 'issues' 'far' 'complaints' 'problem' 'button' 'problems'
 'home' 'got']


******************** topic n°6********************
['battery' 'life' 'long' 'fast' 'day' 'longer' 'lasts' 'last' 'excellent'
 'charge']


******************** topic n°7********************
['problems' '

#### Latent Dirichlet allocation

In [8]:
from gensim import models, corpora
from gensim.utils import simple_preprocess
from gensim.models import Phrases

In [9]:
def tokenize(text):
    return [token for token in preprocessing(text).split() if token not in stopwords]

#sanity check
print("text: "+scrapping_data.iloc[10000].text )
print("tokens: "+str(tokenize(scrapping_data.iloc[10000].text)))

text: Lesson learned. Don't rush buying things because maybe some of them might have issues. Spend wisely tho. ?
tokens: ['lesson', 'learned', 'not', 'rush', 'buying', 'things', 'maybe', 'might', 'issues', 'spend', 'wisely', 'tho']


In [10]:
list_tokens_all = [tokenize(description) for description in scrapping_data.text.tolist()]

In [11]:
# Use bigram and trigram to catch combination of 2/3 words that have a specific meaning together
bigram = Phrases(list_tokens_all, min_count=5)
trigram = Phrases(bigram[list_tokens_all], min_count=3)



In [12]:
tokens = list(trigram[bigram[list_tokens_all]])



In [13]:
id2word_phones = corpora.Dictionary(tokens)
print(id2word_phones)
print(id2word_phones.get(1))
print(id2word_phones.get(2))

Dictionary(47074 unique tokens: ['awesome', 'love', 'pictures', 's8', 'screen']...)
love
pictures


In [14]:
# ignore words that appear in less than 5 documents or more than 30% documents
id2word_phones.filter_extremes(no_below=5, no_above=0.3)
print(id2word_phones)

Dictionary(11144 unique tokens: ['awesome', 'love', 'pictures', 's8', 'screen']...)


In [15]:
corpus = [id2word_phones.doc2bow(tok) for tok in tokens]

In [16]:
print(tokens[0])
print(corpus[0])

['love', 's8', 'awesome', 'screen', 'takes_great', 'pictures', 'love', 's8', 'awesome', 'screen', 'takes_great', 'pictures']
[(0, 2), (1, 2), (2, 2), (3, 2), (4, 2), (5, 2)]


In [21]:
# Run the LDA (computation time should be between 5 to 60 seconds)

# choose the number of topics => to find a "good" number of topics, try multiple values and see which one is the best
# optionally: input alpha and eta to influence how topics are distributed across documents, 
#  and how words are distributed across topics
#  the syntax is the following
#  alpha is a vector of size the number of documents, and eta's size is the number of words

# alpha = [0.01] * id2word_phones.num_docs
# eta = [0.01] * len(id2word_phones.keys())

num_topics = 20

# Below without alpha nor eta
lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=id2word_phones, passes=2)

# Below with alpha and eta
# lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=id2word_phones, passes=4, 
                            #alpha=[0.01] * id2word_phones.num_docs, eta = [0.01] * len(id2word_phones.keys()))

In [23]:
num_documents = 2 # increase this number to check how topics are distributed across more than the first 2 documents
for i in range(num_documents):
    print(lda_model[corpus[i]]) # what proportion of topics in each document

[(6, 0.21432479), (13, 0.71644443)]
[(3, 0.09925251), (5, 0.28295398), (6, 0.07169358), (7, 0.28365943), (14, 0.21244048)]


In [24]:
lda_model.show_topics(num_topics=num_topics, num_words=10, formatted=True)

[(0,
  '0.032*"s8" + 0.029*"10" + 0.020*"8_plus" + 0.016*"4" + 0.015*"yeah" + 0.015*"de" + 0.014*"se" + 0.014*"would_recommend" + 0.013*"no" + 0.013*"x"'),
 (1,
  '0.071*"buy" + 0.018*"got" + 0.018*"mobile" + 0.016*"one" + 0.015*"loop_three_new_revealed" + 0.015*"version" + 0.013*"incredible" + 0.013*"serious_problems_ios_update" + 0.012*"black" + 0.012*"fixed"'),
 (2,
  '0.033*"video" + 0.023*"always" + 0.014*"s8" + 0.014*"years" + 0.013*"cameras" + 0.012*"phones" + 0.012*"vs" + 0.011*"user" + 0.011*"future" + 0.010*"android"'),
 (3,
  '0.046*"not" + 0.022*"face_id" + 0.014*"notch" + 0.012*"screen" + 0.011*"touch_id" + 0.011*"lol" + 0.011*"even" + 0.010*"would" + 0.009*"think" + 0.009*"agree"'),
 (4,
  '0.034*"perfect" + 0.027*"s8" + 0.027*"phone" + 0.018*"please" + 0.016*"display" + 0.013*"wait" + 0.012*"came" + 0.011*"best" + 0.011*"not" + 0.011*"64gb"'),
 (5,
  '0.056*"not" + 0.024*"people" + 0.015*"note_8" + 0.015*"buy" + 0.015*"would" + 0.012*"one" + 0.012*"know" + 0.012*"want" +