In [None]:
from tqdm import tqdm, tqdm_notebook # progress bars in Jupyter
from time import time # measure the computation time of a python code
import pandas as pd # the most basic & powerful data manipulation tool
import numpy as np # Here, mostly used for np.nan
import langdetect # detect the language of text
import stop_words # handles stop words in many languages without having to rebuild them everytime
import spacy # NLP library for POS tagging
# For spacy use "pip install spacy", then "python -m spacy download en" to download English text mining modules

In [None]:
# Register tqdm with pandas so that the progress_map calls used below are available
tqdm.pandas()
# NOTE(review): this call creates a progress bar without an iterable and discards it;
# it does not appear to configure anything for the cells below - confirm whether it is needed
tqdm_notebook()

### Read data

In [None]:
# Load the comments CSV, discard the metadata columns and give the two remaining columns short names
df_comments = pd.read_csv(
    'https://raw.githubusercontent.com/SebastianS09/DataCampX/master/group10_applesupport.csv'
).drop(columns=["Unnamed: 0", "date", "link", "nb_replies", "nb_views", "user"])
df_comments.columns = ["source", "text"]

In [None]:
# Number of comments per source
df_comments.source.value_counts().reset_index()

# Clean dataset

1) Keep English (or the language of your choice) <br>
2) Remove empty articles

In [None]:
# langdetect.detect raises LangDetectException on text it cannot analyse, and it cannot
# process non-string values such as the NaN pandas uses for missing text. Both cases are
# mapped to NaN so that a single bad row does not abort the whole progress_map.
# NOTE: langdetect is non-deterministic by default; set langdetect.DetectorFactory.seed
# before detecting if reproducible labels are needed.
def detect(text):
    """Return the language code detected for `text`, or np.nan when none can be determined.

    Arguments:
    ----------
    text -- str
        text to analyse; any non-string value (None, NaN, numbers) yields np.nan

    Returns:
    --------
    language code as returned by langdetect.detect, or np.nan
    """
    # Missing / non-text cells cannot be analysed: report "unknown" instead of raising
    if not isinstance(text, str):
        return np.nan
    try:
        return langdetect.detect(text)
    except langdetect.detector.LangDetectException:
        return np.nan

In [None]:
# Detect the language of every comment (with a progress bar); undetectable texts get NaN
df_comments['lang'] = df_comments.text.progress_map(detect)

In [None]:
# Inspect the comments that were labelled 'af', an unexpected language for this dataset
df_comments.loc[df_comments.lang=='af']
# Looks like the language detection did not work well on these comments...

In [None]:
# All the comments labelled 'af' are actually English.
# A single .loc assignment is used because the chained form `df.lang[mask] = value`
# may write to a temporary copy (SettingWithCopyWarning) and leave df_comments unchanged.
df_comments.loc[df_comments.lang == 'af', 'lang'] = "en"


# Out of curiosity, check the languages of the articles
df_comments.groupby('source').lang.value_counts()

In [None]:
# Keep the English comments only, then normalise whitespace in the text:
# newlines become spaces, non-breaking spaces are removed, double spaces are collapsed once.
# reset_index() keeps the previous row labels in an 'index' column.
df_english = (
    df_comments[df_comments['lang'] == 'en']
    .reset_index()
    .replace(r'\n', ' ', regex=True)
    .replace('\xa0', '', regex=True)
    .replace('  ', ' ', regex=True)
)
df_english.groupby('source').lang.value_counts()

# Build LDA model

In [None]:
from gensim import models, corpora
from gensim.utils import simple_preprocess
from gensim.models import Phrases

import matplotlib
# Render figures inside the notebook and set the default figure size
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12, 10)

In [None]:
# English stop words provided by the stop_words package
STOPWORDS = stop_words.get_stop_words(language='en')

### Clean data

In [None]:
# This is the syntax of SpaCy (check documentation online for tutorials https://spacy.io/usage/)
# NOTE(review): the 'en' shortcut name presumably depends on the installed spaCy version;
# newer releases expect the full package name (e.g. 'en_core_web_sm') - confirm against your environment
nlp = spacy.load('en')

In [None]:
# Use the SpaCy nlp() object from the cell above to analyse each comment.
# The text is lower-cased before parsing and the parsed result is stored in a new column.
df_english['nlp_spacy'] = df_english.text.progress_map(lambda comment: nlp(comment.lower()))

In [None]:
# len() of a parsed comment counts its tokens (NOTE(review): presumably this includes punctuation tokens, not only words)
df_english['nlp_spacy_len'] = df_english['nlp_spacy'].map(len)

In [None]:
# Keep only the nouns of each comment, to define better topics.
# Each noun is stored as its lemma (w.lemma_), not as the surface form found in the text.
df_english['noun_tokens'] = df_english.nlp_spacy.progress_map(
    lambda n: [w.lemma_ for w in n if w.pos_=='NOUN'])

In [None]:
# Number of noun lemmas kept for each comment
df_english['noun_tokens_len'] = df_english.noun_tokens.map(len)

In [None]:
# Summary statistics of the number of nouns per comment (all English comments)
df_english[['noun_tokens_len']].describe()

In [None]:
# Build the model on comments that have at least 3 nouns (arbitrary threshold to keep comments with enough topic content)
df_try = df_english.loc[df_english.noun_tokens_len>=3]

In [None]:
# Same summary statistics, restricted to the comments kept for modelling
df_try[['noun_tokens_len']].describe()

In [None]:
# Detect combinations of 2 words that have a specific meaning together, then run the
# detection again on the bigram-merged documents to catch combinations of 3 words
noun_docs = df_try.noun_tokens.tolist()
bigram = Phrases(noun_docs, min_count=3)
trigram = Phrases(bigram[noun_docs], min_count=3)

In [None]:
# Preview the first rows of the modelling dataset
df_try.head()

In [None]:
# Preview the first 10 documents after applying the bigram and trigram models
list(trigram[bigram[df_try.noun_tokens.tolist()]])[:10]

In [None]:
#sw = ['iphone', 'phone', '+', 'gb', 'edge', 'one', 'galaxy', 'samsung', 'galaxy_s8', 'plus', 
#      's5', 'series', 'x', 'x.', '5s', '6s', '7', '8+', 'android', 'apple', 'ios', 'iâ\x80\x99m', 
#      's4', 's5', 's6', 's7', 's8', 'bestbuy', 'best_buy', 'wife', 'son', 'daughter']
#no custom stop words for now except iphone
sw = ["iphone"]

# Build the exclusion set once: membership tests are O(1), and the union works whether
# STOPWORDS is currently bound to a list or to a (frozen)set, so the cell can be re-run safely
excluded_words = set(STOPWORDS) | set(sw)

# Apply the phrase models, then drop stop words (a single pass is enough: the filter is idempotent)
tokens = [
    [t for t in doc if t.lower() not in excluded_words]
    for doc in trigram[bigram[df_try.noun_tokens.tolist()]]
]

dictionary = corpora.Dictionary(tokens)
print(dictionary)
# Ignore words that appear in fewer than 3 documents; the other thresholds of
# filter_extremes (no_above, keep_n) keep their gensim defaults
dictionary.filter_extremes(no_below=3)
print(dictionary)
# Bag-of-words representation of each document, restricted to the filtered dictionary
corpus = [dictionary.doc2bow(tok) for tok in tokens]

In [None]:
# Preview the first documents only: echoing the full token list floods the notebook output
tokens[:10]

In [None]:
# NOTE: this import rebinds the name STOPWORDS, which an earlier cell assigned from
# stop_words.get_stop_words(); every cell executed after this one sees gensim's collection instead
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel, LsiModel
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from gensim import matutils
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from collections import defaultdict
import pyLDAvis.gensim as gensimvis
import pyLDAvis
import pandas as pd
import numpy as np
import pickle

In [None]:
# Number of topics the LDA model is asked to find
num_topics = 20

In [None]:
# Fit the LDA model.
# - eta gives every word of the dictionary the same small topic-word prior (0.0001)
# - random_state fixes the seed so that the topics are the same on every run
topics = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=4,
                  eta=[0.0001] * len(dictionary.keys()), random_state=42)
# Interactive visualisation of the fitted topics
vis_data = gensimvis.prepare(topics, corpus, dictionary)
pyLDAvis.display(vis_data)

In [None]:
# The 10 highest-weighted words of every topic
topics.show_topics(num_topics=num_topics, num_words=10, formatted=True)

In [None]:
# For each article, the loop in the next cell prints the beginning of the article,
# the topic allocation for this document,
# and the 5 main keywords of the first 2 topics of this document

import math # for rounding
# One (topic id, formatted 5-keyword string) entry per topic; the next cell indexes this list by topic id
topics_names = topics.show_topics(num_topics=num_topics, num_words=5, formatted=True)

In [None]:
# Show the first 12 articles with their topic allocation and the keywords of their first 2 topics
for i, article in enumerate(df_try.text.tolist()[:12]):
    # (topic id, probability) pairs, probabilities rounded up to 2 decimals for better printing
    topics_in_article = [(topic_id, math.ceil(prob * 100) / 100) for topic_id, prob in topics[corpus[i]]]
    # keyword strings of (at most) the first 2 topics listed for this article
    key_words_topic = [topics_names[topic_id][1] for topic_id, _ in topics_in_article[:2]]

    excerpt = "article #" + str(i) + ": " + article[:400].replace('\n', '')
    keyword_lines = '\n'.join(["keywords topic: " + kw for kw in key_words_topic])
    print(excerpt + '\n' + str(topics_in_article) + '\n' + keyword_lines)
    print()

In [None]:
# Priors held by the fitted model: alpha and eta (eta was passed explicitly when the model was built)
print(topics.alpha)
print(topics.eta)

# Graph of words to detect problems

A graph of words connects words that are neighbors in sentences. <br>For instance, for the sentence "Graph of words use words that are neighbors in sentences", the table below lists the pairs of neighboring words:

| word_1 | word_2 |
|-|-|
| Graph | of |
| of | words |
| words | use |
| use | words |
| words | that |
| that | are |
| are | neighbors |
| neighbors | in |
| in | sentences |

We'll also handle a few refinements:
- keeping only nouns
- using words that are 2nd-order neighbors (the neighbor of a neighbor)
- giving direct neighbors a higher weight (2) than 2nd-order neighbors (1)

In [None]:
import networkx as nx # to analyse graphs in Python

In [None]:
# We aggregate data from ALL the comments kept in df_try:
# one list of noun lemmas per comment
clean_text = df_try.noun_tokens.tolist()

# The functions below will help us build the dataframe of words that are neighbors
def clean_stop_words_in_dataframe(df, stop_words):
    """
    Drop every row whose first or second column holds a stop word.

    Arguments:
    ----------
    df -- pandas.DataFrame
        table with a word pair in its first two columns

    stop_words -- collection of str
        words to exclude (anything accepted by pandas.Series.isin)

    Returns:
    --------
    pandas.DataFrame with the remaining rows of df; index labels are kept as they are

    """
    # Judge each row by position with a boolean mask instead of collecting index labels:
    # with duplicate index labels, label-based filtering would also drop unrelated rows
    has_stop_word = df.iloc[:, 0].isin(stop_words) | df.iloc[:, 1].isin(stop_words)
    return df.loc[~has_stop_word.to_numpy()]

def word_neighbors(dist, sentences=None, stop_words=None):
    """
    Build the table of word pairs that are `dist` positions apart in the same sentence.

    Arguments:
    ----------
    dist -- int >= 1
        distance between the two words (1 = direct neighbors, 2 = 2nd-order neighbors)

    sentences -- list of lists of str, optional
        tokenised sentences; defaults to the notebook-level clean_text

    stop_words -- collection of str, optional
        words whose pairs are discarded; defaults to the notebook-level STOPWORDS

    Returns:
    --------
    pandas.DataFrame with columns 'w0' and 'w1', one row per pair, without any pair
    that contains a stop word

    """
    if dist < 1:
        raise ValueError("dist must be a positive integer")
    if sentences is None:
        sentences = clean_text
    if stop_words is None:
        stop_words = STOPWORDS

    # zip stops at the shorter sequence, so word i is paired with word i + dist and
    # sentences shorter than dist + 1 words simply contribute no pair
    pairs = [pair for sentence in sentences for pair in zip(sentence, sentence[dist:])]
    # A single DataFrame built from all the pairs (also valid when there is no pair at all)
    neighbors = pd.DataFrame(pairs, columns=['w0', 'w1'])
    return clean_stop_words_in_dataframe(neighbors, stop_words=stop_words)

In [None]:
clean_text[0] # noun lemmas of the first comment kept in df_try

In [None]:
# This creates a huge table of all the words that are neighbors and 2nd-order neighbors
# For neighbors we use weight = 2, for 2nd-order neighbors we use weight = 1
# pd.concat is used because DataFrame.append was removed in pandas 2.0; as with append,
# the index labels of both tables are kept as they are
data_graph_of_words = pd.concat([
    word_neighbors(1).assign(weight=2),
    word_neighbors(2).assign(weight=1),
])

In [None]:
# Preview the first word pairs and their weights
data_graph_of_words.head()

In [None]:
# We sum the weights for all combinations of neighbors.
# The graph built from this table is undirected, so (a, b) and (b, a) describe the same edge:
# each pair is first put in alphabetical order, otherwise the two orientations would be summed
# separately and only one of the two totals would end up as the weight of the edge.
_w0 = data_graph_of_words['w0'].to_numpy()
_w1 = data_graph_of_words['w1'].to_numpy()
_swap = _w0 > _w1
data_graph_of_words = (
    data_graph_of_words
    .assign(w0=np.where(_swap, _w1, _w0), w1=np.where(_swap, _w0, _w1))
    .groupby(['w0', 'w1'])['weight'].sum()
    .reset_index()
)

In [None]:
nx.__version__  # not echoed: the cell ends with an assignment, so nothing is displayed
# If you have previous versions, the function might be nx.from_pandas_dataframe()
# Undirected graph: one node per word, one edge per (w0, w1) row carrying its 'weight'
graph_of_words = nx.from_pandas_edgelist(data_graph_of_words, source='w0', target='w1', edge_attr='weight',create_using=nx.Graph())

In [None]:
# We select the word "problem" and the words directly connected to it in the graph
# (the edges were built from both neighbors and 2nd-order neighbors)
graph_of_words_center = nx.ego_graph(graph_of_words, n='problem', radius=1)
print(graph_of_words_center.size())  # number of edges
print(len(graph_of_words_center))  # number of nodes
graph_of_words_center.nodes()

In [None]:
# Which words are the most connected to "problem"?
# The degree is weighted: for each word, the weights of its edges inside the "problem" ego graph are added up
weighted_degree = dict(graph_of_words_center.degree(graph_of_words_center.nodes(), weight='weight'))
degree_table = (
    pd.DataFrame.from_dict([weighted_degree]).T
    .rename(columns={0:'degree'})
    .reset_index()
    .rename(columns={'index':'word'})
)
degree_table.sort_values('degree', ascending=False)

In [None]:
# Draw the "problem" ego graph as it is, with small nodes and no labels
nx.draw(graph_of_words_center, node_size=20)
# It doesn't give us a lot of information, except that many words connected to "problem" are connected together
# (there's more than one line for each dot)

In [None]:
# We can use the PageRank algorithm to see if some words are more connected than others.
# It is computed on the whole graph of words (not only around "problem"); alpha is PageRank's damping parameter.
# The result is a table indexed by word with a single 'pagerank' column.
pagerank = pd.DataFrame.from_dict([nx.pagerank(G=graph_of_words, alpha=0.99)]).T.rename(columns={0:'pagerank'})

In [None]:
# Words sorted by decreasing PageRank.
# It confirms what we had with LDA: "phone", "screen", "iphone"... are connected to too many words
pagerank.sort_values('pagerank', ascending=False)

In [None]:
# Let's group words into communities, and see if it makes sense in terms of topics
# The code is taken from the link below
# https://stackoverflow.com/questions/43541376/how-to-draw-communities-with-networkx
def community_layout(g, partition):
    """
    Lay out a graph so that the nodes of a community are drawn close together.

    Every node gets the position of its community plus its own position inside
    that community.


    Arguments:
    ----------
    g -- networkx.Graph or networkx.DiGraph instance
        graph to plot

    partition -- dict mapping node -> community
        graph partitions


    Returns:
    --------
    pos -- dict mapping node -> position (sum of the community and in-community positions)
        node positions

    """

    # communities are spread on a larger scale (3.) than the nodes inside each of them (1.)
    community_offsets = _position_communities(g, partition, scale=3.)
    local_offsets = _position_nodes(g, partition, scale=1.)

    return {node: community_offsets[node] + local_offsets[node] for node in g.nodes()}

def _position_communities(g, partition, **kwargs):
    """
    Positions the communities relative to each other.

    A directed graph with one node per community is laid out with
    nx.spring_layout (which receives **kwargs); the weight of an edge is the
    number of edges of g recorded between the two communities. Every node of the
    partition is then given the position of its community.
    """

    hypergraph = nx.DiGraph()
    hypergraph.add_nodes_from(set(partition.values()))
    for (ci, cj), edges in _find_between_community_edges(g, partition).items():
        hypergraph.add_edge(ci, cj, weight=len(edges))

    # layout of the communities themselves
    community_pos = nx.spring_layout(hypergraph, **kwargs)

    # each node inherits the position of the community it belongs to
    return {node: community_pos[community] for node, community in partition.items()}

def _find_between_community_edges(g, partition):

    edges = dict()

    for (ni, nj) in g.edges():
        ci = partition[ni]
        cj = partition[nj]

        if ci != cj:
            try:
                edges[(ci, cj)] += [(ni, nj)]
            except KeyError:
                edges[(ci, cj)] = [(ni, nj)]

    return edges

def _position_nodes(g, partition, **kwargs):
    """
    Positions nodes within communities.

    The nodes of each community are laid out separately, by applying
    nx.spring_layout (which receives **kwargs) to the subgraph they induce.
    Returns a single dict mapping every node to its position.
    """

    # community -> list of its nodes
    members = dict()
    for node, community in partition.items():
        members.setdefault(community, []).append(node)

    pos = dict()
    for nodes in members.values():
        pos.update(nx.spring_layout(g.subgraph(nodes), **kwargs))

    return pos

In [None]:
# to install networkx 2.0 compatible version of python-louvain use:
# pip install -U git+https://github.com/taynaud/python-louvain.git@networkx2
from community import community_louvain


In [None]:
# Communities around the word "problem"
# To save picture, right click on the picture and select "Save image as..."
matplotlib.rcParams['figure.figsize'] = (40, 40)
G = nx.ego_graph(G=graph_of_words, radius=1, n='problem')
partition = community_louvain.best_partition(G)
pos = community_layout(g=G, partition=partition)
# The community of each node is looked up explicitly, so that colours follow the order of
# G.nodes() instead of relying on the key order of the partition dict
nx.draw(G, pos, node_color=[partition[node] for node in G.nodes()],
        labels={node: node for node in G.nodes()}, font_color='black', font_size=16, font_weight='bold',
        edge_color='lightgray')

In [None]:
# Weight of the edge between "problem" and each of its neighbors in G, heaviest first
neighbor_weights = pd.DataFrame(list(G['problem'].items())).rename(columns={0:'word', 1:'weight_attr'})
neighbor_weights = neighbor_weights.assign(
    weight=neighbor_weights.weight_attr.map(lambda cell: cell['weight']))
neighbor_weights.drop(['weight_attr'], axis=1).sort_values('weight', ascending=False)

In [None]:
# Communities around the word "issue"
G = nx.ego_graph(G=graph_of_words, radius=1, n='issue')
partition = community_louvain.best_partition(G)
pos = community_layout(g=G, partition=partition)
matplotlib.rcParams['figure.figsize'] = (40, 40)
# The community of each node is looked up explicitly, so that colours follow the order of
# G.nodes() instead of relying on the key order of the partition dict
nx.draw(G, pos, node_color=[partition[node] for node in G.nodes()],
        labels={node: node for node in G.nodes()}, font_color='black', font_size=16, font_weight='bold',
        edge_color='lightgray')

In [None]:
# Communities around both words: union of the "issue" and "problem" ego graphs
G = nx.compose_all([nx.ego_graph(G=graph_of_words, radius=1, n='issue'),
                    nx.ego_graph(G=graph_of_words, radius=1, n='problem')])
partition = community_louvain.best_partition(G)
pos = community_layout(g=G, partition=partition)
matplotlib.rcParams['figure.figsize'] = (40, 40)
# The community of each node is looked up explicitly, so that colours follow the order of
# G.nodes() instead of relying on the key order of the partition dict
nx.draw(G, pos, node_color=[partition[node] for node in G.nodes()],
        labels={node: node for node in G.nodes()}, font_color='black', font_size=16, font_weight='bold',
        edge_color='lightgray')