# Pinterest API Data

In [None]:
import requests
### Pinterest API connection with paging
def pinterest_paging(url='', params='', timeout=30):
    """Download every page of a paginated Pinterest API listing.

    The first request is sent to ``url`` with ``params``. Each following
    request is sent to the URL found under ``response['page']['next']``,
    without ``params``, until that value is missing or empty.

    Args:
        url: Address of the first page.
        params: Query parameters for the first request only.
        timeout: Seconds to wait for each HTTP response before giving up
            (passed straight to ``requests.get``).

    Returns:
        A list with the ``data`` items of all pages, in download order.

    Raises:
        requests.exceptions.RequestException: on connection problems,
            timeouts or HTTP error status codes.
        KeyError: if a successful response carries no ``data`` entry.
    """
    data = []

    response = requests.get(url=url, params=params, timeout=timeout)
    while True:
        # Fail loudly on 4xx/5xx instead of tripping over the error payload.
        response.raise_for_status()
        payload = response.json()
        data.extend(payload['data'])

        # A missing "page" object or a null/empty "next" both mean that
        # there is nothing left to download.
        next_url = (payload.get('page') or {}).get('next')
        if not next_url:
            return data

        response = requests.get(url=next_url, timeout=timeout)

In [None]:
import pandas as pd
from pandas.io.json import json_normalize

# Pin attributes to request, and the token that authorises the call.
fields = 'id, board, note, counts'
access_token = 'YOUR_ACCESS_TOKEN'

# Download all pages of the authenticated user's pins.
pin_data = pinterest_paging(
    params=dict(access_token=access_token, fields=fields),
    url='https://api.pinterest.com/v1/me/pins/',
)

# Turn the list of JSON records into a flat table.
pins = json_normalize(pin_data)

In [None]:
# Number of pins (rows with a non-null 'board.name') for each board name.
pins.groupby('board.name')['board.name'].count()

# Pinterest API Crawling

In [None]:
import sys
from bs4 import BeautifulSoup
import urllib2
import time
from selenium import webdriver

import pprint

# Python 2 only: `reload` is a builtin there, and re-importing `sys` makes
# `setdefaultencoding` reachable again so that the process-wide default string
# encoding can be switched to UTF-8. The two statements must stay in this order.
# NOTE(review): presumably needed so that the str(...) conversions of scraped
# text further down do not fail on non-ASCII characters - confirm before
# removing.
reload(sys)  # Reload does the trick!
sys.setdefaultencoding('UTF8')

# Gets the specific filters available on the Pinterest page
def get_subtopics(soup):
    """Print and return the texts of the ``span.guideText`` elements.

    Args:
        soup: Parsed page; must offer ``findAll(name, attrs)`` whose results
            expose a ``contents`` sequence.

    Returns:
        A list with ``str(contents[0])`` of every matching span, in document
        order. Spans without any content are skipped instead of raising
        ``IndexError``.
    """
    sub_topics = [
        str(guide.contents[0])
        for guide in soup.findAll('span', {"class": "guideText"})
        if guide.contents
    ]

    # The list is still printed for interactive use, and now also returned so
    # that callers can actually work with it.
    print(sub_topics)
    return sub_topics

#Gets the pin attributes for a given query
def _pin_text_utf8(tag):
    """Return the text of ``tag`` UTF-8 encoded, stripped and passed to str().

    ``tag`` may be None, in which case an empty string is returned.
    """
    if tag is None:
        return ""
    return str(tag.getText().encode('utf-8').strip())


def get_pins(soup,query_term):
    """Collect the attributes of every pin in a parsed search-result page.

    Args:
        soup: Parsed page; must offer ``findAll(name, attrs)``. Each result
            must offer ``find(name, attrs)``.
        query_term: Stored unchanged as the last field of every tuple.

    Returns:
        A list of tuples
        ``(pin_href, title, description, user, image_src, query_term)``,
        one per ``div.pinWrapper`` element. Title, description and user are
        empty strings when the matching element is absent. Wrappers without a
        ``div.pinHolder`` that contains both a link and an image are skipped,
        so one malformed pin no longer aborts the whole page.
    """
    pins = soup.findAll('div', {"class": "pinWrapper"})
    print(len(pins))

    all_pins = []
    for pin in pins:
        holder = pin.find('div', {"class": "pinHolder"})
        if holder is None or holder.a is None or holder.img is None:
            continue

        pin_id = str(holder.a['href'])
        pin_img = str(holder.img['src'])

        # Each element is looked up once and reused, not searched for twice.
        title_tag = pin.find('h3', {"class": "richPinGridTitle"})
        pin_title = "" if title_tag is None else str(title_tag.getText())

        pin_description = _pin_text_utf8(pin.find('p', {'class': 'pinDescription'}))
        pin_user = _pin_text_utf8(pin.find('div', {'class': 'creditName'}))

        all_pins.append((pin_id, pin_title.strip(), pin_description.strip(),
                         pin_user, pin_img, query_term))

    return all_pins

        #print pinId + '\t' + re_pins + '\t' + '\t' + pin_title + '\t' #+ pin_description + '\t' #+ pin_img+'\t'+query_term

def query_pinterest(query_term='forever21', scrolls=100,
                    username="cn@thedatastrategy.com", password="tds123,",
                    driver_path="/users/michal/py_workspace/book/pinterest_handlers/chromedriver"):
    """Log in to Pinterest with Chrome and scrape the pins of a search.

    The browser logs in, opens the French search page for ``query_term``,
    scrolls to the bottom ``scrolls`` times and parses the page after every
    scroll with ``get_pins``.

    Args:
        query_term: Search text; spaces are replaced by ``+`` before it is
            appended to the search URL and handed to ``get_pins``.
        scrolls: How many times the page is scrolled and re-parsed.
        username: Account name typed into the login form.
        password: Password typed into the login form.
        driver_path: Location of the chromedriver executable.

    Returns:
        A list of the pin tuples produced by ``get_pins``, in the order they
        were first seen. Every scroll re-parses the whole page, so a pin that
        is still present after the next scroll is reported again; such
        repeats are dropped here.

    SECURITY: the credential defaults are the literals that used to be
    hard-coded in the body. They are kept only so existing calls behave as
    before. Pass credentials explicitly and do not commit real ones.
    """
    search_term = query_term.replace(' ', '+')
    url = 'https://fr.pinterest.com/search/pins/?q=' + search_term

    options = webdriver.ChromeOptions()
    options.add_argument("--kiosk")
    driver = webdriver.Chrome(chrome_options=options, executable_path=driver_path)

    all_pins = []
    seen = set()
    try:
        # login
        driver.get("https://www.pinterest.com/login/")
        time.sleep(5)
        username_field = driver.find_element_by_name("id")
        password_field = driver.find_element_by_name("password")
        username_field.send_keys(username)
        password_field.send_keys(password)
        driver.find_element_by_xpath("//*[@type='submit']").submit()

        # Fixed pauses give the login and the result page time to load.
        time.sleep(10)
        driver.get(url)
        time.sleep(10)

        for _ in range(scrolls):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)

            soup = BeautifulSoup(driver.page_source)
            for pin in get_pins(soup, search_term):
                if pin not in seen:
                    seen.add(pin)
                    all_pins.append(pin)
            time.sleep(4)
    finally:
        # quit() ends the whole browser session, also when scraping failed;
        # close() would only close the current window.
        driver.quit()

    return all_pins




# query_pinterest() returns a plain list of tuples, and a list has no
# `.columns` attribute. Build a DataFrame and name its six fields instead.
pins = pd.DataFrame(
    query_pinterest('fashion'),
    columns=['id','title','description','username','img','query_term'],
)

# Bigram extraction

In [None]:
import nltk
from nltk.collocations import *
from nltk.corpus import stopwords
import re

def preprocess(text):
    """Clean a text and split it into lower-case tokens without stop words.

    Steps:
        1. Strip surrounding whitespace, delete everything from an
           ``http://`` / ``https://`` prefix up to the end of that line
           (including the following line breaks), replace every character
           that is neither a word character nor whitespace by a space, and
           lower-case the result.
        2. Tokenize with ``nltk.word_tokenize``.
        3. Drop English and French NLTK stop words plus a few extra tokens.

    Args:
        text: The raw text of a single document.

    Returns:
        The list of remaining tokens, in their original order.
    """
    # 1) Basic cleaning
    text = text.strip()
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^\w\s]', ' ', text)
    text = text.lower()

    # 2) Tokenize single comment
    tokens = nltk.word_tokenize(text)

    # 3) Stop-word removal; a set gives constant-time membership tests
    #    instead of scanning a long list once per token.
    stopwords_set = set(stopwords.words('english'))
    stopwords_set.update(stopwords.words('french'))
    stopwords_set.update(['ici','les','mu','every','dautres','www','com'])

    return [token for token in tokens if token not in stopwords_set]


# Tokenize the note of every pin; each pin becomes one document.
pins['clean'] = pins['note'].apply(preprocess)
list_documents = pins['clean'].tolist()

# Count bigrams inside the documents and ignore those seen fewer than twice.
bigram_measures = nltk.collocations.BigramAssocMeasures()
bigram_finder = BigramCollocationFinder.from_documents(list_documents)
bigram_finder.apply_freq_filter(2)

# Raw-frequency scores of all kept bigrams, and the 20 best of them.
freqs = bigram_finder.score_ngrams(bigram_measures.raw_freq)
bigrams = bigram_finder.nbest(bigram_measures.raw_freq, 20)

# (bigram, count) pairs, most frequent first.
ngram = sorted(bigram_finder.ngram_fd.items(),
               key=lambda pair: pair[1], reverse=True)

# The same pairs, with the two words joined into one phrase.
frequency = [(" ".join(bigram), count) for bigram, count in ngram]

# Build topic graph

In [None]:
import numpy as np
import networkx as nx

# The bigram phrases become the feature columns.
columns = [phrase for phrase, _count in frequency]

# Cleaned tokens of each pin as one space-separated string.
pins['clean_join'] = pins['clean'].apply(" ".join)

# One boolean column per phrase: does the pin text contain it?
for phrase in columns:
    pins[phrase] = pins['clean_join'].str.lower().str.contains(phrase)

# phrase x phrase matrix: in how many pins do two phrases occur together.
df_asint = pins[columns].astype(int)
coocc = df_asint.T.dot(df_asint).values

# A phrase co-occurring with itself carries no information.
np.fill_diagonal(coocc, 0)

# Weighted graph with one node per phrase.
G = nx.from_numpy_matrix(coocc)



import matplotlib.pyplot as plt

from networkx import node_connected_component

# Keep only the connected component that contains the last node of the
# co-occurrence matrix.
# NOTE(review): the last node is the least frequent phrase; possibly the
# largest component was meant - confirm.
connected = list(node_connected_component(G, (len(coocc)-1)))
G = G.subgraph(connected)

# positions for all nodes
pos = nx.fruchterman_reingold_layout(G)

# nodes
nx.draw_networkx_nodes(G, pos, node_size=500, alpha=0.3, node_color='blue')

# edges (undirected graph); G already is the wanted component, so it does not
# have to be computed a second time here.
nx.draw_networkx_edges(G, pos, width=2, alpha=0.2)

# node index -> phrase
labels = {i: df_asint.columns[i] for i in connected}
nx.draw_networkx_labels(G, pos, labels, font_size=12)

plt.axis('off')
plt.savefig("topics_graph.png")  # save as png
plt.show()  # display


# Build user graph

In [None]:
import nltk
from nltk.collocations import *
from nltk.corpus import stopwords
import re
import pandas as pd
import numpy as np
import networkx as nx

# SECURITY: unpickling can run arbitrary code; only load a pickle file that
# was produced by this project.
pins_scrap = pd.read_pickle("pins.pickle")
pins_scrap.columns = ['id','title','description','username','img','query_term']

# Re-interpret the text columns: encode as latin1, then decode as UTF-8.
for text_column in ('title', 'description', 'username'):
    pins_scrap[text_column] = pins_scrap[text_column].apply(
        lambda x: x.encode('latin1').decode('utf8'))

# Title and description of a pin as one text.
pins_scrap['title_desc'] = pins_scrap['title'] + " " + pins_scrap['description']

# All texts of a user joined by a space. Plain concatenation would glue the
# last word of one pin to the first word of the next one and so create
# tokens that never occurred.
users = pins_scrap.groupby('username')['title_desc'].apply(" ".join)
users = pd.DataFrame(users)


####Preprocessing

def preprocess(text):
    """Clean a text and split it into lower-case tokens without stop words.

    Steps:
        1. Strip surrounding whitespace, delete everything from an
           ``http://`` / ``https://`` prefix up to the end of that line
           (including the following line breaks), replace every character
           that is neither a word character nor whitespace by a space, and
           lower-case the result.
        2. Tokenize with ``nltk.word_tokenize``.
        3. Drop English and French NLTK stop words plus a few extra tokens.

    Args:
        text: The raw text of a single document.

    Returns:
        The list of remaining tokens, in their original order.
    """
    # 1) Basic cleaning
    text = text.strip()
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^\w\s]', ' ', text)
    text = text.lower()

    # 2) Tokenize single comment
    tokens = nltk.word_tokenize(text)

    # 3) Stop-word removal; a set gives constant-time membership tests
    #    instead of scanning a long list once per token.
    stopwords_set = set(stopwords.words('english'))
    stopwords_set.update(stopwords.words('french'))
    stopwords_set.update(['ici','les','mu','every','dautres','www','com','ã','tã','dã','faã','â','fr','co'])

    return [token for token in tokens if token not in stopwords_set]



# Tokenize the combined text of every user; each user becomes one document.
users['clean'] = users['title_desc'].apply(preprocess)
list_documents = users['clean'].tolist()

# Count bigrams inside the documents and ignore those seen fewer than twice.
bigram_measures = nltk.collocations.BigramAssocMeasures()
bigram_finder = BigramCollocationFinder.from_documents(list_documents)
bigram_finder.apply_freq_filter(2)

# Raw-frequency scores of all kept bigrams, and the 20 best of them.
freqs = bigram_finder.score_ngrams(bigram_measures.raw_freq)
bigrams = bigram_finder.nbest(bigram_measures.raw_freq, 20)

# (bigram, count) pairs, most frequent first.
ngram = sorted(bigram_finder.ngram_fd.items(),
               key=lambda pair: pair[1], reverse=True)

# The same pairs, with the two words joined into one phrase.
frequency = [(" ".join(bigram), count) for bigram, count in ngram]
#
#
#
#####Extract features

# The bigram phrases become the features; remember the user names before the
# frame is reshaped.
columns = [phrase for phrase, _count in frequency]
usernames = users.index.tolist()

# Cleaned tokens of each user as one space-separated string.
users['clean_join'] = users['clean'].apply(" ".join)

# One boolean column per phrase: does the user's text contain it?
for phrase in columns:
    users[phrase] = users['clean_join'].str.lower().str.contains(phrase)

# Keep the phrase columns only and flip the table: rows are phrases and
# columns are users afterwards.
users = users[columns].T

# user x user matrix: how many phrases two users have in common.
df_asint = users[usernames].astype(int)
coocc = df_asint.T.dot(df_asint).values

# A user sharing phrases with themselves carries no information.
np.fill_diagonal(coocc, 0)
#


#####graph


import matplotlib.pyplot as plt

from networkx import node_connected_component

# Weighted graph with one node per user.
G = nx.from_numpy_matrix(coocc)

# Keep only the connected component that contains the last node of the
# co-occurrence matrix.
# NOTE(review): the choice of the last node looks arbitrary; possibly the
# largest component was meant - confirm.
connected = list(node_connected_component(G, (len(coocc)-1)))
G = G.subgraph(connected)

# positions for all nodes
pos = nx.fruchterman_reingold_layout(G)

# nodes
nx.draw_networkx_nodes(G, pos, node_size=50, alpha=0.3, node_color='blue')

# edges (undirected graph); G already is the wanted component, so it does not
# have to be computed a second time here.
nx.draw_networkx_edges(G, pos, width=1, alpha=0.2)

# node index -> user name
labels = {i: df_asint.columns[i] for i in connected}
nx.draw_networkx_labels(G, pos, labels, font_size=5)

plt.axis('off')
plt.savefig("users_graph.png")  # save as png
plt.show()  # display


# Data analysis

In [None]:
# Maximum number of nodes shown in each centrality bar chart.
LIMIT = 25

def centrality_measures(G,labels):
    """Show one horizontal bar chart per centrality measure of ``G``.

    Degree, closeness, betweenness and eigenvector centrality are computed
    first. For each of them the ``LIMIT`` best scoring nodes are then drawn,
    best node at the top, and the chart is displayed.

    Args:
        G: The networkx graph to analyse.
        labels: Mapping from node to the text used as its bar label; it must
            contain every node that gets charted.
    """
    centralities = [
        ('degree centrality', nx.degree_centrality(G)),
        ('closeness centrality', nx.closeness_centrality(G)),
        ('betweenness centrality', nx.betweenness_centrality(G)),
        ('eigenvector centrality', nx.eigenvector_centrality(G)),
    ]

    for title, scores in centralities:
        # Highest score first, cut off after LIMIT nodes.
        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        top = ranked[:LIMIT]

        chart_labels = [labels[node] for node, _score in top]
        chart_measures = [score for _node, score in top]
        positions = range(0, len(chart_labels))

        ax = plt.subplot()
        ax.barh(positions, chart_measures, align='center', color='blue', ecolor='black')
        ax.set_yticks(positions)
        ax.set_yticklabels(chart_labels)
        ax.invert_yaxis()  # labels read top-to-bottom
        ax.set_xlabel(title)

        plt.show()


def centrality_measures_heatmap(G,labels):
    """Show a heat map of four centrality measures for every node of ``G``.

    Rows are degree, closeness, betweenness and eigenvector centrality.
    Columns are the nodes in the iteration order of ``G``. Each row is
    divided by its own maximum; a row whose maximum is zero is left as all
    zeros instead of raising ``ZeroDivisionError``.

    Args:
        G: The networkx graph to analyse.
        labels: Mapping from node to the text used as its column label. A
            node without an entry is labelled with the node itself.
    """
    centralities = [
        nx.degree_centrality(G),
        nx.closeness_centrality(G),
        nx.betweenness_centrality(G),
        nx.eigenvector_centrality(G),
    ]

    # Iterating the graph yields its nodes in every networkx version;
    # nodes_iter() only exists in 1.x.
    nodes = list(G)

    # normalize each measure by its largest value
    norm_measures = []
    for scores in centralities:
        row = [float(scores[node]) for node in nodes]
        peak = max(row) if row else 0.0
        norm_measures.append([value / peak for value in row] if peak else row)

    # Tick labels follow the same node order as the matrix columns. The order
    # of labels.values() is not guaranteed to match it.
    column_labels = [labels.get(node, node) for node in nodes]
    row_labels = ['degree','closeness','betweenness','eigenvector']

    ax = plt.subplot()
    ax.set_xticks(range(0, len(nodes)))
    ax.set_xticklabels(column_labels, minor=False)
    ax.set_yticks([0,1,2,3])
    ax.set_yticklabels(row_labels, minor=False)
    plt.xticks(rotation=90)
    plt.imshow(norm_measures, cmap='hot_r', interpolation='none')
    plt.show()


# Community detection

In [None]:
import community
import networkx as nx
import matplotlib.pyplot as plt
import random

#first compute the best partition
# Best partition of the graph: a mapping node -> community id.
partition = community.best_partition(G)

# drawing
size = float(len(set(partition.values())))
pos = nx.spring_layout(G)
count = 0.
# The loop variable must not be called `community`: that would replace the
# imported `community` module and break the next call to best_partition().
for community_id in set(partition.values()):
    count = count + 1.
    list_nodes = [node for node in partition.keys() if partition[node] == community_id]
    # One random colour per community; every channel stays within
    # [0, count/size], so later communities can get brighter colours.
    nx.draw_networkx_nodes(G, pos, list_nodes, node_size = 20,node_color = (count*random.uniform(0, 1)/size,count*random.uniform(0, 1)/size,count*random.uniform(0, 1)/size))

nx.draw_networkx_edges(G, pos, alpha=0.1)
plt.show()