# Where to sell products
## Author: Luis Eduardo Ferro Diez, <a href="mailto:luis.ferro1@correo.icesi.edu.co">luis.ferro1@correo.icesi.edu.co</a>

This notebook contains the main work and contribution of the project, exploiting geo tagged social network data to predict where to sell specific products depending on the content being posted by the users.


In [1]:
import pandas as pd

tweets_path = "../../datasets/tweets_parquet"
tweets = pd.read_parquet(tweets_path, engine="pyarrow")
tweets.head()

Unnamed: 0,id,tweet,lang,favorite_count,retweet_count,is_retweet,user_id,user_name,user_followers_count,user_following_count,...,place_full_name,country,country_code,place_type,place_url,is_spam,year,month,day,hour
0,374048987046637568,@fizziero ngareb beud! !,id,0.0,0.0,0.0,389276837,syarifsidi,47.0,63.0,...,"Jatinegara, Jakarta Timur",Indonesia,ID,city,https://api.twitter.com/1.1/geo/id/9e0e6d510fb...,0.0,2013,9,1,1
1,374048987046625280,"@shahshahrul11 nak buat acano,keto den lg pent...",id,0.0,0.0,0.0,184869610,AliffSadali,194.0,247.0,...,"Keratong, Rompin",Malaysia,MY,city,https://api.twitter.com/1.1/geo/id/1deede127b2...,0.0,2013,9,1,1
2,374048987034419200,@adambeyer234: I already miss Jace like hell,en,0.0,0.0,0.0,363516745,rilez_sharp,582.0,666.0,...,"New York, US",United States,US,admin,https://api.twitter.com/1.1/geo/id/94965b2c453...,0.0,2013,9,1,1
3,374048987050823680,"""this is us""...nunca me voy a artar de verla.....",es,0.0,0.0,0.0,446125189,LUCRECIALg,13.0,65.0,...,Argentina,Argentina,AR,country,https://api.twitter.com/1.1/geo/id/4d3b316fe2e...,0.0,2013,9,1,1
4,374048991241310208,Aquí en una reunión casual (@ Dhamy's Bar) [pi...,es,0.0,0.0,0.0,44793849,charal3x,66.0,77.0,...,"Veracruz, Veracruz de Ignacio de la Llave",México,MX,city,https://api.twitter.com/1.1/geo/id/6c67fe933a6...,0.0,2013,9,1,1


In [2]:
tweets.columns

Index(['id', 'tweet', 'lang', 'favorite_count', 'retweet_count', 'is_retweet',
       'user_id', 'user_name', 'user_followers_count', 'user_following_count',
       'user_location', 'created_timestamp', 'hashtags', 'user_mentions',
       'user_id_mentions', 'expanded_urls', 'location_geometry',
       'place_geometry', 'place_id', 'place_name', 'place_full_name',
       'country', 'country_code', 'place_type', 'place_url', 'is_spam', 'year',
       'month', 'day', 'hour'],
      dtype='object')

Let's visualize the tweets on a map.

First, we will create a geopandas dataframe from the original data.

In [3]:
from shapely import wkt
import geopandas as gpd

# Some tweets might have appear without place or location geometry
tweets = tweets[(tweets.place_geometry.notnull()) | (tweets.location_geometry.notnull())]

def parse_geometry(geom):
    if geom:
        return wkt.loads(geom)
    else:
        return None

tweets.location_geometry = tweets.location_geometry.apply(parse_geometry)
tweets.place_geometry = tweets.place_geometry.apply(parse_geometry)

# Let's work first with the location geometry first
geo_tweets = gpd.GeoDataFrame(tweets, geometry='location_geometry')
geo_tweets.head()

Unnamed: 0,id,tweet,lang,favorite_count,retweet_count,is_retweet,user_id,user_name,user_followers_count,user_following_count,...,place_full_name,country,country_code,place_type,place_url,is_spam,year,month,day,hour
0,374048987046637568,@fizziero ngareb beud! !,id,0.0,0.0,0.0,389276837,syarifsidi,47.0,63.0,...,"Jatinegara, Jakarta Timur",Indonesia,ID,city,https://api.twitter.com/1.1/geo/id/9e0e6d510fb...,0.0,2013,9,1,1
1,374048987046625280,"@shahshahrul11 nak buat acano,keto den lg pent...",id,0.0,0.0,0.0,184869610,AliffSadali,194.0,247.0,...,"Keratong, Rompin",Malaysia,MY,city,https://api.twitter.com/1.1/geo/id/1deede127b2...,0.0,2013,9,1,1
2,374048987034419200,@adambeyer234: I already miss Jace like hell,en,0.0,0.0,0.0,363516745,rilez_sharp,582.0,666.0,...,"New York, US",United States,US,admin,https://api.twitter.com/1.1/geo/id/94965b2c453...,0.0,2013,9,1,1
4,374048991241310208,Aquí en una reunión casual (@ Dhamy's Bar) [pi...,es,0.0,0.0,0.0,44793849,charal3x,66.0,77.0,...,"Veracruz, Veracruz de Ignacio de la Llave",México,MX,city,https://api.twitter.com/1.1/geo/id/6c67fe933a6...,0.0,2013,9,1,1
5,374048991224160256,I really really hate texting unless we're talk...,en,0.0,0.0,0.0,542867684,ivonne_xoxo,367.0,310.0,...,"Chicago, IL",United States,US,city,https://api.twitter.com/1.1/geo/id/1d9a5370a35...,0.0,2013,9,1,1


Now let's visualize this information on a map

In [4]:
import ipyleaflet as ipy

gtdf = geo_tweets[["id", "location_geometry"]]
geo_data = ipy.GeoData(geo_dataframe=gtdf,
                       style={'properties': {'marker-size': 'small'}},
                       icon="https://upload.wikimedia.org/wikipedia/commons/thumb/3/35/Location_dot_blue.svg/64px-Location_dot_blue.svg.png",
                       name="Geo Tweets")

m = ipy.Map(center=(52.3, 8.0), 
            zoom=3,
            scroll_wheel_zoom=True,
            basemap=ipy.basemaps.Esri.WorldTopoMap)

m.add_layer(geo_data)
m.add_control(ipy.LayersControl())
m

Map(basemap={'url': 'http://server.arcgisonline.com/ArcGIS/rest/services/World_Topo_Map/MapServer/tile/{z}/{y}…

The idea is to detect clusters among these geo-tagged tweets, then perfomr a LDA topic detection and finally measure the relevance of each topic against a product or service to characterize each geographic cluster as per the product relationship.

For this, we need to first compute the clusters. We are going to use DBSCAN since one of it's properties is that it does not depend on a central tendency measurement and it is not constrained by the shape of the clusters.

Since we just want to detect clusters based on the geographic position, we just need the geometry.

In [5]:
import numpy as np

points = geo_tweets[geo_tweets.location_geometry.notnull()].location_geometry.apply(lambda p: [p.x, p.y])
points = np.array(points.values.tolist())
points.shape

(171, 2)

In [6]:
from sklearn.cluster import DBSCAN
from sklearn import metrics

# Since this is world data and data can be sparse, we use a small eps to find the clusters
db = DBSCAN(eps=0.05, min_samples=10, metric="cosine").fit(points)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
#print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
#print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
#print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
#print("Adjusted Rand Index: %0.3f"
#      % metrics.adjusted_rand_score(labels_true, labels))
#print("Adjusted Mutual Information: %0.3f"
#      % metrics.adjusted_mutual_info_score(labels_true, labels,
#                                           average_method='arithmetic'))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(points, labels))

Estimated number of clusters: 3
Estimated number of noise points: 6
Silhouette Coefficient: 0.768


Now, let's visualize the clusters in the map.

In [13]:
loc_tweets = geo_tweets[geo_tweets.location_geometry.notnull()]
loc_tweets["cluster"] = labels

colors = {0: "#FF0000", 1: "#0033FF", 2: "#00BB33", -1: "#000000"}

def create_marker(row):
    lat_lon = (row["location_geometry"].y, row["location_geometry"].x)
    cluster = row["cluster"]
    color = colors[cluster]
    return ipy.CircleMarker(location=lat_lon,
                           draggable=False,
                           fill_color=color,
                           fill_opacity=0.5,
                           radius=3,
                           stroke=False)
    
markers = loc_tweets.apply(create_marker, axis=1)
layer_group = ipy.LayerGroup(layers=tuple(markers.values))
m = ipy.Map(center=(52.3, 8.0), 
            zoom=3,
            scroll_wheel_zoom=True,
            basemap=ipy.basemaps.Esri.WorldTopoMap)
m.add_layer(layer_group)
m

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Map(basemap={'url': 'http://server.arcgisonline.com/ArcGIS/rest/services/World_Topo_Map/MapServer/tile/{z}/{y}…

Now, we need to perform topic modeling on each cluster. Since every single tweet is too short in content, the aggregated tweets of a cluster will be used as text corpora for the topic modeling analysis.

In [33]:
cluster_0 = loc_tweets[loc_tweets.cluster == 0]["tweet"].reset_index()
cluster_1 = loc_tweets[loc_tweets.cluster == 1]["tweet"].reset_index()
cluster_2 = loc_tweets[loc_tweets.cluster == 2]["tweet"].reset_index()
cluster_x = loc_tweets[loc_tweets.cluster == -1]["tweet"].reset_index()

In [34]:
cluster_0.head()

Unnamed: 0,index,tweet
0,0,@fizziero ngareb beud! !
1,1,"@shahshahrul11 nak buat acano,keto den lg pent..."
2,7,"""@3gerardpique: Congratulations to Bayern Münc..."
3,9,Just Give Me a Reason (feat. Nate Ruess) by P!...
4,11,Lahh ngapa gee ? Wkwk RT @nadyaleha: Widih -_-...


Data pre-processing

In [107]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np

np.random.seed(1234)
stemmer = PorterStemmer()

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos="n")) # Part of Speech NOUN

def preprocess(text):
    result = []
    for token in simple_preprocess(text):
        if token not in STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [108]:
doc_sample = cluster_1.iloc[0]["tweet"]
print(f"Original document: \n {doc_sample}\n\n")
print(f"Tokenized & lemmatized document: \n {preprocess(doc_sample)}")

Original document: 
 @adambeyer234: I already miss Jace like hell


Tokenized & lemmatized document: 
 ['adambey', 'miss', 'jace', 'like', 'hell']


For our purpose, all the tweets of a cluster makes a single document, hence before applying the preprocess function, we need to aggregate all the clusters into single row.

In [109]:
import pandas as pd

test = ', '.join(cluster_1.tweet.values)
corpus = {'cluster': [0, 1, 2, -1],
         'corpus': [', '.join(cluster_0.tweet.values), ', '.join(cluster_1.tweet.values), ', '.join(cluster_2.tweet.values), ', '.join(cluster_x.tweet.values)]}
corpus_df = pd.DataFrame.from_dict(corpus)
corpus_df.head()

Unnamed: 0,cluster,corpus
0,0,"@fizziero ngareb beud! !, @shahshahrul11 nak b..."
1,1,"@adambeyer234: I already miss Jace like hell, ..."
2,2,"Wind 4.0 mph SSE. Barometer 1040.0 mb, Falling..."
3,-1,"A mi amiga le gusta el koyac, Amanha e anivers..."


In [110]:
preprocessed_docs = corpus_df.corpus.apply(preprocess)
preprocessed_docs.head(10)

0    [fizziero, ngareb, beud, shahshahrul, buat, ac...
1    [adambey, miss, jace, like, hell, aquí, reunió...
2    [wind, baromet, fall, slowli, temperatur, rain...
3    [amiga, gusta, koyac, amanha, aniversario, min...
Name: corpus, dtype: object

Now we need to create the dictionary

In [111]:
from gensim.corpora import Dictionary

dictionary = Dictionary(preprocessed_docs)
words = [(k, v) for k, v in dictionary.items()]
words[:10]

[(0, 'acano'),
 (1, 'adiik'),
 (2, 'afternoon'),
 (3, 'airport'),
 (4, 'ais_jbr'),
 (5, 'aish'),
 (6, 'akihabara'),
 (7, 'alasan'),
 (8, 'alay'),
 (9, 'aleh')]

Now let's filter some terms according to word occurence

In [112]:
dictionary.filter_extremes(no_below=1, keep_n=100000)

Now for each document we will create a Bag of Words

In [113]:
bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]
bow_corpus[0][:10]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1)]

In [114]:
bow_corpus_0 = bow_corpus[0]
for i in range(len(bow_corpus_0[:10])):
    word = bow_corpus_0[i][0]
    count = bow_corpus_0[i][1]
    print(f"Word {word} ('{dictionary[word]}') appears {count} times.")



Word 0 ('acano') appears 1 times.
Word 1 ('adiik') appears 1 times.
Word 2 ('afternoon') appears 1 times.
Word 3 ('airport') appears 1 times.
Word 4 ('ais_jbr') appears 1 times.
Word 5 ('aish') appears 1 times.
Word 6 ('akihabara') appears 1 times.
Word 7 ('alasan') appears 1 times.
Word 8 ('alay') appears 1 times.
Word 9 ('aleh') appears 1 times.


As an alternative, we can create a TF-IDF model fromthe bow corpus

In [115]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

corpus_tfidf[0][:10]

[(0, 0.04538427298976458),
 (1, 0.04538427298976458),
 (2, 0.04538427298976458),
 (3, 0.04538427298976458),
 (4, 0.04538427298976458),
 (5, 0.04538427298976458),
 (6, 0.04538427298976458),
 (7, 0.04538427298976458),
 (8, 0.04538427298976458),
 (9, 0.04538427298976458)]

Running LDA with BoW

In [116]:
from gensim.models import LdaMulticore

lda_model = LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)


In [117]:
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx} \nWords: {topic}")

Topic 0 
Words: 0.015*"lagi" + 0.005*"randik_akew" + 0.005*"buat" + 0.004*"siang" + 0.004*"jadi" + 0.004*"wkwk" + 0.004*"udah" + 0.004*"tahun" + 0.004*"kenapa" + 0.004*"siap"
Topic 1 
Words: 0.004*"lagi" + 0.002*"test" + 0.002*"tapi" + 0.002*"siang" + 0.002*"semua" + 0.002*"school" + 0.002*"jadi" + 0.002*"wkwk" + 0.002*"galau" + 0.002*"home"
Topic 2 
Words: 0.013*"text" + 0.010*"hoe" + 0.010*"home" + 0.009*"hard" + 0.009*"work" + 0.009*"come" + 0.007*"talk" + 0.007*"aint" + 0.007*"blast" + 0.007*"like"
Topic 3 
Words: 0.003*"today" + 0.003*"text" + 0.003*"come" + 0.003*"удачи" + 0.003*"humid" + 0.003*"prender" + 0.003*"work" + 0.003*"fall" + 0.003*"rain" + 0.003*"worth"
Topic 4 
Words: 0.018*"lagi" + 0.007*"jadi" + 0.005*"famou" + 0.005*"good" + 0.005*"kasian" + 0.005*"tapi" + 0.005*"semua" + 0.005*"dessert" + 0.005*"school" + 0.004*"test"
Topic 5 
Words: 0.011*"asikaqedi" + 0.011*"mahunnyb" + 0.011*"gusta" + 0.011*"minha" + 0.011*"koyac" + 0.011*"sogra" + 0.011*"joda" + 0.011*"deprimi

In [118]:
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
