# Dask for NLP

In [1]:
import dask.dataframe as dd
import coiled
import pandas as pd
import dask.bag as db

In [2]:
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.normalize import normalize_alef_maksura_ar
from camel_tools.utils.normalize import normalize_alef_ar
from camel_tools.utils.normalize import normalize_teh_marbuta_ar
from camel_tools.utils.dediac import dediac_ar
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tokenizers.morphological import MorphologicalTokenizer

## 1. Spin up Cluster

In [3]:
cluster = coiled.Cluster(
    name="dask-nlp",
    software="dask-nlp",
    n_workers=50,
    worker_cpu=8,
    worker_memory="24Gib",
    backend_options={'spot':'True'}
)

Output()

In [4]:
from distributed import Client
client = Client(cluster)
client


+---------+----------------+---------------+---------------+
| Package | client         | scheduler     | workers       |
+---------+----------------+---------------+---------------+
| msgpack | 1.0.3          | 1.0.2         | 1.0.2         |
| python  | 3.9.10.final.0 | 3.9.7.final.0 | 3.9.7.final.0 |
+---------+----------------+---------------+---------------+
Notes: 
-  msgpack: Variation is ok, as long as everything is above 0.6


0,1
Connection method: Cluster object,Cluster type: coiled.Cluster
Dashboard: http://3.224.127.214:8787,

0,1
Dashboard: http://3.224.127.214:8787,Workers: 50
Total threads: 400,Total memory: 1.50 TiB

0,1
Comm: tls://10.4.9.44:8786,Workers: 50
Dashboard: http://10.4.9.44:8787/status,Total threads: 400
Started: 37 minutes ago,Total memory: 1.50 TiB

0,1
Comm: tls://10.4.9.94:36373,Total threads: 8
Dashboard: http://10.4.9.94:43337/status,Memory: 30.74 GiB
Nanny: tls://10.4.9.94:40017,
Local directory: /dask-worker-space/worker-3k55ljif,Local directory: /dask-worker-space/worker-3k55ljif

0,1
Comm: tls://10.4.10.62:35267,Total threads: 8
Dashboard: http://10.4.10.62:43433/status,Memory: 30.74 GiB
Nanny: tls://10.4.10.62:35893,
Local directory: /dask-worker-space/worker-19dtlae8,Local directory: /dask-worker-space/worker-19dtlae8

0,1
Comm: tls://10.4.9.231:44029,Total threads: 8
Dashboard: http://10.4.9.231:44295/status,Memory: 30.74 GiB
Nanny: tls://10.4.9.231:35947,
Local directory: /dask-worker-space/worker-s5akmb7y,Local directory: /dask-worker-space/worker-s5akmb7y

0,1
Comm: tls://10.4.12.247:40495,Total threads: 8
Dashboard: http://10.4.12.247:35017/status,Memory: 30.74 GiB
Nanny: tls://10.4.12.247:40581,
Local directory: /dask-worker-space/worker-a_04jv3y,Local directory: /dask-worker-space/worker-a_04jv3y

0,1
Comm: tls://10.4.1.110:45589,Total threads: 8
Dashboard: http://10.4.1.110:34597/status,Memory: 30.74 GiB
Nanny: tls://10.4.1.110:42675,
Local directory: /dask-worker-space/worker-5zycwsa3,Local directory: /dask-worker-space/worker-5zycwsa3

0,1
Comm: tls://10.4.14.180:45463,Total threads: 8
Dashboard: http://10.4.14.180:32985/status,Memory: 30.74 GiB
Nanny: tls://10.4.14.180:34733,
Local directory: /dask-worker-space/worker-yrkrtkvy,Local directory: /dask-worker-space/worker-yrkrtkvy

0,1
Comm: tls://10.4.0.88:44491,Total threads: 8
Dashboard: http://10.4.0.88:41735/status,Memory: 30.74 GiB
Nanny: tls://10.4.0.88:45801,
Local directory: /dask-worker-space/worker-2pff1l6r,Local directory: /dask-worker-space/worker-2pff1l6r

0,1
Comm: tls://10.4.11.156:38629,Total threads: 8
Dashboard: http://10.4.11.156:43005/status,Memory: 30.74 GiB
Nanny: tls://10.4.11.156:41485,
Local directory: /dask-worker-space/worker-8mo0ouw2,Local directory: /dask-worker-space/worker-8mo0ouw2

0,1
Comm: tls://10.4.11.155:45345,Total threads: 8
Dashboard: http://10.4.11.155:38505/status,Memory: 30.74 GiB
Nanny: tls://10.4.11.155:42711,
Local directory: /dask-worker-space/worker-wxp9bf9n,Local directory: /dask-worker-space/worker-wxp9bf9n

0,1
Comm: tls://10.4.9.228:40701,Total threads: 8
Dashboard: http://10.4.9.228:35271/status,Memory: 30.74 GiB
Nanny: tls://10.4.9.228:44877,
Local directory: /dask-worker-space/worker-0le0b8it,Local directory: /dask-worker-space/worker-0le0b8it

0,1
Comm: tls://10.4.11.82:37753,Total threads: 8
Dashboard: http://10.4.11.82:39167/status,Memory: 30.74 GiB
Nanny: tls://10.4.11.82:37469,
Local directory: /dask-worker-space/worker-djb28v1p,Local directory: /dask-worker-space/worker-djb28v1p

0,1
Comm: tls://10.4.5.196:37703,Total threads: 8
Dashboard: http://10.4.5.196:45589/status,Memory: 30.74 GiB
Nanny: tls://10.4.5.196:45617,
Local directory: /dask-worker-space/worker-h075jtas,Local directory: /dask-worker-space/worker-h075jtas

0,1
Comm: tls://10.4.12.24:35949,Total threads: 8
Dashboard: http://10.4.12.24:44087/status,Memory: 30.74 GiB
Nanny: tls://10.4.12.24:44833,
Local directory: /dask-worker-space/worker-wbfpq4nb,Local directory: /dask-worker-space/worker-wbfpq4nb

0,1
Comm: tls://10.4.10.208:46089,Total threads: 8
Dashboard: http://10.4.10.208:34703/status,Memory: 30.74 GiB
Nanny: tls://10.4.10.208:40707,
Local directory: /dask-worker-space/worker-8l2ch6i6,Local directory: /dask-worker-space/worker-8l2ch6i6

0,1
Comm: tls://10.4.2.128:34531,Total threads: 8
Dashboard: http://10.4.2.128:34875/status,Memory: 30.74 GiB
Nanny: tls://10.4.2.128:34033,
Local directory: /dask-worker-space/worker-kpo2lnph,Local directory: /dask-worker-space/worker-kpo2lnph

0,1
Comm: tls://10.4.15.4:44707,Total threads: 8
Dashboard: http://10.4.15.4:40715/status,Memory: 30.74 GiB
Nanny: tls://10.4.15.4:40973,
Local directory: /dask-worker-space/worker-mlmr5ua1,Local directory: /dask-worker-space/worker-mlmr5ua1

0,1
Comm: tls://10.4.9.165:35215,Total threads: 8
Dashboard: http://10.4.9.165:40027/status,Memory: 30.74 GiB
Nanny: tls://10.4.9.165:39089,
Local directory: /dask-worker-space/worker-tbzwkiyx,Local directory: /dask-worker-space/worker-tbzwkiyx

0,1
Comm: tls://10.4.13.33:34231,Total threads: 8
Dashboard: http://10.4.13.33:46405/status,Memory: 30.74 GiB
Nanny: tls://10.4.13.33:34209,
Local directory: /dask-worker-space/worker-lr4s65yr,Local directory: /dask-worker-space/worker-lr4s65yr

0,1
Comm: tls://10.4.14.235:39449,Total threads: 8
Dashboard: http://10.4.14.235:35885/status,Memory: 30.74 GiB
Nanny: tls://10.4.14.235:35983,
Local directory: /dask-worker-space/worker-ang5lx64,Local directory: /dask-worker-space/worker-ang5lx64

0,1
Comm: tls://10.4.11.46:33157,Total threads: 8
Dashboard: http://10.4.11.46:45277/status,Memory: 30.74 GiB
Nanny: tls://10.4.11.46:34851,
Local directory: /dask-worker-space/worker-l6qstuen,Local directory: /dask-worker-space/worker-l6qstuen

0,1
Comm: tls://10.4.1.157:43357,Total threads: 8
Dashboard: http://10.4.1.157:37619/status,Memory: 30.74 GiB
Nanny: tls://10.4.1.157:45843,
Local directory: /dask-worker-space/worker-m1d3wqkh,Local directory: /dask-worker-space/worker-m1d3wqkh

0,1
Comm: tls://10.4.9.75:39251,Total threads: 8
Dashboard: http://10.4.9.75:40773/status,Memory: 30.74 GiB
Nanny: tls://10.4.9.75:33175,
Local directory: /dask-worker-space/worker-sjb3h5cg,Local directory: /dask-worker-space/worker-sjb3h5cg

0,1
Comm: tls://10.4.12.140:37823,Total threads: 8
Dashboard: http://10.4.12.140:44953/status,Memory: 30.74 GiB
Nanny: tls://10.4.12.140:37149,
Local directory: /dask-worker-space/worker-of46qnbe,Local directory: /dask-worker-space/worker-of46qnbe

0,1
Comm: tls://10.4.2.125:38513,Total threads: 8
Dashboard: http://10.4.2.125:43323/status,Memory: 30.74 GiB
Nanny: tls://10.4.2.125:44587,
Local directory: /dask-worker-space/worker-dfb3xpu1,Local directory: /dask-worker-space/worker-dfb3xpu1

0,1
Comm: tls://10.4.10.134:35601,Total threads: 8
Dashboard: http://10.4.10.134:40571/status,Memory: 30.74 GiB
Nanny: tls://10.4.10.134:44153,
Local directory: /dask-worker-space/worker-cwrgive2,Local directory: /dask-worker-space/worker-cwrgive2

0,1
Comm: tls://10.4.10.6:43085,Total threads: 8
Dashboard: http://10.4.10.6:37257/status,Memory: 30.74 GiB
Nanny: tls://10.4.10.6:36783,
Local directory: /dask-worker-space/worker-9jixjwbj,Local directory: /dask-worker-space/worker-9jixjwbj

0,1
Comm: tls://10.4.3.248:35411,Total threads: 8
Dashboard: http://10.4.3.248:37553/status,Memory: 30.74 GiB
Nanny: tls://10.4.3.248:33625,
Local directory: /dask-worker-space/worker-xboura4g,Local directory: /dask-worker-space/worker-xboura4g

0,1
Comm: tls://10.4.15.175:43639,Total threads: 8
Dashboard: http://10.4.15.175:43155/status,Memory: 30.74 GiB
Nanny: tls://10.4.15.175:38489,
Local directory: /dask-worker-space/worker-otikx9a9,Local directory: /dask-worker-space/worker-otikx9a9

0,1
Comm: tls://10.4.11.144:40423,Total threads: 8
Dashboard: http://10.4.11.144:45483/status,Memory: 30.74 GiB
Nanny: tls://10.4.11.144:33135,
Local directory: /dask-worker-space/worker-d5w34ayy,Local directory: /dask-worker-space/worker-d5w34ayy

0,1
Comm: tls://10.4.8.113:34211,Total threads: 8
Dashboard: http://10.4.8.113:42781/status,Memory: 30.74 GiB
Nanny: tls://10.4.8.113:42951,
Local directory: /dask-worker-space/worker-pb6y7_lx,Local directory: /dask-worker-space/worker-pb6y7_lx

0,1
Comm: tls://10.4.13.120:40655,Total threads: 8
Dashboard: http://10.4.13.120:41377/status,Memory: 30.74 GiB
Nanny: tls://10.4.13.120:39653,
Local directory: /dask-worker-space/worker-w4am8bmo,Local directory: /dask-worker-space/worker-w4am8bmo

0,1
Comm: tls://10.4.0.194:44389,Total threads: 8
Dashboard: http://10.4.0.194:44183/status,Memory: 30.74 GiB
Nanny: tls://10.4.0.194:40357,
Local directory: /dask-worker-space/worker-x9x5qska,Local directory: /dask-worker-space/worker-x9x5qska

0,1
Comm: tls://10.4.9.31:35535,Total threads: 8
Dashboard: http://10.4.9.31:40939/status,Memory: 30.74 GiB
Nanny: tls://10.4.9.31:38903,
Local directory: /dask-worker-space/worker-y4l9x79m,Local directory: /dask-worker-space/worker-y4l9x79m

0,1
Comm: tls://10.4.15.146:43065,Total threads: 8
Dashboard: http://10.4.15.146:33369/status,Memory: 30.74 GiB
Nanny: tls://10.4.15.146:42759,
Local directory: /dask-worker-space/worker-van0xhj7,Local directory: /dask-worker-space/worker-van0xhj7

0,1
Comm: tls://10.4.1.114:44731,Total threads: 8
Dashboard: http://10.4.1.114:37629/status,Memory: 30.74 GiB
Nanny: tls://10.4.1.114:40451,
Local directory: /dask-worker-space/worker-vqvan25n,Local directory: /dask-worker-space/worker-vqvan25n

0,1
Comm: tls://10.4.13.251:34019,Total threads: 8
Dashboard: http://10.4.13.251:40111/status,Memory: 30.74 GiB
Nanny: tls://10.4.13.251:41135,
Local directory: /dask-worker-space/worker-p0h9qvlt,Local directory: /dask-worker-space/worker-p0h9qvlt

0,1
Comm: tls://10.4.1.0:41935,Total threads: 8
Dashboard: http://10.4.1.0:46179/status,Memory: 30.74 GiB
Nanny: tls://10.4.1.0:33669,
Local directory: /dask-worker-space/worker-em6tg9vm,Local directory: /dask-worker-space/worker-em6tg9vm

0,1
Comm: tls://10.4.0.177:36579,Total threads: 8
Dashboard: http://10.4.0.177:43203/status,Memory: 30.74 GiB
Nanny: tls://10.4.0.177:44559,
Local directory: /dask-worker-space/worker-a6rv72oj,Local directory: /dask-worker-space/worker-a6rv72oj

0,1
Comm: tls://10.4.15.34:41719,Total threads: 8
Dashboard: http://10.4.15.34:38079/status,Memory: 30.74 GiB
Nanny: tls://10.4.15.34:46695,
Local directory: /dask-worker-space/worker-lav1kyjc,Local directory: /dask-worker-space/worker-lav1kyjc

0,1
Comm: tls://10.4.12.58:39807,Total threads: 8
Dashboard: http://10.4.12.58:42651/status,Memory: 30.74 GiB
Nanny: tls://10.4.12.58:38843,
Local directory: /dask-worker-space/worker-v40wpy6h,Local directory: /dask-worker-space/worker-v40wpy6h

0,1
Comm: tls://10.4.2.44:44925,Total threads: 8
Dashboard: http://10.4.2.44:42073/status,Memory: 30.74 GiB
Nanny: tls://10.4.2.44:45179,
Local directory: /dask-worker-space/worker-4zu8kzq8,Local directory: /dask-worker-space/worker-4zu8kzq8

0,1
Comm: tls://10.4.0.150:37865,Total threads: 8
Dashboard: http://10.4.0.150:34423/status,Memory: 30.74 GiB
Nanny: tls://10.4.0.150:44317,
Local directory: /dask-worker-space/worker-sr6pngkx,Local directory: /dask-worker-space/worker-sr6pngkx

0,1
Comm: tls://10.4.5.223:42287,Total threads: 8
Dashboard: http://10.4.5.223:46241/status,Memory: 30.74 GiB
Nanny: tls://10.4.5.223:37529,
Local directory: /dask-worker-space/worker-mk5wm052,Local directory: /dask-worker-space/worker-mk5wm052

0,1
Comm: tls://10.4.0.67:42499,Total threads: 8
Dashboard: http://10.4.0.67:46627/status,Memory: 30.74 GiB
Nanny: tls://10.4.0.67:46605,
Local directory: /dask-worker-space/worker-b1lf9trm,Local directory: /dask-worker-space/worker-b1lf9trm

0,1
Comm: tls://10.4.8.26:44221,Total threads: 8
Dashboard: http://10.4.8.26:40509/status,Memory: 30.74 GiB
Nanny: tls://10.4.8.26:43135,
Local directory: /dask-worker-space/worker-dvidaade,Local directory: /dask-worker-space/worker-dvidaade

0,1
Comm: tls://10.4.1.64:40263,Total threads: 8
Dashboard: http://10.4.1.64:46227/status,Memory: 30.74 GiB
Nanny: tls://10.4.1.64:34591,
Local directory: /dask-worker-space/worker-l5thxdeu,Local directory: /dask-worker-space/worker-l5thxdeu

0,1
Comm: tls://10.4.3.119:40731,Total threads: 8
Dashboard: http://10.4.3.119:42653/status,Memory: 30.74 GiB
Nanny: tls://10.4.3.119:45369,
Local directory: /dask-worker-space/worker-uhlacvu_,Local directory: /dask-worker-space/worker-uhlacvu_

0,1
Comm: tls://10.4.12.65:42521,Total threads: 8
Dashboard: http://10.4.12.65:34537/status,Memory: 30.74 GiB
Nanny: tls://10.4.12.65:33287,
Local directory: /dask-worker-space/worker-mcv2blco,Local directory: /dask-worker-space/worker-mcv2blco

0,1
Comm: tls://10.4.14.128:46217,Total threads: 8
Dashboard: http://10.4.14.128:42057/status,Memory: 30.74 GiB
Nanny: tls://10.4.14.128:38751,
Local directory: /dask-worker-space/worker-6rflt9rq,Local directory: /dask-worker-space/worker-6rflt9rq

0,1
Comm: tls://10.4.12.152:46643,Total threads: 8
Dashboard: http://10.4.12.152:36689/status,Memory: 30.74 GiB
Nanny: tls://10.4.12.152:38425,
Local directory: /dask-worker-space/worker-pl_wv5j4,Local directory: /dask-worker-space/worker-pl_wv5j4


In [5]:
cluster.scale(10)

## 2. Load Clean Twitter Data

The cleaned dataset contains ~6 million tweets = ~650MB.

Let's load in with Dask to save time then load only the tweet contents into local memory.

- `df_full` contains the unlemmatized unique tweets (works with sklearn hopefully)
- `df` contains the lemmatized tweets (works with Gensim)

In [6]:
# read in cleaned, full-text data (only tweet_text column)
df_full = dd.read_parquet(
    's3://coiled-datasets/arabic-tweets/unique_tweets_whole.parquet',
    columns=['tweet_text', 'hashtags', 'is_retweet', 'retweet_tweetid'],
    engine='pyarrow',
).repartition(npartitions=4)

In [7]:
df_full.head()

Unnamed: 0,tweet_text,hashtags,is_retweet,retweet_tweetid
0,السلام عليكم ورحمة الله وبركاته مرحبا عملاء م...,,True,9.986493e+17
1,للتأجير لبيع النطيطات زحاليق مائيه صابونية مل...,"[للتأجير, لبيع النطيطات, زحاليق مائيه صابونية,...",True,9.996373e+17
2,مظلات وسواتر آفاق الرياض مظلات استراحات مظلات...,"[مظلات, آفاق الرياض, مظلات استراحات, مظلات مسا...",True,9.993939e+17
3,فيديو شاهد مواطن يوثق بالفيديو كميات كبيرة من...,,True,9.983516e+17
4,أستغفر الله العظيم وأتوب إليه,,False,


In [8]:
df_full

Unnamed: 0_level_0,tweet_text,hashtags,is_retweet,retweet_tweetid
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,object,object,bool,float64
,...,...,...,...
,...,...,...,...
,...,...,...,...
,...,...,...,...


In [4]:
# load in cleaned AND lemmatized data
df = dd.read_parquet(
    's3://coiled-datasets/arabic-tweets/arabic_twitter_clean.parquet',
)

In [5]:
df.head()

Unnamed: 0,tweet_text,hashtags,is_retweet,retweet_tweetid,timestamp_first,user_reference_id
0,"[سَلام_1, عَلَى_1, رَحْمَة_1, اللَّه_1, بَرَكَ...",,True,9.986493e+17,2018-05-25 00:15:00,58
1,"[تَأْجِير_1, بَيْع_1, النطيطات_0, زحاليق_0, ما...","[للتأجير, لبيع النطيطات, زحاليق مائيه صابونية,...",True,9.996373e+17,2018-04-17 12:22:00,0
2,"[مِظَلَّة_1, ساتِر_1, أُفُق_1, رِياض_1, مِظَلّ...","[مظلات, آفاق الرياض, مظلات استراحات, مظلات مسا...",True,9.993939e+17,2018-05-25 00:15:00,58
3,"[فِيدْيُو_1, شاهَد_1, مُواطِن_1, وَثِق-ia_1, ف...",,True,9.983516e+17,2018-05-25 13:06:00,1
4,"[ٱِسْتَغْفَر_1, اللَّه_1, عَظِيم_2, تاب-u_1]",,False,,2014-04-12 03:34:00,657


In [6]:
# get only tweet content
docs = df.tweet_text

This Dask Series is about 70MB in size. That's small enough to load into local memory and continue working locally from there.

In [8]:
# load into local memory
docs = docs.compute()

In [10]:
type(docs)

pandas.core.series.Series

In [11]:
docs

0          [سَلام_1, عَلَى_1, رَحْمَة_1, اللَّه_1, بَرَكَ...
1          [تَأْجِير_1, بَيْع_1, النطيطات_0, زحاليق_0, ما...
2          [مِظَلَّة_1, ساتِر_1, أُفُق_1, رِياض_1, مِظَلّ...
3          [فِيدْيُو_1, شاهَد_1, مُواطِن_1, وَثِق-ia_1, ف...
4               [ٱِسْتَغْفَر_1, اللَّه_1, عَظِيم_2, تاب-u_1]
                                 ...                        
6145778    [أنا_1_0, قلب_3_0, تركي_1_0, ال_1_0, شيخ_2_0, ...
6145779    [أخت_1_0, جوز_2_0, شافه_1_0, طالع_1_0, مسجد_1_...
6145780    [رمضان_1_0, كريم_1_0, الدحيل_0_0, عين_1_0, قدس...
6145781    [رسول_1_0, الله_1_0, جمعة_1_0, ساعة_1_0, وافق_...
6145782    [إنجاز_2_0, شخص_1_0, عضو_1_0, شنو_0_0, إنجاز_2...
Name: tweet_text, Length: 6145783, dtype: object

## 3. Run Arabic Preprocessing with Dask Bags

In [None]:
tweets = df_full['tweet_text'].compute()

### Remove Repeating Characters

In [None]:
# define function
def remove_repeating_char(text):
    return re.sub("(.)\\1{2,}", "\\1", text)

### Orthographic Normalization

In [11]:
def ortho_normalize(text):
    text = normalize_alef_maksura_ar(text)
    text = normalize_alef_ar(text)
    text = normalize_teh_marbuta_ar(text)
    return text

## X. Preprocessing with Dask Bags (not working)

In [35]:
# cast tweet texts into a Dask bag
bag = df_full['tweet_text'].to_bag(index=False)

In [19]:
# get number of items in bag
bag.count().compute()

6145783

In [36]:
t = bag.take(1)

In [37]:
t

(' السلام عليكم ورحمة الله وبركاته مرحبا عملاء متجر ون واي وكل عام وانتم بخير نعتذر لكم عن تاخرنا في العودة بسبب بعض الظر ',)

In [38]:
type(t)

tuple

In [39]:
t[0]

' السلام عليكم ورحمة الله وبركاته مرحبا عملاء متجر ون واي وكل عام وانتم بخير نعتذر لكم عن تاخرنا في العودة بسبب بعض الظر '

In [40]:
# extract value from tuple
def get_tweets(element):
    return element[0]

In [41]:
tweets = bag.map(get_tweets)

In [42]:
tweets.take(1)

(' ',)

In [27]:
t[0]

' السلام عليكم ورحمة الله وبركاته مرحبا عملاء متجر ون واي وكل عام وانتم بخير نعتذر لكم عن تاخرنا في العودة بسبب بعض الظر '

In [25]:
type(t)

tuple

I think there's an issue with how the values are cast into the Bag. Seems like they're being cast as tuples when I actually just want the value. Is that what's tripping up the `bag.apply` and killing workers?

### Remove Repeating Characters

In [44]:
# remove repeating characters if character repeats more than once
def remove_repeating_char(text):
    return re.sub("(.)\\1{2,}", "\\1", text)

In [46]:
# apply regex function to contents of Dask bag
bag2 = db.map(remove_repeating_char, bag)

dask.bag<remove_repeating_char, npartitions=4>

In [47]:
bag2.take(1)

(' السلام عليكم ورحمة الله وبركاته مرحبا عملاء متجر ون واي وكل عام وانتم بخير نعتذر لكم عن تاخرنا في العودة بسبب بعض الظر ',)

## 4. Tf-Idf Vectorizer with Sklearn

## 5. LDA with Sklearn

## Gensim

### Create BOW Dictionary with Gensim

In [17]:
import gensim

In [18]:
%%time
# create BOW dictionary
dictionary = gensim.corpora.Dictionary(docs)

CPU times: user 49.2 s, sys: 788 ms, total: 50 s
Wall time: 50.4 s


In [19]:
# filter extreme cases out of dictionary
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [20]:
%%time
# map docs to bag of words
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client


CPU times: user 37.4 s, sys: 553 ms, total: 38 s
Wall time: 38 s


In [22]:
# inspect
bow_doc_300 = bow_corpus[300]

for i in range(len(bow_doc_300)):
    print("Word {} (\"{}\") appears {} time(s).".format(bow_doc_300[i][0], 
                                                     dictionary[bow_doc_300[i][0]],
                                                     bow_doc_300[i][1]))

Word 175 ("تويتر_0") appears 1 time(s).
Word 253 ("بَرْنامَج_1") appears 2 time(s).
Word 912 ("تَخَلُّص_1") appears 1 time(s).
Word 1113 ("تَنْحِيف_1") appears 1 time(s).
Word 1242 ("حَقِيقِيّ_1") appears 1 time(s).
Word 1243 ("مُسْتَعِير_1") appears 1 time(s).
Word 1244 ("ٱِسْم_1") appears 1 time(s).
Word 1331 ("وَزْن_1") appears 1 time(s).
Word 1348 ("كِيلُو_1") appears 1 time(s).
Word 1676 ("الكورس_0") appears 1 time(s).
Word 1677 ("تَثْبِيت_1") appears 1 time(s).
Word 1680 ("وَرْس_1") appears 1 time(s).


### Run LDA with Gensim

In [23]:
%%time
lda_model_5 = gensim.models.LdaMulticore(bow_corpus, 
                                         num_topics=5, 
                                         id2word=dictionary, 
                                         passes=2, 
                                         workers=7,
                                         random_state=21)

CPU times: user 1min 10s, sys: 29.8 s, total: 1min 40s
Wall time: 2min 29s


### Visualize LDA with pyLDAvis

In [25]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [26]:
%%time
# prepare visualisation data
vis_data_LDA_5 = gensimvis.prepare(lda_model_5, bow_corpus, dictionary)

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


CPU times: user 3min 57s, sys: 2.55 s, total: 4min
Wall time: 4min 3s


In [29]:
# create filepath to save HTML visualisation
filepath = "/Users/rpelgrim/Desktop/LDA_5.html"

In [30]:
# save visualisation to HTML in repo
pyLDAvis.save_html(vis_data_LDA_5, filepath)

Excellent. This works.

BUT I actually really want to get the SKLEARN version working so I can use Dask as a backend.

## SKlearn

### Vectorize

Vectorizing isn't possible at the moment because the cleaned dataframe contains numpy arrays of the lemmas. The `Vectorizers` expect a string per document. 

**TO DO: Try loading in the untokenized, cleaned tweet texts and Vectorizing those directly. NO >> Arabic-specific preprocessing to do. OR find a way to write custom preprocessor and tokenizers.**

To do that I'll probably have to:
- input custom preprocessors/tokenizers.
- input the list of stop words (we have it somewhere)
- 

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
CountVec = CountVectorizer(ngram_range=(2,2))
Count_data = CountVec.fit_transform(docs)

## Dask-ML

The array of documents `X` is only 47MB. Doesn't make sense to use Dask-ML for this. Instead use `sklearn` tf-idf vectorizer and then train LDA in parallel with Dask backend.

In [10]:
# vectorize contents
from dask_ml.feature_extraction.text import HashingVectorizer
from dask_ml.feature_extraction.text import CountVectorizer

### Hashing Vectorizer

In [11]:
vect = HashingVectorizer(lowercase=False)

In [12]:
X = df_full['tweet_text'].to_dask_array(lengths=True)

In [18]:
X

Unnamed: 0,Array,Chunk
Bytes,46.89 MiB,46.89 MiB
Shape,"(6145783,)","(6145783,)"
Count,3 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 46.89 MiB 46.89 MiB Shape (6145783,) (6145783,) Count 3 Tasks 1 Chunks Type object numpy.ndarray",6145783  1,

Unnamed: 0,Array,Chunk
Bytes,46.89 MiB,46.89 MiB
Shape,"(6145783,)","(6145783,)"
Count,3 Tasks,1 Chunks
Type,object,numpy.ndarray


In [17]:
X[1].compute()

' للتأجير لبيع النطيطات زحاليق مائيه صابونية ملاعب صابونيه زحاليق في جدة ألعاب أولاد بنات بالرياض '

In [34]:
docs_vect = vect.fit_transform(docs)

In [35]:
docs_vect.compute_chunk_sizes()

TypeError: cannot use a string pattern on a bytes-like object

In [None]:
docs_local = docs_vect.compute().toarray()