In [None]:
import dask.dataframe as dd
import spacy
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from tqdm import tqdm
from dask.diagnostics import ProgressBar
from dask.callbacks import Callback
import re


# Load the small English spaCy pipeline once at notebook start; clean_text()
# reuses this shared `nlp` object for every row it processes.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

In [None]:
class ProgressBar(Callback):
    """Dask scheduler callback that reports task progress through a tqdm bar.

    NOTE: this class reuses the name ``ProgressBar`` and therefore shadows the
    ``dask.diagnostics.ProgressBar`` imported at the top of the notebook; every
    ``with ProgressBar():`` below this cell uses this tqdm-based version.

    Parameters
    ----------
    desc : str, optional
        Label shown in front of the tqdm bar (empty by default).
    """

    def __init__(self, desc=""):
        self.desc = desc
        # Created lazily in _start_state; None means "no bar is currently open".
        self._tqdm = None

    def _start_state(self, dsk, state):
        """Open the bar, sized to the number of tasks known to the scheduler."""
        total_tasks = sum(
            len(state[key]) for key in ('ready', 'waiting', 'running', 'finished')
        )
        self._tqdm = tqdm(total=total_tasks, desc=self.desc)

    def _posttask(self, key, result, dsk, state, worker_id):
        """Advance the bar by one step after each task completes."""
        if self._tqdm is not None:
            self._tqdm.update(1)

    def _finish(self, dsk, state, errored):
        """Close the bar when the computation ends, whether or not it errored."""
        # Without close() the bar is never finalised and stays registered
        # with tqdm, which garbles the output of any bar created afterwards.
        if self._tqdm is not None:
            self._tqdm.close()
            self._tqdm = None

In [None]:
def clean_text(text):
    """Strip non-letter characters and stop words from a single text value.

    The value is stringified, every character that is not a lowercase ASCII
    letter or whitespace is replaced by a space, and the remaining tokens are
    filtered with the module-level spaCy pipeline ``nlp``.

    Because the character class is ``[^a-z\\s]``, uppercase letters are removed
    as well; callers are expected to lowercase the text first (the notebook
    does so when it builds the ``cleaned_text`` column).

    Parameters
    ----------
    text : Any
        Raw cell value; converted with ``str()`` before processing.

    Returns
    -------
    str or None
        Space-joined non-stop-word tokens, or ``None`` when the stringified
        input is ``'nan'`` or ``'None'`` (i.e. a missing value).
    """
    text = str(text)
    # str() turns NaN / None into these literal strings; treat them as missing.
    if text in ('nan', 'None'):
        return None

    # Must be a regex substitution: str.replace() would search for the pattern
    # as literal text, never find it, and leave digits/punctuation untouched.
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Collapse the whitespace runs left by the substitution so the tokenizer
    # does not emit whitespace-only tokens into the result.
    text = " ".join(text.split())

    doc = nlp(text)
    return " ".join(token.text for token in doc if not token.is_stop)

In [None]:
# Lazily read the invoice CSV export into a Dask DataFrame and preview it.
invoice_csv_path = '/Users/roopakkprajapat/Downloads/invoice.csv'
df = dd.read_csv(invoice_csv_path)
df.head(n=10)

In [None]:
def run_parallely(x, column, fun):
    """Apply ``fun`` element-wise to one column of a DataFrame partition.

    Intended as the per-partition function for ``map_partitions``.

    Parameters
    ----------
    x : DataFrame
        The partition to transform; it is left unmodified.
    column : str
        Name of the column whose values are passed through ``fun``.
    fun : callable
        Function applied to every value of ``x[column]``.

    Returns
    -------
    DataFrame
        A copy of ``x`` in which ``column`` holds the transformed values.
    """
    # Work on a copy: assigning into the partition that was passed in mutates
    # the caller's data and can trigger SettingWithCopyWarning.
    result = x.copy()
    result[column] = result[column].apply(fun)
    return result

In [None]:
# Discard rows that have no Description, then add a lower-cased copy of that
# column as `cleaned_text` for the cleaning step that follows.
described_rows = df.dropna(subset=['Description'])
lowered_description = described_rows['Description'].str.lower()
df = described_rows.assign(cleaned_text=lowered_description)


In [None]:
# Preview the first rows, now including the new `cleaned_text` column.
df.head(n=10)

In [None]:
def _clean_partition(partition):
    """Run clean_text over the `cleaned_text` column of one partition."""
    return run_parallely(partition, 'cleaned_text', clean_text)


# Schedule the cleaning lazily on every partition; the output schema is the
# same as the input, so the current frame doubles as `meta`.
df = df.map_partitions(_clean_partition, meta=df)

In [None]:
# Execute the lazy cleaning pipeline and persist the result as Parquet.
# to_parquet() runs the graph itself (compute=True is the default) and returns
# None, so chaining .compute() onto it raised AttributeError after the write.
with ProgressBar():
    df.to_parquet('claned_text.parquet')

In [None]:
# Reload the cleaned rows from the Parquet dataset written in the previous
# step, so later cells do not re-run the spaCy cleaning.
cleaned_parquet_path = 'claned_text.parquet'
df = dd.read_parquet(cleaned_parquet_path)
df.head(n=10)

In [None]:
# Materialise the cleaned column and join all rows into one
# space-separated string for the word cloud.
with ProgressBar():
    cleaned_series = df['cleaned_text'].compute()
    text = cleaned_series.str.cat(sep=' ')

In [None]:
# Build a word cloud (default settings) from the concatenated cleaned text.
cloud_builder = WordCloud()
wordcloud = cloud_builder.generate(text)

# Render the cloud as an image with the axes hidden.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Materialise the GLDesc column and join all rows into one
# space-separated string.
with ProgressBar():
    gl_desc_series = df['GLDesc'].compute()
    text = gl_desc_series.str.cat(sep=' ')

# Build a word cloud (default settings) from the GLDesc text.
cloud_builder = WordCloud()
wordcloud = cloud_builder.generate(text)

# Render the cloud as an image with the axes hidden.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Lazily read the purchase-order CSV export and give its six columns
# short working names before previewing it.
po_csv_path = '/Users/roopakkprajapat/Downloads/po.csv'
po_column_names = ['Client','Desc','1','2','3','4']
df = dd.read_csv(po_csv_path)
df.columns = po_column_names
df.head(n=10)

In [None]:
# Materialise the Desc column and join all rows into one
# space-separated string for the word cloud.
with ProgressBar():
    desc_series = df['Desc'].compute()
    text = desc_series.str.cat(sep=' ')

In [None]:
# Build a word cloud (default settings) from the concatenated Desc text.
cloud_builder = WordCloud()
wordcloud = cloud_builder.generate(text)

# Render the cloud as an image with the axes hidden.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Lazily read the payments CSV export. Column dtypes are pinned up front:
# Client/Desc as str and the numbered columns '1'..'5' as object.
# Unlike the PO export, the column names are not overridden here.
payments_csv_path = '/Users/roopakkprajapat/Downloads/payments.csv'
payments_dtypes = {'Client':'str', 'Desc':'str'}
payments_dtypes.update({str(col_no): 'object' for col_no in range(1, 6)})
df = dd.read_csv(payments_csv_path, dtype=payments_dtypes)

df.head(n=10)

In [None]:
# Materialise the Desc column and join all rows into one
# space-separated string.
with ProgressBar():
    desc_series = df['Desc'].compute()
    text = desc_series.str.cat(sep=' ')

# Build a word cloud (default settings) from the Desc text.
cloud_builder = WordCloud()
wordcloud = cloud_builder.generate(text)

# Render the cloud as an image with the axes hidden.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()