### 26 Lang gram frequency

In [19]:
import findspark
findspark.init("/home/pc/TestJupyter/opt/spark-3.3.0/spark-3.3.0-bin-hadoop3")
from pyspark.ml.feature import NGram
import pyspark.sql.functions as F
import pyspark
import random
import os
from pyspark.sql import SparkSession
from pyspark import SparkConf


# Environment for PySpark: which Python interpreter to use, and the extra
# spark-submit arguments that pull in the spark-avro package.
# Both must be set before the session below is created.
os.environ.update({
    "PYSPARK_PYTHON": "/home/pc/TestJupyter/opt/spark-3.3.0/venv-spark/bin/python39",
    "PYSPARK_SUBMIT_ARGS": "--packages org.apache.spark:spark-avro_2.12:3.3.0  pyspark-shell",
})

# Create (or reuse) a local, Hive-enabled session shared by the whole notebook.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("hive hbase")
    .config("hive.metastore.uris", "thrift://g2.bigtop.it:9083")
    .config("spark.driver.memory", "70g")
    .enableHiveSupport()
    .getOrCreate()
)

In [20]:
def read_avro(path):
    """Load the Avro dataset stored at ``path`` using the notebook's ``spark`` session.

    Args:
        path: Location of the Avro data to load.

    Returns:
        The DataFrame produced by the Avro reader.
    """
    avro_reader = spark.read.format("avro")
    return avro_reader.load(path)

def write_avro(df, path, mode="errorifexists"):
    """Write ``df`` to ``path`` in Avro format.

    Args:
        df: DataFrame to persist.
        path: Destination directory for the Avro files.
        mode: Save mode passed to ``DataFrameWriter.mode``. The default,
            ``"errorifexists"``, is Spark's own default, so existing
            two-argument calls behave exactly as before. Pass ``"overwrite"``
            to replace an existing directory instead of deleting it by hand.

    Returns:
        Whatever ``DataFrameWriter.save`` returns (``None`` in PySpark).
    """
    return df.write.format("avro").mode(mode).save(path)

In [21]:
# Load the Avro dataset at output/25_CENTER_DETERMINED as the working frame.
df = read_avro("output/25_CENTER_DETERMINED")

In [22]:
# Count how many rows share each distinct lang_gram value (column: `count`).
langFreq = df.groupBy("lang_gram").count()

In [23]:
# Preview the per-lang_gram counts; show() prints only the first 20 rows.
langFreq.show()



+----------------+------+
|       lang_gram| count|
+----------------+------+
|           ms zh|     8|
|     zh ms ms en|     3|
|           en en|503440|
|  ms ms ms ms en| 44840|
|        ms ms ms|394027|
|     ms en en ms| 12590|
|  ms en en ms en|  3969|
|  en en ms en ms|  8555|
|     zh ms en zh|    14|
|   zh zh oov oov|    19|
|en en oov oov ms|     5|
| zh zh oov zh zh|    11|
|  ms ms zh zh zh|    11|
|           en ms|237903|
|  zh ms en ms zh|     4|
|  zh ms zh en en|     3|
|     zh ms zh zh|   156|
|           zh ms|   272|
|     en en zh zh|    98|
|     en ms en en| 41795|
+----------------+------+
only showing top 20 rows



                                                                                

In [24]:
# withColumnRenamed returns a new DataFrame and leaves langFreq untouched, so
# the result must be assigned for the rename to reach the later write.
langFreq = langFreq.withColumnRenamed("count", "lang_gram_f")
langFreq

DataFrame[lang_gram: string, lang_gram_f: bigint]

In [9]:
import shutil

# Replace any previous export of the lang_gram frequencies. rmtree raises
# FileNotFoundError when the directory is missing (e.g. on a first run), so
# only delete it when it is actually there.
lang_freq_dir = "output/26_LANG_G_FREQ"
if os.path.exists(lang_freq_dir):
    shutil.rmtree(lang_freq_dir)
write_avro(langFreq, lang_freq_dir)

                                                                                

In [None]:
## Join the lang_gram frequencies back onto the center table

In [27]:
# Reload output/25_CENTER_DETERMINED; it is the left side of the join below.
df = read_avro("output/25_CENTER_DETERMINED")

In [26]:
# Load the saved lang_gram frequencies and expose the `count` column under the
# name `lang_gram_f`.
lf =  read_avro("output/26_LANG_G_FREQ").withColumnRenamed("count", "lang_gram_f")

In [23]:
# Display lf's schema to confirm the renamed column.
lf

DataFrame[lang_gram: string, lang_gram_f: bigint]

In [28]:
# Attach lang_gram_f to every row by matching on lang_gram; lf's copy of the
# key column is dropped so only df's lang_gram remains.
# NOTE(review): an equality join does not match null keys, so rows with a null
# lang_gram (if any exist) would be lost here; confirm none are expected.
lang_key_matches = df.lang_gram == lf.lang_gram
mergeLangF = df.join(lf, lang_key_matches).drop(lf.lang_gram)

In [29]:
# Display the joined schema; lang_gram_f should appear as the last column.
mergeLangF

DataFrame[sentence_id: bigint, original: string, tag: array<string>, language: string, token: array<string>, chinese_tag: array<string>, language_pattern: array<string>, pos: int, token_gram: string, tag_gram: string, lang_gram: string, gram_type: string, containsKey: boolean, middle_key: boolean, lang_gram_f: bigint]

In [14]:
# Replace any previous export. rmtree raises FileNotFoundError when the
# directory is missing (e.g. on a first run), so only delete it when present.
with_langf_dir = "output/26_WITH_LANGF"
if os.path.exists(with_langf_dir):
    shutil.rmtree(with_langf_dir)
write_avro(mergeLangF, with_langf_dir)

                                                                                

### 27 Token gram frequency

In [30]:
# Reload the frame that carries lang_gram_f, spread it over 54 partitions,
# then count how many rows share each distinct token_gram.
df = read_avro("output/26_WITH_LANGF").repartition(54)
tokenFreq = df.groupBy("token_gram").count()

In [5]:
# Rename the aggregate column from `count` to `token_gram_f` before it is
# written out and joined back.
tokenFreq = tokenFreq.withColumnRenamed("count", "token_gram_f")

In [6]:
# Preview the per-token_gram counts; show() prints only the first 20 rows.
tokenFreq.show()



+----------------------+------------+
|            token_gram|token_gram_f|
+----------------------+------------+
|        seller so much|         217|
|               好 货品|         364|
|  seller courier fo...|           5|
|   不错 good 不错 good|          36|
|               不错 不|          28|
|  seller fast delivery|        8894|
|  seller pastikan i...|           1|
|               不错 跟|          15|
|  seller for being ...|           2|
|  好 太棒了 我 很 喜欢|           3|
|  cantik la tak rug...|           4|
|  好 慢 东西 好吃 good|           1|
|             包装 完好|         111|
|  cantik warnanya t...|           3|
|  包装 很 好 值得 购买|          38|
|  recommend this pr...|          37|
|包装 很 好 奈斯 店主人|           4|
|  喜欢 青稞 和 葡萄 味|           1|
|        seller is just|          10|
|  seller received w...|          98|
+----------------------+------------+
only showing top 20 rows



                                                                                

In [8]:
import shutil

# Replace any previous export of the token_gram frequencies. rmtree raises
# FileNotFoundError when the directory is missing, so guard the delete.
token_freq_dir = "output/27_TOKEN_F"
if os.path.exists(token_freq_dir):
    shutil.rmtree(token_freq_dir)
write_avro(tokenFreq, token_freq_dir)

                                                                                

In [32]:
# Load the saved token_gram frequencies, spread over 54 partitions.
tokF = read_avro("output/27_TOKEN_F").repartition(54)

In [33]:
# Attach token_gram_f to every row by matching on token_gram; tokF's copy of
# the key column is dropped so only df's token_gram remains.
token_key_matches = df.token_gram == tokF.token_gram
withTokF = df.join(tokF, token_key_matches).drop(tokF.token_gram)

In [34]:
# Display the joined schema; token_gram_f should appear as the last column.
withTokF

DataFrame[sentence_id: bigint, original: string, tag: array<string>, language: string, token: array<string>, chinese_tag: array<string>, language_pattern: array<string>, pos: int, token_gram: string, tag_gram: string, lang_gram: string, gram_type: string, containsKey: boolean, middle_key: boolean, lang_gram_f: bigint, token_gram_f: bigint]

In [11]:
# Clear any previous 27_TokFreq export so the following write can recreate it.
# rmtree raises FileNotFoundError when the directory is missing, so skip the
# delete in that case.
tok_freq_dir = "output/27_TokFreq"
if os.path.exists(tok_freq_dir):
    shutil.rmtree(tok_freq_dir)

In [12]:
# Persist the rows enriched with token_gram_f; the next section reads this path.
write_avro(withTokF, "output/27_TokFreq")

                                                                                

### 28 Tag gram frequency

In [35]:
# Reload the frame that carries token_gram_f, spread over 54 partitions.
df = read_avro("output/27_TokFreq").repartition(54)

In [36]:
# Number of rows per distinct tag_gram, exposed under the name tag_gram_f.
tag_gram_counts = df.groupBy("tag_gram").count()
tagGramFreq = tag_gram_counts.withColumnRenamed("count", "tag_gram_f")

In [5]:
import shutil

# Every other write cell clears a previous export first; without that, a
# re-run fails because the target path already exists. Guard the delete so a
# first run (no directory yet) also works.
# NOTE(review): unlike most other outputs this path has no "output/" prefix;
# it is kept as-is because the cell that reads it back uses the same path.
tag_freq_dir = "28_TagGramF"
if os.path.exists(tag_freq_dir):
    shutil.rmtree(tag_freq_dir)
write_avro(tagGramFreq, tag_freq_dir)

                                                                                

In [38]:
# Load the saved tag_gram frequencies, spread over 54 partitions.
tf = read_avro("28_TagGramF").repartition(54)

In [42]:
# Attach tag_gram_f to every row by matching on tag_gram; tf's copy of the key
# column is dropped so only df's tag_gram remains.
tag_key_matches = df.tag_gram == tf.tag_gram
withtagF = df.join(tf, tag_key_matches).drop(tf.tag_gram)

In [43]:
# Display the joined schema; tag_gram_f should appear as the last column.
withtagF

DataFrame[sentence_id: bigint, original: string, tag: array<string>, language: string, token: array<string>, chinese_tag: array<string>, language_pattern: array<string>, pos: int, token_gram: string, tag_gram: string, lang_gram: string, gram_type: string, containsKey: boolean, middle_key: boolean, lang_gram_f: bigint, token_gram_f: bigint, tag_gram_f: bigint]

In [44]:
import shutil

# Clear any previous export so the following write can recreate it. rmtree
# raises FileNotFoundError when the directory is missing, so skip the delete
# in that case.
all_freq_dir = "output/28_ALL_FREQ_INCLUDED"
if os.path.exists(all_freq_dir):
    shutil.rmtree(all_freq_dir)

In [45]:
# Persist the frame that now carries lang_gram_f, token_gram_f and tag_gram_f.
write_avro(withtagF, "output/28_ALL_FREQ_INCLUDED")

                                                                                

In [46]:
# Read the export back and preview it to check what was actually written.
read_avro("output/28_ALL_FREQ_INCLUDED").show()

+------------+--------------------+--------------------+--------+--------------------+-----------+--------------------+---+------------------+-----------------+---------+---------+-----------+----------+-----------+------------+----------+
| sentence_id|            original|                 tag|language|               token|chinese_tag|    language_pattern|pos|        token_gram|         tag_gram|lang_gram|gram_type|containsKey|middle_key|lang_gram_f|token_gram_f|tag_gram_f|
+------------+--------------------+--------------------+--------+--------------------+-----------+--------------------+---+------------------+-----------------+---------+---------+-----------+----------+-----------+------------+----------+
| 42949713335|barang sampai dal...|[NOUN, VERB, ADP,...|   malay|[barang, sampai, ...|       null|[ms, ms, ms, ms, ...|  0|     barang sampai|        NOUN VERB|    ms ms|        2|       true|     false|     486008|       61098|    118346|
|        7613|thankyouuuuu bara...|[NOUN

In [47]:
# Reload the fully enriched frame, spread over 800 partitions, for the
# normalization steps that follow.
df = read_avro("output/28_ALL_FREQ_INCLUDED").repartition(800)

In [48]:
# Display the schema of the enriched frame.
df

DataFrame[sentence_id: bigint, original: string, tag: array<string>, language: string, token: array<string>, chinese_tag: array<string>, language_pattern: array<string>, pos: int, token_gram: string, tag_gram: string, lang_gram: string, gram_type: string, containsKey: boolean, middle_key: boolean, lang_gram_f: bigint, token_gram_f: bigint, tag_gram_f: bigint]

### 29 Table normalization - TABLE 1 normal data

In [51]:
# Columns that make up TABLE 1 (one row per sentence).
sentence_columns = [
    "sentence_id",
    "original",
    "tag",
    "language",
    "token",
    "chinese_tag",
    "language_pattern",
]

# Keep a single row per sentence_id, project the sentence columns, and spread
# the result over 800 partitions.
originalSentence = (
    df.dropDuplicates(["sentence_id"])
    .select(*sentence_columns)
    .repartition(800)
)

In [52]:
# Preview the sentence table.
originalSentence.show()



+------------+----------------------+--------------------+--------+----------------------------+------------------+--------------------+
| sentence_id|              original|                 tag|language|                       token|       chinese_tag|    language_pattern|
+------------+----------------------+--------------------+--------+----------------------------+------------------+--------------------+
| 42949689070|  ta seller recomme...|[NOUN, NOUN, VERB...| english|        [ta, seller, reco...|              null|[ms, en, en, ms, ...|
| 60129595600|  barang berkualiti...|[NOUN, ADJ, NOUN,...|   malay|        [barang, berkuali...|              null|[ms, ms, ms, ms, ...|
| 85899405706|  barang sampai den...|[NOUN, ADP, ADP, ...|   malay|        [barang, sampai, ...|              null|[ms, ms, ms, ms, ...|
|       34374|  nice packaging n ...|[PROPN, NOUN, CCO...| english|        [nice, packaging,...|              null|[en, en, ms, en, ...|
| 60129578321|  fast delivery and...|[PRO

                                                                                

In [53]:
# Replace any previous export of the sentence table. rmtree raises
# FileNotFoundError when the directory is missing, so guard the delete.
original_data_dir = "output/29_ORIGINAL_DATA"
if os.path.exists(original_data_dir):
    shutil.rmtree(original_data_dir)
write_avro(originalSentence, original_data_dir)

                                                                                

### 30 Table normalization - TABLE 2 gram-table

In [54]:
# Display the schema of the enriched frame that the gram table is built from.
df

DataFrame[sentence_id: bigint, original: string, tag: array<string>, language: string, token: array<string>, chinese_tag: array<string>, language_pattern: array<string>, pos: int, token_gram: string, tag_gram: string, lang_gram: string, gram_type: string, containsKey: boolean, middle_key: boolean, lang_gram_f: bigint, token_gram_f: bigint, tag_gram_f: bigint]

In [55]:
# Columns that make up TABLE 2: sentence_id and pos, then each gram with its
# frequency, followed by gram_type, middle_key and language.
gram_columns = [
    "sentence_id",
    "pos",
    "token_gram",
    "token_gram_f",
    "tag_gram",
    "tag_gram_f",
    "lang_gram",
    "lang_gram_f",
    "gram_type",
    "middle_key",
    "language",
]
gram_table = df.select(*gram_columns)

In [56]:
# Preview the gram table.
gram_table.show()



+------------+---+--------------------+------------+--------------------+----------+--------------+-----------+---------+----------+--------+
| sentence_id|pos|          token_gram|token_gram_f|            tag_gram|tag_gram_f|     lang_gram|lang_gram_f|gram_type|middle_key|language|
+------------+---+--------------------+------------+--------------------+----------+--------------+-----------+---------+----------+--------+
|120259128971|  8|         seller even|         914|            NOUN ADV|    149859|         en en|     503440|        2|     false| english|
| 51539636494|  9|     seller post out|        2586|      NOUN VERB NOUN|     45130|      en en en|     405635|        3|     false| english|
| 34359798809|  1|  seller baju selesa|          10|       NOUN NOUN ADJ|     37260|      en ms ms|     136178|        3|     false|   malay|
|137438966217|  4|             nice to|        4784|             ADJ ADP|     31361|         en en|     503440|        2|     false| english|
|11166

                                                                                

In [58]:
# Replace any previous export of the gram table. rmtree raises
# FileNotFoundError when the directory is missing, so guard the delete.
# NOTE(review): unlike most other outputs this path has no "output/" prefix;
# it is kept unchanged in case something else reads it. Confirm intent.
gram_table_dir = "30_GRAM_TABLE"
if os.path.exists(gram_table_dir):
    shutil.rmtree(gram_table_dir)
write_avro(gram_table, gram_table_dir)

                                                                                