In [1]:
# Initializing Spark: execute setupPySpark.py inside this notebook's namespace.
# NOTE(review): presumably this script is what defines `sc`, which the following cells use — confirm.
# compile() is given the real file name so that tracebacks raised by the setup script
# point at setupPySpark.py instead of the anonymous "<string>".
with open("setupPySpark.py", "r") as setup_file:
    exec(compile(setup_file.read(), "setupPySpark.py", "exec"))


In [2]:
# Spark session built on top of the existing Spark context `sc`
# NOTE(review): `sc` is not defined in this notebook — presumably it comes from setupPySpark.py; confirm
from pyspark.sql.session import SparkSession

spark = SparkSession(sparkContext = sc)

In [3]:
# SQL context bound to the same Spark context `sc`
from pyspark.sql import SQLContext

sqlContext = SQLContext(sparkContext = sc)

In [4]:
# Importing required functions
from util import read_file, read_folder
from pandas import Series, DataFrame
from util_spark import remove_stopwords_spark, detect_language_spark, flatten_list_of_tokens, spell_correct_tokens_spark, get_semantic_similarity_spark
from tokenization_spark import tokenize_sentence_nltk_spark
from pyspark.sql.functions import col
from pos_tagging_spark import run_treetagger_pos_tag_spark
from modeling_spark import run_word2vec_model_pyspark

In [5]:
# Reading the run configuration and the input file(s) it points to.
# in_file.cfg is read positionally, one value per line:
#   line 1: input path          -> in_file
#   line 2: input type          -> in_type
#   line 3: text column name    -> column
#   line 4: label column name   -> label
#   line 5: "file" selects single-file mode, anything else selects folder mode -> file_folder
#   line 6: patterns file path  -> patterns_file (only opened in folder mode)
with open("in_file.cfg") as cfg_file:  # context manager so the handle is always closed
    cfg_lines = cfg_file.read().split("\n")
if len(cfg_lines) < 6:
    raise ValueError("in_file.cfg must contain at least 6 lines, found %d" % len(cfg_lines))
in_file, in_type, column, label, file_folder, patterns_file = cfg_lines[:6]

# Not every branch below assigns these; resetting them keeps values from an earlier
# run of this cell from leaking into the current one.
labels = None
patterns = None

if file_folder == "file":
    strings = read_file(in_file, in_type = in_type)
    if in_type == "text":
        # NOTE(review): tokenize_sentence_nltk is not imported in this notebook (only the
        # *_spark variant is) — presumably it is provided by setupPySpark.py; confirm.
        strings = tokenize_sentence_nltk(strings)
        strings = DataFrame(strings)[0]
    elif in_type == "html":
        # For html input the result of read_file is indexed positionally:
        # [0] data, [1] meta data, [2] timestamp.
        timestamp = strings[2]
        meta_data = strings[1]
        strings = strings[0]
        strings[label] = meta_data["Comment"]
        labels = strings[label]
        strings = strings[column]
    else:
        if label in strings.columns:
            labels = strings[label]
        strings = strings[column]
else:
    strings = read_folder(in_file, in_type = in_type)
    # One pattern per line, wrapped in ".*" on both sides. The line terminator is stripped
    # so it does not become part of the pattern, and blank lines are skipped because they
    # would otherwise produce a match-everything pattern.
    with open(patterns_file, 'r') as patterns_handle:
        patterns = Series([
            ".*" + line.rstrip("\n") + ".*"
            for line in patterns_handle
            if line.strip()
        ])

In [6]:
# Joining the messages of each conversation into one string and creating the Spark data frame
# Note: the 'conversation' column is overwritten in place, so running this cell a second time
# without re-running the previous one fails (the values are already plain strings).
strings['conversation'] = strings['conversation'].apply(
    lambda conversation: ". ".join(conversation["Message"])
)
sentenceDataFrame = spark.createDataFrame(data = strings)

In [7]:
# Sentence tokenization: turns each conversation into a list of sentences
sentenceDataFrame = tokenize_sentence_nltk_spark(
    df = sentenceDataFrame,
    in_col = "conversation",
)

In [8]:
# Detecting the language of each conversation, then keeping only the rows tagged "en"
sentenceDataFrame = detect_language_spark(
    df = sentenceDataFrame,
    in_col = "conversation",
    out_col = "language",
)
sentenceDataFrame = sentenceDataFrame.filter(col('language') == "en")

In [9]:
# TreeTagger pass over the conversations: POS tags with lemmas, written to the "pos" column
sentenceDataFrame = run_treetagger_pos_tag_spark(
    df = sentenceDataFrame,
    in_col = "conversation",
    out_col = "pos",
    get_lemma = True,
)

In [10]:
# Spell correction on the "pos" column: two consecutive words are merged when
#   a) the words are incorrectly spelled and
#   b) the merged word is correctly spelled
sentenceDataFrame = spell_correct_tokens_spark(
    df = sentenceDataFrame,
    in_col = "pos",
)

In [11]:
# Flattening the token lists in the "pos" column, then running the word2vec model on them
sentenceDataFrame = flatten_list_of_tokens(sentenceDataFrame, in_col = "pos")
model, sentenceDataFrame = run_word2vec_model_pyspark(
    sentenceDataFrame,
    in_col = "pos",
    vec_size = 100,
    in_type = "tokens",
    out_col = "result",
)

In [12]:
# Collecting document vectors in a list.
# A single comprehension replaces the repeated `doc_vecs = doc_vecs + [...]`, which built a
# brand-new list on every row.
# Note: collect() returns all rows of the 'result' column to this Python process at once.
doc_vecs = [row['result'] for row in sentenceDataFrame.select('result').collect()]

In [13]:
# Computing similarities from the word2vec model; the preview below shows a word-by-word table
# NOTE(review): the exact similarity measure is defined in util_spark.get_semantic_similarity_spark — confirm
sim1 = get_semantic_similarity_spark(model)

In [14]:
# Previewing the first rows of the similarity table
sim1.head()

Unnamed: 0,for,this,in,move,have,your,point,guy,b2b,agree,...,no,we,let,ramification,when,away,and,today,the,cant
for,1.0,0.088893,0.172195,-0.322402,-0.046452,-0.219655,-0.110043,-0.117959,-0.00795,0.066608,...,-0.098829,-0.086558,0.045495,-0.139093,0.09395,-0.083673,-0.063062,-0.043828,0.126474,0.03878
this,0.088893,1.0,0.141377,0.081572,0.130635,0.150715,-0.107207,-0.239497,0.121205,0.032268,...,0.087727,0.188562,-0.029192,0.065307,0.017507,-0.021388,-0.220844,0.021283,0.02664,-0.009694
in,0.172195,0.141377,1.0,-0.196407,-0.1345,0.135836,-0.09318,-0.016781,0.064896,-0.233312,...,0.114512,0.047512,-0.249753,0.03334,-0.093499,0.074829,-0.058607,0.144752,0.011207,0.004448
move,-0.322402,0.081572,-0.196407,1.0,0.124379,0.142081,-0.001474,-0.024179,-0.039517,-0.001032,...,0.026875,0.166068,0.01384,0.226505,0.034336,-0.191075,0.168643,0.175773,0.113304,0.03605
have,-0.046452,0.130635,-0.1345,0.124379,1.0,-0.078544,0.052904,0.018566,0.022026,-0.025583,...,-0.009402,0.232484,0.042098,0.077349,0.049321,0.043578,0.07448,0.036704,0.094947,0.199208
