In [13]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

import sys
# Spark session for a single-machine run: "local[30]" asks for 30 worker threads
# inside the driver process.
conf = SparkConf()
conf.set("spark.driver.memory", "100g")
conf.set("spark.executor.memory", "100g")
conf.set("spark.master", "local[30]")
conf.set("spark.driver.maxResultSize", "100g")
# The Spark configuration docs require spark.executor.heartbeatInterval to be
# significantly less than spark.network.timeout. Both used to be set to the same
# value ("1000000000s"); the long network timeout is kept so slow stages are not
# declared dead, while the heartbeat goes back to a short interval.
conf.set("spark.executor.heartbeatInterval", "60s")
conf.set("spark.network.timeout", "1000000000s")
spark = SparkSession.builder.config(conf=conf).appName("convertProfile").getOrCreate()

from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import Tokenizer, RegexTokenizer, CountVectorizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StopWordsRemover


from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.ml.linalg import SparseVector, DenseVector
from pyspark.sql.types import *
from pyspark.sql.functions import lit, col, regexp_replace
from pyspark.sql.functions import split, explode




from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import Tokenizer, RegexTokenizer, CountVectorizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType


from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.ml.linalg import SparseVector, DenseVector
from pyspark.sql.types import *
from pyspark.sql.functions import lit, col, regexp_replace
from pyspark.sql.functions import split, explode


In [63]:
class PokeProfile(object):
    """Build top-word count features from a text column of a CSV with Spark ML.

    Typical call order: ``readProfile`` -> ``formatHeaders`` -> ``tokenize``
    (once per text column) -> ``flattenVectorColumns``. ``saveVocabulary`` and
    ``saveOutput`` persist results. Every step reads and reassigns
    ``self.profiles_``.

    The methods rely on names defined at notebook level: the ``spark`` session
    and the pyspark imports (``RegexTokenizer``, ``StopWordsRemover``,
    ``CountVectorizer``, ``udf``, ``ArrayType``, ``DoubleType``).

    The class name and several defaults refer to the Pokec data set, which
    can be downloaded at http://snap.stanford.edu/data/soc-Pokec.html
    or by running:
        wget http://snap.stanford.edu/data/soc-pokec-relationships.txt.gz .
        wget http://snap.stanford.edu/data/soc-pokec-profiles.txt.gz .
        zcat soc-pokec-relationships.txt.gz > soc-pokec-relationships.txt
        zcat soc-pokec-profiles.txt.gz > soc-pokec-profiles.txt
        mkdir output # place to save output parquet
        mkdir vocabulary # place to save vocabulary (vector-word mapping)
    The default input file, however, is ``data/seminar_year.csv`` and the
    vocabulary is written under ``data/vocabulary/``.
    """

    def __init__(self, userFile="data/seminar_year.csv"):
        """
        :type userFile: str: path of the comma-delimited file read by readProfile
        """
        self.userFile = userFile
        # column name -> number of words actually kept by the fitted
        # CountVectorizer (filled in by tokenize, read by flattenVectorColumns)
        self.vocabSizes = {}

    def readProfile(self):
        """
        read the profile data from self.userFile into self.profiles_
        (comma delimited, no header option set); nulls are replaced by ""
        """
        self.profiles_ = spark.read.option("delimiter", ",").csv(self.userFile)
        self.profiles_ = self.profiles_.na.fill("")

    def formatHeaders(self, headers=None):
        """
        provide the headers of the data frame
        :type headers: list[str] or None: new column names, applied by position;
            None means ["user_id", "year", "title"]. Extra names or extra
            columns are ignored (zip stops at the shorter sequence).
        :rtype: None
        """
        # None sentinel instead of a list literal: a mutable default is shared
        # between calls.
        if headers is None:
            headers = ["user_id", "year", "title"]
        for c, n in zip(self.profiles_.columns, headers):
            self.profiles_ = self.profiles_.withColumnRenamed(c, n)

    def tokenize(self, inputColProfile = "I_am_working_in_field", vocSize = 40, minDF = 1.0):
        """
        tokenize string column, count the occurrence of words and then use the occurrence of the top words as vector
        :type inputColProfile: str: column to extract the vector
        :type vocSize: int: maximum number of words to count
        :type minDF: float: minimum document frequency of the word
        :rtype: None

        On success the input column is replaced by "<inputColProfile>_dense"
        (array of doubles) and self.model holds the fitted CountVectorizer model.
        On failure a message is printed, self.model is None and the
        intermediate word columns are removed; the input column is kept.
        """
        self.vocSize = vocSize
        self.minDF = minDF
        self.inputColProfile = inputColProfile
        self.outputColProfile = "{}_words".format(self.inputColProfile)
        self.outputColProfileStop = "{}_words_stp".format(self.inputColProfile)
        self.outputTokensColProfile = "{}_tokens".format(self.inputColProfile)
        self.outputTokensDenseColProfile = "{}_dense".format(self.inputColProfile)

        # Split on every non-word character and every digit.
        regexTokenizer = RegexTokenizer(inputCol=self.inputColProfile, outputCol=self.outputColProfile, pattern="\\W|\\d")
        self.profiles_ = regexTokenizer.transform(self.profiles_)

        remover = StopWordsRemover(inputCol=self.outputColProfile, outputCol=self.outputColProfileStop)
        self.profiles_ = remover.transform(self.profiles_)

        self.cv = CountVectorizer(inputCol=self.outputColProfileStop,
                                  outputCol=self.outputTokensColProfile,
                                  vocabSize=self.vocSize, minDF=self.minDF)
        # Forget the model of a previously tokenized column so that a failure
        # below cannot leave a stale vocabulary behind.
        self.model = None
        try:
            model = self.cv.fit(self.profiles_)
            self.profiles_ = model.transform(self.profiles_)
            # Spark ML vectors are not plain SQL arrays; convert them so the
            # individual counts can be selected by index later.
            vector_udf = udf(lambda vector: vector.toArray().tolist(), ArrayType(DoubleType()))

            self.profiles_ = self.profiles_.withColumn(self.outputTokensDenseColProfile, vector_udf(self.outputTokensColProfile))
            self.profiles_ = self.profiles_.drop(self.inputColProfile)
            self.profiles_ = self.profiles_.drop(self.outputColProfile)
            self.profiles_ = self.profiles_.drop(self.outputColProfileStop)
            self.profiles_ = self.profiles_.drop(self.outputTokensColProfile)
        except Exception as exc:
            # Best effort: keep going with the remaining columns, but report
            # why this one failed and clean up both intermediate columns.
            print("Tokenizing {} Failed: {!r}".format(self.inputColProfile, exc))
            self.profiles_ = self.profiles_.drop(self.outputColProfile)
            self.profiles_ = self.profiles_.drop(self.outputColProfileStop)
        else:
            self.model = model
            sizes = getattr(self, "vocabSizes", None)
            if sizes is None:
                sizes = self.vocabSizes = {}
            # vocabSize is only an upper bound; remember the real length.
            sizes[self.inputColProfile] = len(model.vocabulary)

    def flattenVectorColumns(self):
        """
        convert from
            col1=[0,1,2], col2=[0,1,2], col3=3, col4=0
            to
            col1.0,col1.1,col1.2,col2.0, col2.1,col2.2, col3, col4

        Uses the column batch selected by the last listStringColumns call
        (self.index, self.cnt_each). The result, "user_id" and "year" plus one
        column per vocabulary slot, is stored in self.profiles_flatten.
        """
        baseColumns = self.listStringColumns(self.index, self.cnt_each)
        stringColumns = [column + "_dense" for column in baseColumns]
        vocabSizes = getattr(self, "vocabSizes", {})
        # Use the fitted vocabulary length where known: indexing up to vocSize
        # would run past the end of the array when fewer words were kept.
        self.newColumns = [self.profiles_[column + "_dense"][i]
                           for column in baseColumns
                           for i in range(vocabSizes.get(column, self.vocSize))]
        self.nonstringColumns = [column for column in self.profiles_.columns if column not in stringColumns]
        self.profiles_flatten = self.profiles_.select(["user_id", "year"] + self.newColumns)

    def saveVocabulary(self):
        """
        save the vocabulary to a separate file;
        vocabulary can work as a look up table for the word given the index in the word vector

        Writes data/vocabulary/<inputColProfile>.txt (tab separated: index, word),
        creating the directory if needed. Nothing is written when the last
        tokenize call did not produce a model.
        """
        import os
        import pandas as pd
        model = getattr(self, "model", None)
        if model is None:
            print("No vocabulary available for {}; nothing saved".format(self.inputColProfile))
            return
        outputDir = "data/vocabulary"
        if not os.path.isdir(outputDir):
            os.makedirs(outputDir)
        pd.DataFrame(model.vocabulary).to_csv("data/vocabulary/{}.txt".format(self.inputColProfile),
                                              sep='\t', encoding='utf-8', header=False)

    def listStringColumns(self, index = 0, cnt_each = 10):
        """
        list of string columns in the data
        :type index: int: zero-based batch number
        :type cnt_each: int: number of columns per batch
        :rtype: list[str]: the columns of that batch (empty past the end)

        Also records index, cnt_each and the total number of string columns
        (self.cnt_string) on the instance.
        """
        self.index = index
        self.cnt_each = cnt_each
        all_columns = ["title"]
        self.cnt_string = len(all_columns)
        start, end = index*cnt_each, (index+1)*cnt_each
        return all_columns[start:end]

    def saveOutput(self, data, outputfile = "soc-pokec-profiles-vector", save_format = "parquet"):
        """
        save data as a single partition
        :type data: Spark DataFrame to write
        :type outputfile: str: output path without extension
        :type save_format: str: "parquet" writes <outputfile>.parquet,
            anything else writes <outputfile>.csv
        :rtype: None
        """
        if save_format == "parquet":
            data.repartition(1).write.parquet("{}.parquet".format(outputfile))
        else:
            data.repartition(1).write.csv("{}.csv".format(outputfile))

from pyspark.sql.functions import array, col, explode, struct, lit

# Small example frame for trying out to_long. Built from the `spark` session
# created at the top of the notebook; the previous version used `sc`, which is
# never defined in this notebook.
df = spark.createDataFrame([(1, 0.0, 0.6), (1, 0.6, 0.7)], ["A", "col_1", "col_2"])

def to_long(df, by):
    """Unpivot a wide DataFrame into long (key, val) form.

    Every column of ``df`` that is not listed in ``by`` becomes one output row
    per input row, with the column name in ``key`` and its value in ``val``.

    :param df: Spark DataFrame to reshape.
    :param by: list or tuple of column names to keep as identifier columns.
    :returns: DataFrame with the ``by`` columns followed by ``key`` and ``val``.
    :raises ValueError: if no column is left to unpivot.
    :raises AssertionError: if the unpivoted columns do not share one dtype.
    """
    # `by + [...]` below needs a real list; accept tuples and other iterables.
    by = list(by)

    # Keep (name, dtype) of every column that is not an identifier.
    value_columns = [(name, dtype) for name, dtype in df.dtypes if name not in by]
    if not value_columns:
        raise ValueError("to_long: no columns left to unpivot after excluding {}".format(by))
    cols = [name for name, _ in value_columns]

    # Spark SQL supports only homogeneous columns inside an array. Raised
    # explicitly (same exception type as before) so the check also runs when
    # Python is started with -O.
    if len(set(dtype for _, dtype in value_columns)) != 1:
        raise AssertionError("All columns have to be of the same type")

    # Create and explode an array of (column_name, column_value) structs
    kvs = explode(array([
        struct(lit(c).alias("key"), col(c).alias("val")) for c in cols
    ])).alias("kvs")

    return df.select(by + [kvs]).select(by + ["kvs.key", "kvs.val"])

DataFrame[user_id: string, year: string, key: string, val: double]

In [73]:
# Pipeline object used by all following cells; its input path is PP.userFile.
PP = PokeProfile()

In [74]:
# Load the CSV named by PP.userFile into PP.profiles_ (nulls replaced by ""),
# then rename the columns by position to user_id / year / title.
PP.readProfile()
PP.formatHeaders()

In [75]:
# Peek at the first three rows to check the load and the renamed columns.
PP.profiles_.head(3)

[Row(user_id=u'Ang', year=u'2018', title=u'March with and Feet: The Talking About Protests and Beyond'),
 Row(user_id=u'Ang', year=u'2018', title=u'Keeping up on Current Events! A Case of Newcomers to Wikipedia'),
 Row(user_id=u'Ang', year=u'2018', title=u'Exploration of Online Health Support Groups Through the Lens of Sentiment Analysis')]

In [76]:
# Vectorise the text columns batch by batch (one column per batch, one batch in
# total): each column is replaced by its top-40 word counts and the fitted
# vocabulary is written to disk. Flattening happens in a later cell.
for batch_index in range(1):
    print(batch_index)
    batch_columns = PP.listStringColumns(batch_index, cnt_each=1)
    for column_name in batch_columns:
        PP.tokenize(inputColProfile=column_name, vocSize=40)
        PP.saveVocabulary()

0


In [77]:
# Expand each "<column>_dense" array into one column per vocabulary slot; the
# result (user_id, year + slot columns) is stored in PP.profiles_flatten.
PP.flattenVectorColumns()

In [81]:
# Give the flattened slot columns readable names: the two identifier columns
# keep their names and slot i is renamed to the i-th vocabulary word.
# Pairing is positional; columns beyond the end of `headers` are left as is.
headers = ["user_id", "year"] + PP.model.vocabulary
rename_pairs = list(zip(PP.profiles_flatten.columns, headers))
flattened = PP.profiles_flatten
for current_name, target_name in rename_pairs:
    flattened = flattened.withColumnRenamed(current_name, target_name)
PP.profiles_flatten = flattened


In [82]:
# Check the wide layout: one count column per vocabulary word.
PP.profiles_flatten.head(3)

[Row(user_id=u'Ang', year=u'2018', networks=0.0, social=0.0, anomaly=0.0, twitter=0.0, dynamic=0.0, academic=0.0, hypersphere=0.0, conferences=0.0, detection=0.0, mobile=0.0, analysis=0.0, media=0.0, modeling=0.0, information=0.0, event=0.0, sentiment=0.0, unsupervised=0.0, beyond=1.0, robust=0.0, process=0.0, scale=0.0, march=1.0, feet=1.0, embedding=0.0, effect=0.0, discovery=0.0, method=0.0, protests=1.0, visual=0.0, seeking=0.0, data=0.0, role=0.0, large=0.0, analytics=0.0, talking=1.0, events=0.0, time=0.0, streaming=0.0, deep=0.0, regression=0.0),
 Row(user_id=u'Ang', year=u'2018', networks=0.0, social=0.0, anomaly=0.0, twitter=0.0, dynamic=0.0, academic=0.0, hypersphere=0.0, conferences=0.0, detection=0.0, mobile=0.0, analysis=0.0, media=0.0, modeling=0.0, information=0.0, event=0.0, sentiment=0.0, unsupervised=0.0, beyond=0.0, robust=0.0, process=0.0, scale=0.0, march=0.0, feet=0.0, embedding=0.0, effect=0.0, discovery=0.0, method=0.0, protests=0.0, visual=0.0, seeking=0.0, dat

In [83]:
# Reshape from wide (one column per word) to long (user_id, year, key, val).
# NOTE: this overwrites its own input, so the cell is not re-runnable as is:
# on the already-long frame to_long sees key (string) and val (double) and
# trips its "same type" check. Re-run the flatten/rename cells first.
PP.profiles_flatten = to_long(PP.profiles_flatten, ["user_id", "year"])

In [84]:
# Check the long layout: one row per (user_id, year, word).
PP.profiles_flatten.head(3)

[Row(user_id=u'Ang', year=u'2018', key=u'networks', val=0.0),
 Row(user_id=u'Ang', year=u'2018', key=u'social', val=0.0),
 Row(user_id=u'Ang', year=u'2018', key=u'anomaly', val=0.0)]

In [86]:
# Write the long table as header-less CSV, replacing any previous output.
# The frame is first collapsed to one partition so the output is not split
# across several part files.
output_path = "data/scholar_top_{}.csv".format(PP.vocSize)
PP.profiles_flatten = PP.profiles_flatten.repartition(1)
PP.profiles_flatten.write.csv(path=output_path, header=False, mode="overwrite")

In [93]:
# Load the long (member, year, keyword, value) table into pandas.
# NOTE(review): the Spark cell above writes "data/scholar_top_<vocSize>.csv";
# how "data/picso.csv" is produced from it is not shown here - confirm.
import pandas as pd
# Alias so the commented-out snippet below can run unchanged.
self = PP
# The file has no header row, so pandas assigns integer labels first ...
picso = pd.read_csv("data/picso.csv", header=None)
# ... which are then replaced by meaningful names.
picso.columns = ['member', 'year', 'keyword', 'value']
# Leftover from another analysis; `policy` is not defined in this notebook.
# policy_group = policy.groupby(self.column)['adoption'].sum()
# policy_group1 = policy_group.unstack(fill_value=0).to_panel()
# self.hist = policy_group1.fillna(0).values


In [94]:
# Peek at the loaded table.
# NOTE(review): the saved output shows integer column labels, which suggests it
# was captured before the column rename in the previous cell - re-run to confirm.
picso.head(3)

Unnamed: 0,0,1,2,3
0,Ang,2018,networks,0
1,Ang,2018,social,0
2,Ang,2018,anomaly,0


In [95]:
# Column labels of the loaded table.
# NOTE(review): the saved output lists 0..3 rather than the assigned names;
# presumably stale - re-run to confirm.
picso.columns

Int64Index([0, 1, 2, 3], dtype='int64')