In [1]:
import findspark

In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Create the SparkSession for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('NLP_starter')
    .getOrCreate()
)

In [5]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer

In [6]:
from pyspark.sql.functions import udf 

In [7]:
from pyspark.sql.types import IntegerType

In [8]:
rdd = sc.textFile('/FileStore/tables/Purelyrics.txt/part-00000')

In [9]:
rdd.take(10)

In [10]:
def _index_first(pair):
    """Swap a (line, index) pair produced by zipWithIndex into (index, line)."""
    line, position = pair
    return (position, line)


# Number every line, then put the number first so it can serve as an id column.
rdd1 = rdd.zipWithIndex()
rdd2 = rdd1.map(_index_first)
rdd2.take(10)

In [11]:
# NOTE(review): the wildcard import is kept because later cells may rely on
# names it provides; prefer explicit imports once that has been checked.
from pyspark.sql.types import *

# Two nullable columns: the line index and the raw lyric text.
# NOTE(review): IntegerType is 32-bit; switch to LongType if the input can
# exceed 2**31 - 1 lines.
schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('lyrics', StringType(), True),
])
from pyspark.sql import SQLContext

# Create the DataFrame through the SparkSession. The original call used a
# `sqlContext` variable that is never defined in this notebook (only the
# SQLContext class is imported), which raises NameError on a fresh kernel.
df = spark.createDataFrame(rdd2, schema)

In [12]:
df.show(5)

In [13]:
tokenizer = Tokenizer(inputCol='lyrics', outputCol='words')

## More on regular expressions with Python
More on:
https://docs.python.org/3/library/re.html

### \W
Matches any character which is not a word character. This is the opposite of \w. If the ASCII flag is used this becomes the equivalent of [^a-zA-Z0-9_] (but the flag affects the entire regular expression, so in such cases using an explicit [^a-zA-Z0-9_] may be a better choice).

In [15]:
# UDF returning the length of an array column as an integer, used below to
# count the tokens in each row.
count_words = udf(
    f=lambda tokens: len(tokens),
    returnType=IntegerType(),
)

In [16]:
tokenized_df = tokenizer.transform(df)

In [17]:
tokenized_df.show(truncate=False)

In [18]:
tokenized_output = tokenized_df.collect()

In [19]:
for token in tokenized_output[4]:
    print (token)

In [20]:
tokenized_df.head()

In [21]:
tokenized_df.withColumn('counts', count_words('words')).show()

In [22]:
regex_tokenizer = RegexTokenizer(inputCol='lyrics', outputCol='words', pattern='\\W')
#regex_tokenizer.setMinTokenLength(4)

In [23]:
regex_df = regex_tokenizer.transform(df)

In [24]:
regex_tokenized_counts = regex_df.withColumn('freq', count_words('words'))

In [25]:
regex_tokenized_counts.show()

In [26]:
from pyspark.ml.feature import StopWordsRemover

In [27]:
remover = StopWordsRemover(inputCol='words', outputCol='tokens')

In [28]:
tokens_filtered = remover.transform(regex_tokenized_counts)

In [29]:
cleanDF= tokens_filtered.withColumn('count_tokens', count_words('tokens'))


In [30]:
cleanDF.select('words', 'freq', 'tokens', 'count_tokens').show()

In [31]:
cleanDF.show()

In [32]:
remover.getStopWords()

In [33]:
# Print every field of the fifth row (index 4) of cleanDF, one per line.
# take(5) fetches only the rows needed, instead of collect()-ing the whole
# DataFrame to the driver just to read a single row.
for item in cleanDF.take(5)[4]:
    print(item)

In [34]:
# Custom stop-word list. setStopWords replaces the list currently held by the
# remover (it does not extend it) and mutates `remover` in place, so every
# later remover.transform call uses these words only.
stopWords = [
    'a', 'is', 'for', 'hi', 'in',
    'on', 'row', 'lyrics', 'u',
]
remover.setStopWords(stopWords)


In [35]:
remover.transform(regex_tokenized_counts).select('lyrics', 'tokens').show(truncate=False)

In [36]:
newCleanDF=remover.transform(regex_tokenized_counts).withColumn('count_tokens', count_words('tokens'))
newCleanDF.show()


In [37]:
from pyspark.ml.feature import NGram

In [38]:
ngram = NGram(n=2, inputCol='tokens', outputCol='2grams')

In [39]:
#my_2ngrams =ngram.transform(cleanDF)
my_2ngrams =ngram.transform(newCleanDF)

In [40]:
 my_2ngrams.show()

In [41]:
my_2ngrams.select('2grams').show(truncate =False)

In [42]:
from pyspark.ml.feature import HashingTF, IDF

In [43]:
cleanDF.show()

In [44]:
tf = HashingTF(inputCol='tokens', outputCol='features')

# TF: Term Frequency
Maps a sequence of terms to their term frequencies using the hashing trick.

Note: the terms must be hashable (so they cannot be dictionaries or lists, for example).

HashingTF(S) takes the hash code of each word modulo the desired vector size S, and thus maps each word to a number between 0 and S-1.

This yields a quite robust vector even if multiple words may map to the same hash code. 

The MLlib developers recommend setting S between $2^{18}$ and $2^{20}$.

In [46]:
tf.explainParams()


In [47]:
print (tf.getNumFeatures())

In [48]:
tf_df = tf.transform(newCleanDF)

In [49]:
tf_df.show()

In [50]:
# IDF estimator: reads the term-frequency vectors in 'features' and writes the
# rescaled vectors to 'idf_features'.
idf = IDF(
    inputCol='features',
    outputCol='idf_features',
)

# TF-IDF: Term Frequency - Inverse Document Frequency

Once you have TF vectors, you can use IDF to compute the inverse document frequencies and multiply them with the TF to compute the TF-IDF 

IDF measures how infrequently a term occurs across the whole document corpus

TF x IDF shows how relevant a term is to specific document (i.e., if it is common in that document but rare in the whole corpus)

TF-IDF is used to improve on Bag of Words by adjusting word counts based on their frequency in the corpus

## How to calculate them?
Various ways for determining the exact values of both statistics exist:

- TF(x, y): number of occurrences of term x in document y. It represents the importance of the term within that document.

- IDF(t): how rare the term t is across the whole corpus; the fewer documents contain t, the higher its IDF.

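- $\mathrm{IDF}(t) = \log\left(\frac{N}{N(t)}\right)$
- $N$: number of documents in the corpus $D$, i.e. $N = |D|$
- $N(t)$: number of documents in which the term $t$ appears (i.e. $\mathrm{TF}(t, d) \neq 0$): $N(t) = |\{d \in D : t \in d\}|$
- $\mathrm{TF\text{-}IDF}(t, d) = \mathrm{TF}(t, d) \times \mathrm{IDF}(t)$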

## Further Information on TF-IDF
https://fr.wikipedia.org/wiki/TF-IDF

## Preparing Data to TF-IDF
In a real pipeline, you will likely need to preprocess and stem words before passing them to TF.

Ex: convert words to lowercase, drop punctuation characters or drop suffixes like ‘ing’.

You can use external single node natural language libraries like NLTK (http://www.nltk.org)

In [52]:
idf_model = idf.fit(tf_df)

In [53]:
data=idf_model.transform(tf_df)

In [54]:
data.show()

In [55]:
from pyspark.ml.feature import CountVectorizer

In [56]:
count_vec = CountVectorizer(inputCol='tokens', outputCol='features',  minDF=1)

In [57]:
#help(   CountVectorizer)

In [58]:
model = count_vec.fit(newCleanDF)

In [59]:
data = model.transform(newCleanDF)

In [60]:
data.select(['tokens', 'features']).show(truncate = False)

In [61]:
# Write the rows of `data` as text files under the given directory.
# NOTE(review): saveAsTextFile raises if the target directory already exists, so
# this cell fails on a second run unless the path is removed first - confirm
# whether that is acceptable for this notebook.
data.rdd.saveAsTextFile('/FileStore/tables/feature')

In [62]:
count_vec = CountVectorizer(inputCol='tokens', outputCol='features', minDF=2)

In [63]:
count_vec.fit(cleanDF).transform(cleanDF).select('tokens', 'features').show(truncate = False)

In [64]:
count_vec = CountVectorizer(inputCol='tokens', outputCol='features', vocabSize=15, minDF=1)

In [65]:
count_vec.fit(cleanDF).transform(cleanDF).select('tokens', 'features').show(truncate = False)