In [1]:
from pyspark.sql import SparkSession
# One shared Spark session for the notebook, registered under the app name "RS".
session_builder = SparkSession.builder.appName("RS")
spark = session_builder.getOrCreate()

# All three article files are read the same way: the first row is treated as
# the header and column types are inferred separately for each file.
csv_read_options = {"inferSchema": True, "header": True}
df = spark.read.csv("/FileStore/tables/articles1.csv", **csv_read_options)
data2 = spark.read.csv("/FileStore/tables/articles2.csv", **csv_read_options)
data3 = spark.read.csv("/FileStore/tables/articles3.csv", **csv_read_options)

In [2]:
# Append the rows loaded from the second articles file (`data2`) to `df`.
# `df` is rebound to the result, so re-running this cell on a live kernel
# unions `data2` in a second time.
# NOTE(review): DataFrame.union pairs columns by position, not by name, and
# each file was read with its own inferred schema — confirm the files share
# column order and types.
df = df.union(data2)

In [3]:
# Append the rows loaded from the third articles file (`data3`) to `df`.
# `df` is rebound to the result, so re-running this cell on a live kernel
# unions `data3` in a second time.
# NOTE(review): DataFrame.union pairs columns by position, not by name, and
# each file was read with its own inferred schema — confirm the files share
# column order and types.
df = df.union(data3)

In [4]:
# Preview the combined frame (all columns) as a sanity check after the unions.
df.show()

In [5]:
# Keep only the identifier and the two text columns that later cells use.
kept_columns = ["id", "title", "content"]
df = df.select(*kept_columns)

In [6]:
# Preview again, now that `df` is restricted to the id / title / content columns.
df.show()

In [7]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer

In [8]:
from pyspark.sql.functions import col, udf

In [9]:
from pyspark.sql.types import IntegerType

In [10]:
# Both tokenizers read the `content` column and write their tokens to `words`.
token_io = {'inputCol': 'content', 'outputCol': 'words'}

# Plain Tokenizer with its default settings.
tokenizer = Tokenizer(**token_io)

# Regex variant whose pattern matches runs of non-alphanumeric characters.
# This instance is replaced by a re-definition in a later cell before it is
# ever used for a transform.
regex_tokenizer = RegexTokenizer(pattern=r'[^a-zA-Z0-9]+', **token_io)

In [11]:
def _token_count(words):
    """Return the number of entries in `words`."""
    return len(words)


# Spark UDF wrapper so the count can be used as an integer column expression.
count_tokens = udf(_token_count, IntegerType())

In [12]:
# Run the plain `tokenizer` (content -> words) over the articles.
# NOTE(review): null `content` / `title` values are only replaced with
# "unknown" in a later cell, so `df` may still contain nulls at this point —
# confirm Tokenizer copes with them.
tokenized = tokenizer.transform(df)

In [13]:
# Preview the Tokenizer output, including the new `words` column.
tokenized.show()

In [14]:
# Add a per-row token count (`tokens`) next to the Tokenizer output and preview it.
tokens_per_row = count_tokens(col('words'))
tokenized_with_counts = tokenized.withColumn('tokens', tokens_per_row)
tokenized_with_counts.show()

In [15]:
# Replace missing text with the placeholder "unknown" before tokenizing.
df = df.fillna("unknown", subset=["content", "title"])

# Regex tokenizer over `content` using the non-alphanumeric pattern, with the
# minimum token length set to 2.
regex_tokenizer = RegexTokenizer(
    inputCol='content',
    outputCol='words',
    pattern='[^a-zA-Z0-9]+',
    minTokenLength=2,
)
rg_tokenized = regex_tokenizer.transform(df)

In [16]:
# Preview the RegexTokenizer output, including its `words` column.
rg_tokenized.show()

In [17]:
# Same token-count preview as before, this time for the RegexTokenizer output.
regex_tokens_per_row = count_tokens(col('words'))
rg_tokenized.withColumn('tokens', regex_tokens_per_row).show()

In [18]:
from pyspark.ml.feature import StopWordsRemover

In [19]:
# Stop-word filter: reads the `words` tokens and writes the result to `filtered`.
remover = StopWordsRemover().setInputCol('words').setOutputCol('filtered')

In [20]:
# Preview the stop-word removal result (the tokenized frame plus `filtered`).
stopwords_preview = remover.transform(rg_tokenized)
stopwords_preview.show()

In [21]:
# Remove stop words, then keep id / title and expose the filtered tokens
# under the name `words`.
without_stopwords = remover.transform(rg_tokenized)
final_df = without_stopwords.select(
    col('id'),
    col('title'),
    col('filtered').alias('words'),
)

In [22]:
# Preview the cleaned frame: id, title and the stop-word-filtered `words`.
final_df.show()

In [23]:
from pyspark.ml.feature import HashingTF, IDF

In [24]:
# Term-frequency hasher: reads the `words` tokens and writes `rawFeatures`.
hashing_tf = HashingTF().setInputCol('words').setOutputCol('rawFeatures')

In [25]:
# Hash each article's token list into a term-frequency vector (`rawFeatures`)
# and mark the result for caching, since later cells read it several times
# (preview, IDF fit, IDF transform).
# The previous `featurized_data.cache` only referenced the bound method
# without calling it, so nothing was ever cached. `cache()` returns the same
# DataFrame, so chaining it keeps `featurized_data` bound to the hashed frame.
featurized_data = hashing_tf.transform(final_df).cache()

In [26]:
# Preview the hashed term-frequency vectors in the `rawFeatures` column.
featurized_data.show()

In [27]:
# IDF stage: reads the `rawFeatures` vectors and writes the result to `features`.
idf = IDF().setInputCol('rawFeatures').setOutputCol('features')

In [28]:
# Fit the IDF stage on the term-frequency vectors in `featurized_data`,
# producing the fitted model used to rescale them in the next cell.
idf_model = idf.fit(featurized_data)

In [29]:
# Apply the fitted IDF model to the same frame it was fitted on; the output
# goes to the `features` column configured on the IDF stage.
rescaled_data = idf_model.transform(featurized_data)

In [30]:
# Final item frame: identifier, title, cleaned tokens and the IDF-rescaled vectors.
ii_columns = ["id", "title", "words", "features"]
ii_df = rescaled_data.select(*ii_columns)

In [31]:
# Fetch the first row to spot-check one article's id, title, words and features.
ii_df.first()