In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, lower, regexp_replace, desc, asc
from pyspark.sql.types import ArrayType, StringType, StructType, StructField, FloatType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import Normalizer
import numpy as np
import spacy
from scipy.spatial.distance import cosine

In [0]:
spark = SparkSession.builder \
    .appName("Text Processing Task") \
    .getOrCreate()

In [0]:
data_path = "/FileStore/tables/ukr_text.csv"
df = spark.read.csv(data_path, header=True)
df = df.select("id", "title", "body").na.drop()
df = df.withColumn(
    "body",
    regexp_replace(lower(col("body")), "[\\n\\r\\t]+|<[^>]*>", "")
)
df.show()

+--------------------+--------------------+--------------------+
|                  id|               title|                body|
+--------------------+--------------------+--------------------+
|http://k.img.com....|Кличко покликав н...|"київ - перспекти...|
|http://k.img.com....|З'явилося відео, ...|"   з'явилося від...|
|http://k.img.com....|У центрі Києва по...|у києві на бессар...|
|http://k.img.com....|Нічний ураган пер...|київ вночі 16 сер...|
|http://k.img.com....|Потоп у Києві: ст...|уночі київ вкотре...|
|http://k.img.com....|У Києві потрапив ...|"колишній народни...|
|http://k.img.com....|У Києві пограбува...|"у києві троє нев...|
|http://k.img.com....|У Києві обмежать ...|у києві на вихідн...|
|http://k.img.com....|"У Києві ""заміну...|"на майдані незал...|
|http://k.img.com....|У Києві посилять ...|у києві у зв'язку...|
|http://k.img.com....|У Києві інспектор...|у вересні набудут...|
|http://k.img.com....|Поліція затримала...|"поліція затримал...|
|http://k.img.com....|У К

In [0]:
!python -m spacy download uk_core_news_sm

Collecting uk-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/uk_core_news_sm-3.7.0/uk_core_news_sm-3.7.0-py3-none-any.whl (14.9 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/14.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/14.9 MB[0m [31m3.5 MB/s[0m eta [36m0:00:05[0m[2K     [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/14.9 MB[0m [31m5.9 MB/s[0m eta [36m0:00:03[0m[2K     [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/14.9 MB[0m [31m6.9 MB/s[0m eta [36m0:00:03[0m[2K     [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/14.9 MB[0m [31m8.1 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/14.9 MB[0m [31m9.2 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━[0m[90m╺[0m[90m

In [0]:
tokenizer = Tokenizer(inputCol="body", outputCol="tokens")
tokenized_df = tokenizer.transform(df)

stopwords_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
filtered_df = stopwords_remover.transform(tokenized_df)

def remove_empty_elements(text):
    return [word.strip() for word in text if word.strip()]

remove_empty_udf = udf(remove_empty_elements, ArrayType(StringType()))
filtered_df = filtered_df.withColumn("filtered_tokens", remove_empty_udf(col("filtered_tokens")))

nlp = spacy.load("uk_core_news_sm")

def lemmatize_text(text):
    doc = nlp(" ".join(text))
    return [token.lemma_ for token in doc]

lemmatizer_udf = udf(lemmatize_text, ArrayType(StringType()))
lemmatized_df = filtered_df.withColumn("lemmatized", lemmatizer_udf(col("filtered_tokens")))

lemmatized_df.write.mode("overwrite").parquet("output/lemmatized.parquet")

lemmatized_df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                  id|               title|                body|              tokens|     filtered_tokens|          lemmatized|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|http://k.img.com....|Кличко покликав н...|"київ - перспекти...|["київ, -, перспе...|["київ, -, перспе...|[", київ, -, перс...|
|http://k.img.com....|З'явилося відео, ...|"   з'явилося від...|[" , , з'явилося,...|[", з'явилося, ві...|[", з'явитися, ві...|
|http://k.img.com....|У центрі Києва по...|у києві на бессар...|[у, києві, на, бе...|[у, києві, на, бе...|[у, кий, на, бесс...|
|http://k.img.com....|Нічний ураган пер...|київ вночі 16 сер...|[київ, вночі, 16,...|[київ, вночі, 16,...|[київ, вночі, 16,...|
|http://k.img.com....|Потоп у Києві: ст...|уночі київ вкотре...|[уночі, київ, вко...|[уночі, київ, вко..

In [0]:
vectorizer = CountVectorizer(inputCol="lemmatized", outputCol="features")
bow_model = vectorizer.fit(lemmatized_df)
bow_df = bow_model.transform(lemmatized_df)

bow_df.select("id", "features").write.mode("overwrite").parquet("output/bow.parquet")

bow_df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                  id|               title|                body|              tokens|     filtered_tokens|          lemmatized|            features|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|http://k.img.com....|Кличко покликав н...|"київ - перспекти...|["київ, -, перспе...|["київ, -, перспе...|[", київ, -, перс...|(18652,[0,1,2,3,4...|
|http://k.img.com....|З'явилося відео, ...|"   з'явилося від...|[" , , з'явилося,...|[", з'явилося, ві...|[", з'явитися, ві...|(18652,[0,1,2,3,4...|
|http://k.img.com....|У центрі Києва по...|у києві на бессар...|[у, києві, на, бе...|[у, києві, на, бе...|[у, кий, на, бесс...|(18652,[0,1,2,4,6...|
|http://k.img.com....|Нічний ураган пер...|київ вночі 16 сер...|[київ, вночі, 16,...|[київ, вночі, 16,...|

In [0]:
idf = IDF(inputCol="features", outputCol="tfidf")
idf_model = idf.fit(bow_df)
tfidf_df = idf_model.transform(bow_df)

vocab = bow_model.vocabulary
def extract_top_words(tfidf_vector, vocab, top_n=10):
    indices = np.argsort(tfidf_vector.toArray())[::-1][:top_n]
    indices = [int(i) for i in indices]
    return [(vocab[i], float(tfidf_vector[i])) for i in indices]

extract_top_words_udf = udf(
    lambda vector: [(word, score) for word, score in extract_top_words(vector, vocab)],
    ArrayType(StructType([
        StructField("word", StringType(), True),
        StructField("score", FloatType(), True)
    ]))
)
top_words_df = tfidf_df.withColumn("top_words", extract_top_words_udf(col("tfidf")))

top_words_df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                  id|               title|                body|              tokens|     filtered_tokens|          lemmatized|            features|               tfidf|           top_words|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|http://k.img.com....|Кличко покликав н...|"київ - перспекти...|["київ, -, перспе...|["київ, -, перспе...|[", київ, -, перс...|(18652,[0,1,2,3,4...|(18652,[0,1,2,3,4...|[{київ, 8.889948}...|
|http://k.img.com....|З'явилося відео, ...|"   з'явилося від...|[" , , з'явилося,...|[", з'явилося, ві...|[", з'явитися, ві...|(18652,[0,1,2,3,4...|(18652,[0,1,2,3,4...|[{водій, 18.88469...|
|http://k.img.com....|У центрі Києва по...|у 

In [0]:
def cosine_similarity(v1, v2):
    if v1 is None or v2 is None:
        return float(0)
    dot_product = np.dot(v1.toArray(), v2.toArray())
    norm_a = np.linalg.norm(v1.toArray())
    norm_b = np.linalg.norm(v2.toArray())
    return float(dot_product / (norm_a * norm_b))

cosine_similarity_udf = udf(cosine_similarity, FloatType())

tokenizer = Tokenizer(inputCol="title", outputCol="title_tokens")
tokenized_df = tokenizer.transform(df)

stopwords_remover = StopWordsRemover(inputCol="title_tokens", outputCol="filtered_title_tokens")
filtered_df = stopwords_remover.transform(tokenized_df)

def remove_empty_elements(text):
    return [word.strip() for word in text if word.strip()]

remove_empty_udf = udf(remove_empty_elements, ArrayType(StringType()))
filtered_df = filtered_df.withColumn("filtered_title_tokens", remove_empty_udf(col("filtered_title_tokens")))
lemmatized_df = filtered_df.withColumn("lemmatized", lemmatizer_udf(col("filtered_title_tokens")))

vectorized_title = bow_model.transform(lemmatized_df).withColumnRenamed("features", "title_features")
vectorized_body = tfidf_df.withColumnRenamed("features", "body_features").select("Id", "body_features")

similarity_df = vectorized_body.join(vectorized_title.select("Id", "title_features"), on="Id", how="inner")
similarity_df = similarity_df.withColumn("cosine_similarity", cosine_similarity_udf(col("title_features"), col("body_features")))

similarity_df.show()

+--------------------+--------------------+--------------------+-----------------+
|                  Id|       body_features|      title_features|cosine_similarity|
+--------------------+--------------------+--------------------+-----------------+
|http://k.img.com....|(18652,[0,1,2,3,4...|(18652,[17,206,35...|       0.18107149|
|http://k.img.com....|(18652,[0,1,2,3,4...|(18652,[0,2,6,33,...|       0.46589082|
|http://k.img.com....|(18652,[0,1,2,4,6...|(18652,[6,109,194...|       0.23643312|
|http://k.img.com....|(18652,[0,1,4,6,7...|(18652,[4,2176,28...|       0.31622776|
|http://k.img.com....|(18652,[0,1,2,4,6...|(18652,[6,8,19,36...|       0.16506252|
|http://k.img.com....|(18652,[0,1,2,3,4...|(18652,[2,5,6,143...|        0.2248595|
|http://k.img.com....|(18652,[0,1,2,3,4...|(18652,[6,7,364,7...|       0.42133242|
|http://k.img.com....|(18652,[0,1,2,4,5...|(18652,[6,364,687...|        0.5035088|
|http://k.img.com....|(18652,[0,1,2,3,4...|(18652,[3,6,364,1...|       0.65251666|
|htt

In [0]:
similarity_df.orderBy(desc("cosine_similarity")).show()

+--------------------+--------------------+--------------------+-----------------+
|                  Id|       body_features|      title_features|cosine_similarity|
+--------------------+--------------------+--------------------+-----------------+
|http://k.img.com....|(18652,[1,2,6,7,8...|       (18652,[],[])|              NaN|
|http://k.img.com....|(18652,[0,3,5,30,...|       (18652,[],[])|              NaN|
|http://k.img.com....|(18652,[0,1,4,5,8...|       (18652,[],[])|              NaN|
|http://k.img.com....|(18652,[3,4,5,603...|(18652,[3,5,100,1...|       0.87996995|
|http://k.img.com....|(18652,[0,1,2,3,4...|(18652,[2,3,4,12,...|       0.87368435|
|http://k.img.com....|(18652,[0,1,2,3,4...|(18652,[2,3,4,12,...|       0.87368435|
|http://k.img.com....|(18652,[0,1,2,3,4...|(18652,[2,3,4,12,...|       0.87368435|
|http://k.img.com....|(18652,[0,1,2,3,4...|(18652,[2,3,4,12,...|       0.87368435|
|http://k.img.com....|(18652,[3,4,1633,...|(18652,[3,6,1104,...|        0.8695387|
|htt

In [0]:
threshold = 0.7


cross_df = similarity_df.alias("df1").crossJoin(similarity_df.alias("df2"))
filtered_df = cross_df.filter(
    (col("df1.cosine_similarity") > threshold) & 
    (col("df2.cosine_similarity") > threshold) &
    (col("df1.Id") != col("df2.Id"))
)

duplicate_count = filtered_df.count()
print(f"Кількість дублів: {duplicate_count}")

Кількість дублів: 450
