# Datathon

## Variables

In [None]:
# Tunable parameters for the whole notebook.
NUMBER_OF_ROWS         =     1000   # Number of joined rows sampled for processing (lower = faster execution)
MIN_SIMILARITY_COEF    =     0.99   # Minimum cosine similarity (between 0 and 1) for two pages to be linked by an edge

## Libraries

In [None]:
# Third-party packages used below: plotting, French sentiment analysis, NER and graph rendering.
# NOTE(review): versions are not pinned, so a re-run may pull different releases — consider pinning.
! pip install plotly
! pip install textblob_fr
! pip install spacy
! pip install graphviz

Collecting textblob_fr
  Downloading textblob_fr-0.2.0-py2.py3-none-any.whl (561 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m561.2/561.2 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: textblob_fr
Successfully installed textblob_fr-0.2.0


## Clone le répo git

[Repo Git setup SPARK](https://gist.github.com/javieraespinosa/0e30bccba30caf24042b7b189e4b4c36)

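**NOTE:** Using Java 8 instead of 11 to be able to use sparknlp. (The setup cell below currently sets `JAVA_VERSION = "11"`; change it to `"8"` if sparknlp is needed.)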

In [None]:
import os

# Environment bootstrap: installs a JDK, PySpark, the Archives Unleashed Toolkit (AUT)
# and GraphFrames, then starts a local Spark session with those artifacts attached.

#------------------------------------------
# Versions
#------------------------------------------
SPARK_VERSION  = "3.0.0"
# NOTE(review): the markdown note above this cell mentions Java 8, but 11 is what gets installed — confirm.
JAVA_VERSION   = "11"
AUT_VERSION    = "0.91.0"

GRAPHFRAME_VERSION = "0.8.2"
GRAPHFRAME_SCALA_VERSION = "2.12"

#------------------------------------------
# Folders
#------------------------------------------
# Every downloaded jar/zip is moved into <cwd>/apps and picked up from there at Spark init.
APPS_HOME = "apps"
APPS_HOME = os.path.join(os.getcwd(), APPS_HOME)
!mkdir -p "$APPS_HOME"
!rm -rf sample_data   #remove colab default folder


#------------------------------------------
# JAVA JDK
#------------------------------------------
!sudo apt-get update
!sudo apt-get install -y openjdk-"$JAVA_VERSION"-jdk-headless
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-{}-openjdk-amd64".format(JAVA_VERSION)


#------------------------------------------
# SPARK
#------------------------------------------
!pip install "pyspark==$SPARK_VERSION" findspark
# The shell call prints pyspark.__path__; the first captured line is sliced with [2:-2]
# to strip two characters on each side (presumably the "['" and "']" of a one-element list repr).
SPARK_HOME = !python -c "import pyspark as _; print(_.__path__)"
SPARK_HOME = SPARK_HOME[0][2:-2]
os.environ["SPARK_HOME"] = SPARK_HOME


#------------------------------------------
# ARCHIVES UNLEASHED TOOLKIT
#------------------------------------------
# Both the Python package (zip) and the fat jar are downloaded, then moved to APPS_HOME.
!wget https://github.com/archivesunleashed/aut/releases/download/aut-"$AUT_VERSION"/aut-"$AUT_VERSION".zip
!wget https://github.com/archivesunleashed/aut/releases/download/aut-"$AUT_VERSION"/aut-"$AUT_VERSION"-fatjar.jar
!mv aut-* "$APPS_HOME"


#------------------------------------------
# GRAPHFRAME lib
#------------------------------------------
# SPARK_VERSION[:-2] drops the last two characters ("3.0.0" -> "3.0"),
# giving an artifact id such as "0.8.2-spark3.0-s_2.12".
GRAPHFRAME_SPARK_VERSION = "{}-spark{}-s_{}".format(GRAPHFRAME_VERSION, SPARK_VERSION[:-2], GRAPHFRAME_SCALA_VERSION)

# The "graphframes" folder is extracted from the jar and re-packed as a zip so that it is
# found by the "*.zip" search below and passed to Spark through --py-files.
!wget https://repos.spark-packages.org/graphframes/graphframes/"$GRAPHFRAME_SPARK_VERSION"/graphframes-"$GRAPHFRAME_SPARK_VERSION".jar
!jar -xf   graphframes-"$GRAPHFRAME_SPARK_VERSION".jar graphframes
!zip -q -r graphframes-"$GRAPHFRAME_SPARK_VERSION".zip graphframes
!rm -r graphframes
!mv graphframes-* "$APPS_HOME"


#------------------------------------------
# SPARK init
#------------------------------------------
import findspark

SPARK_DRIVER_MEMORY   = "8g"

# Collect every jar / zip sitting directly in APPS_HOME (no recursion).
JARS     = !find "$APPS_HOME" -maxdepth 1 -iname "*.jar"
PY_FILES = !find "$APPS_HOME" -maxdepth 1 -iname "*.zip"

# Must be set before the Spark session is created below.
os.environ['PYSPARK_SUBMIT_ARGS'] = "--driver-memory {} --jars {} --py-files {} pyspark-shell".format(
    SPARK_DRIVER_MEMORY,
    ",".join(JARS),
    ",".join(PY_FILES)
)

findspark.init()


#------------------------------------------
# SPARK session
#------------------------------------------
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

spark = SparkSession.builder.master("local[*]")\
                                  .getOrCreate()

# Backward compatibility with AUT toolkit
sqlContext = SQLContext(spark.sparkContext, sparkSession=spark)
# NOTE(review): `sc` is bound to the SparkSession itself, not to spark.sparkContext;
# it is later passed as the first argument of WebArchive — confirm this is intended.
sc = spark

0% [Working]            Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
0% [Connecting to archive.ubuntu.com] [Waiting for headers] [Connected to cloud0% [Connecting to archive.ubuntu.com] [Waiting for headers] [Connected to cloud                                                                               Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [632 kB]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:7 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [47.6 kB]
Hit:8 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Get:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [109 kB]
Hit

### Installing external libs



*   plotly pour l'affichage
*   textblob_fr pour l'analyse de sentiment de mots français


### Importer les librairies

In [None]:
import nltk
nltk.download("stopwords")
from nltk.corpus import words

from tkinter.constants import SEPARATOR
import plotly.express as px
import pandas as pd
from textblob_fr import PatternTagger, PatternAnalyzer
from textblob import TextBlob

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
! mkdir dictionnary
! wget https://openlexicon.fr/datasets-info/Liste-de-mots-francais-Gutenberg/liste.de.mots.francais.frgut.txt

--2023-11-30 21:15:22--  https://openlexicon.fr/datasets-info/Liste-de-mots-francais-Gutenberg/liste.de.mots.francais.frgut.txt
Resolving openlexicon.fr (openlexicon.fr)... 185.199.108.153
Connecting to openlexicon.fr (openlexicon.fr)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4058504 (3.9M) [text/plain]
Saving to: ‘liste.de.mots.francais.frgut.txt’


2023-11-30 21:15:23 (54.2 MB/s) - ‘liste.de.mots.francais.frgut.txt’ saved [4058504/4058504]



In [None]:
# Load the French word list: one entry per line, surrounding whitespace stripped.
# The encoding is passed explicitly so accented words are decoded the same way whatever
# the platform's default locale is (NOTE(review): UTF-8 assumed — confirm against the file).
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("liste.de.mots.francais.frgut.txt", 'r', encoding="utf-8") as f:
    french_words = [line.strip() for line in f]

## Initialisation de SPARK

In [None]:
spark

In [None]:
! python3 -m spacy download fr_core_news_sm

2023-11-30 21:15:31.953122: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-30 21:15:31.953204: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-30 21:15:31.953257: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-30 21:15:31.982119: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting fr-core-news-sm==3.6.0
  Downloading ht

In [None]:
from pyspark.ml.feature import StopWordsRemover, Tokenizer, RegexTokenizer,IDF,Word2Vec,CountVectorizer
from pyspark.sql.types import ArrayType, StringType, FloatType, IntegerType
from pyspark.ml.linalg import Vectors
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.ml import Pipeline, Transformer
from pyspark.sql.functions import desc, col, udf, concat_ws, rand
from pyspark.sql.types import StringType
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.ml.clustering import LDA
from collections import Counter

import spacy
from spacy import displacy
nlp = spacy.load("fr_core_news_sm")

import numpy as np
import graphviz

### Fonctions de traitement

In [None]:
def wordInDict(text) -> str:
  """Keep only the words of ``text`` that appear in ``french_words``.

  Args:
    text: whitespace-separated words; ``None`` or an empty string yields ``""``.

  Returns:
    The retained words in their original order, each followed by one space
    (so a non-empty result ends with a trailing space), or ``""`` if none match.
  """
  if not text:
    return ""

  # Membership tests on a list scan it linearly for every word; hashing the
  # dictionary once per call makes each lookup constant-time.
  if isinstance(french_words, (set, frozenset)):
    vocabulary = french_words
  else:
    vocabulary = frozenset(french_words)

  return "".join(mot + " " for mot in text.split() if mot in vocabulary)

def analyze_sentiment(text):
    """Return the first component of the TextBlob sentiment of ``text``.

    The blob is built with the French ``PatternTagger`` and ``PatternAnalyzer``
    (presumably the first component is the polarity score — verify against textblob_fr).
    """
    french_blob = TextBlob(
        text,
        pos_tagger=PatternTagger(),
        analyzer=PatternAnalyzer(),
    )
    first_component = french_blob.sentiment[0]
    return first_component

def remove_duplicates(text):
    """Return ``text`` with repeated words removed, keeping the first occurrence of each.

    Words are split on whitespace and re-joined with single spaces. Unlike a plain
    ``set``, ``dict.fromkeys`` preserves insertion order, so the result is
    deterministic across runs and the remaining words stay in reading order.
    """
    unique_words = dict.fromkeys(text.split())
    return ' '.join(unique_words)

def categoriseText(text) -> list:
  """List the entities found by ``nlp`` in the de-duplicated ``text``.

  Each entity is rendered as the string "<entity text> - <entity label>".
  """
  parsed = nlp(remove_duplicates(text))
  labelled_entities = []
  for entity in parsed.ents:
    labelled_entities.append(str(entity.text + " - " + entity.label_))
  return labelled_entities

def cosine_similarity(v1, v2):
    """Return the cosine of the angle between ``v1`` and ``v2`` as a Python float.

    Args:
        v1, v2: array-likes of equal length accepted by ``np.dot`` / ``np.linalg.norm``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either vector has zero
        norm (the ratio is undefined there and would otherwise come out as NaN).
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        return 0.0
    return float(np.dot(v1, v2) / norm_product)

# Helper (wrapped as a UDF further down) returning the most frequent element of a list
def most_frequent(wordList):
  """Return the most common item of ``wordList``, or the string "null" when it is empty or None."""
  if not wordList:
    return "null"
  top_word, _occurrences = Counter(wordList).most_common(1)[0]
  return top_word

# Helper (wrapped as a UDF further down) mapping a sentiment score to a Graphviz colour string
def convertToColor(inputString):
  """Convert a sentiment score into a Graphviz "H S V" colour string.

  Args:
    inputString: the score, as a number or a string parseable by ``float``.

  Returns:
    ``"<1 - s> <s> 1.000"`` with two decimals, where ``s`` is the score shifted
    by +0.5 and clamped to [0, 1].
  """
  shifted = float(inputString) + 0.5
  # Graphviz requires each HSV component to lie in [0, 1]; scores outside
  # [-0.5, 0.5] would otherwise produce an invalid colour.
  saturation = min(1.0, max(0.0, shifted))
  hue = 1 - saturation
  return "{color1:.2f} {color2:.2f} 1.000".format(color1=hue, color2=saturation)

def getURLDomain(url):
    """Return the host part of a "scheme://host/..." URL.

    Args:
        url: the URL string, or ``None``.

    Returns:
        The text between the "//" and the next "/", or ``None`` when ``url`` is
        empty/None or has no such part (instead of raising inside the UDF).
    """
    if not url:
        return None
    # "scheme://host/path" splits into ["scheme:", "", "host", "path", ...]
    url_parts = url.split('/')
    if len(url_parts) < 3:
        return None
    return url_parts[2]

def getListLen(inputList):
  """Return ``len(inputList)``, or 0 when the value is ``None`` (a null Spark column)."""
  if inputList is None:
    return 0
  return len(inputList)

# Spark UDF wrappers around the helper functions defined above.
count_words_udf = udf(getListLen, IntegerType())

convert_to_color = udf(convertToColor, StringType())
get_url_domain_name = udf(getURLDomain, StringType())
most_frequent_udf = udf(most_frequent, StringType())
cosine_similarity_udf = udf(cosine_similarity, FloatType())
# NOTE(review): analyze_sentiment returns blob.sentiment[0], yet the UDF is declared as
# StringType; a later cell converts the column back to numbers with pd.to_numeric.
sentiment_udf = udf(analyze_sentiment, StringType())
word_in_dict_udf = udf(wordInDict, StringType())
# NOTE(review): wraps wordInDict (which returns a str) but declares ArrayType(StringType());
# it is not used elsewhere in the visible notebook — confirm which function was intended.
cat_url_udf = udf(wordInDict, ArrayType(StringType()))
categories_text = udf(categoriseText,ArrayType(StringType()))

### Category functions

In [None]:
# Champ lexical : Musique
lexique_musique = ['mélodie', 'rythme', 'partition', 'harmonie', 'accord', 'instrument', 'symphonie', 'sonate', 'crescendo', 'soliste', 'orchestre', 'tempo', 'harmonica', 'improvisation', 'solfège', 'compositrice', 'cadence', 'concerto']

# Champ lexical : Sport
lexique_sport = ['athlète', 'compétition', 'score', 'entraînement', 'équipe', 'victoire', 'défaite', 'entraîneur', 'athlétisme', 'stratégie', 'arbitre', 'record', 'médaille', 'endurance', 'fair-play', 'sélection', 'dopage', 'marathon']

# Champ lexical : Politique
lexique_politique = ['politicien', 'élection', 'gouvernement', 'démocratie', 'législation', 'diplomatie', 'citoyenneté', 'débat', 'parti politique', 'constitution', 'lobbying', 'réforme', 'manifestation', 'coalition', 'discours', 'droits de l\'homme']

# Champ lexical : Gastronomie
lexique_gastronomie = ['cuisine', 'recette', 'ingrédient', 'saveur', 'gourmet', 'chef', 'dégustation', 'épices', 'plat', 'gastronome', 'cuisinier', 'menu', 'assaisonnement', 'gourmandise', 'appétit']

# Champ lexical : Voyage
lexique_voyage = ['destination', 'aventure', 'exploration', 'itinéraire', 'voyageur', 'excursion', 'découverte', 'routard', 'paysage', 'culture', 'tourisme', 'séjour', 'évasion', 'hébergement', 'guide', 'aventureux']
# Lexique Société
lexique_societe = ['société', 'communauté', 'citoyen', 'norme', 'coutume', 'valeur', 'tradition', 'éthique', 'diversité', 'inclusion', 'minorité', 'majorité', 'identité', 'citoyenneté', 'engagement', 'solidarité', 'harmonie', 'justice', 'responsabilité', 'participation']

# Lexique Nature
lexique_nature = ['nature', 'environnement', 'écologie', 'biodiversité', 'écosystème', 'climat', 'faune', 'flore', 'forêt', 'océan', 'montagne', 'développement durable', 'conservation', 'énergie renouvelable', 'pollution', 'réchauffement climatique', 'sustainability', 'recyclage', 'agriculture durable']

# Lexique Technologie
lexique_technologie = ['technologie', 'innovation', 'numérique', 'intelligence artificielle', 'robotique', 'internet', 'programmation', 'algorithmes', 'cybersécurité', 'réseaux sociaux', 'télécommunications', 'géolocalisation', 'biotechnologie', 'nanotechnologie', 'realité virtuelle', 'cloud computing', 'smartphones', 'automatisation', 'drones', 'impression 3D']

# Lexique Culture
lexique_culture = ['culture', 'tradition', 'coutume', 'art', 'musée', 'littérature', 'cinéma', 'théâtre', 'danse', 'musique', 'cuisine', 'religion', 'festivals', 'cérémonie', 'langue', 'folklore', 'patrimoine', 'identité culturelle', 'diversité culturelle']

# Lexique Économie
lexique_economie = ['économie', 'commerce', 'entreprise', 'marché', 'produit intérieur brut (PIB)', 'investissement', 'commerce international', 'consommation', 'finance', 'banque', 'monnaie', 'pauvreté', 'chômage', 'croissance économique', 'développement économique', 'inflation', 'dette', 'budget', 'fiscalité']

# Lexique Science
lexique_science = ['science', 'recherche', 'scientifique', 'expérience', 'hypothèse', 'théorie', 'méthode scientifique', 'biologie', 'physique', 'chimie', 'astronomie', 'géologie', 'mathématiques', 'informatique', 'médecine', 'psychologie', 'écologie', 'génétique', 'évolution']

# Lexique Relations
lexique_relations = ['relations', 'communication', 'interpersonnel', 'confiance', 'respect', 'coopération', 'diplomatie', 'négociation', 'collaboration', 'compromis', 'médiation', 'tolérance', 'intimité', 'amitié', 'amour', 'conflit', 'stress', 'émotion', 'bienveillance', 'éthique relationnelle']

# Lexique Histoire
lexique_histoire = ['histoire', 'événement', 'période', 'révolution', 'guerre', 'paix', 'civilisation', 'archéologie', 'époque', 'chronologie', 'historien', 'patrimoine', 'document', 'archive', 'mémoire collective', 'colonisation', 'indépendance', 'héritage', 'histoire mondiale']

# Lexique Environnement
lexique_environnement = ['environnement', 'écologie', 'biodiversité', 'écosystème', 'climat', 'faune', 'flore', 'forêt', 'océan', 'montagne', 'développement durable', 'conservation', 'énergie renouvelable', 'pollution', 'réchauffement climatique', 'sustainability', 'recyclage', 'agriculture durable', 'éco-responsabilité']

# Lexique Politique (déjà défini)

# Lexique Éducation
lexique_education = ['éducation', 'école', 'enseignement', 'professeur', 'élève', 'apprentissage', 'connaissance', 'cours', 'matière', 'éducation supérieure', 'diplôme', 'examen', 'réussite', 'échec', 'pédagogie', 'salle de classe', 'éducatif', 'compétence', 'formation']

# Lexique Santé
lexique_sante = ['santé', 'bien-être', 'maladie', 'médecine', 'système de santé', 'médecin', 'infirmière', 'prévention', 'traitement', 'nutrition', 'hygiène', 'activité physique', 'psychologie', 'santé mentale', 'maladie chronique', 'pharmacie', 'vaccination', 'santé publique', 'accès aux soins']

# Lexique Art
lexique_art = ['art', 'œuvre', 'artiste', 'créativité', 'expression', 'beauté', 'culture artistique', 'peinture', 'sculpture', 'photographie', 'architecture', 'musée', 'cinéma', 'théâtre', 'danse', 'musique', 'design', 'arts visuels', 'performances']


# Lexique Innovation
lexique_innovation = ['innovation', 'technologie', 'créativité', 'invention', 'recherche', 'développement', 'progrès', 'start-up', 'entrepreneuriat', 'idée novatrice', 'disruption', 'avancée', 'nouveau', 'efficacité', 'solutions innovantes', 'changements', 'inventeur', 'scientifique', 'révolution']

# Lexique Éthique
lexique_ethique = ['éthique', 'morale', 'valeurs', 'intégrité', 'responsabilité', 'respect', 'droits', 'justice', 'équité', 'principes', 'décision éthique', 'conscience', 'comportement éthique', 'bienséance', 'normes morales', 'devoir', 'honnêteté', 'dignité', 'transparence']

# Liste des thèmes et leurs mots respectifs
themes = {
    'Musique': lexique_musique,
    'Sport': lexique_sport,
    'Politique': lexique_politique,
    'Gastronomie': lexique_gastronomie,
    'Voyage': lexique_voyage,
    'Nature': lexique_nature,
    'Technologie': lexique_technologie,
    'Culture': lexique_culture,
    'Economie': lexique_economie,
    'Science': lexique_science,
    'Relations': lexique_relations,
    'Histoire': lexique_histoire,
    'Environnement': lexique_environnement,
    'Education': lexique_education,
    'Sante': lexique_sante,
    'Art': lexique_art,
    'Innovation': lexique_innovation,
    'Ethique': lexique_ethique
}

# Theme names to look for, in the order of the counts returned by countTheme.
# Derived from `themes` so that each theme appears exactly once (the hand-written list
# contained 'Politique' twice, which duplicated its count) and the two cannot drift apart.
keywords = list(themes)

def countTheme(Row):
  """Count, for every theme in ``keywords``, how many words of ``Row`` belong to its lexicon.

  Args:
    Row: iterable of words (repeated words count each time); ``None`` is treated as empty.

  Returns:
    A list of integers, one per entry of ``keywords`` and in the same order.
  """
  # Tally the words once, then look each lexicon entry up in the tally. set() keeps a
  # word that is repeated inside a lexicon from being counted more than once.
  word_counts = Counter(Row or ())
  return [
    sum(word_counts[word] for word in set(themes[theme]))
    for theme in keywords
  ]

count_theme_udf = udf(countTheme, ArrayType(IntegerType()))

### Import du WARC

In [None]:
%%capture
# Copy the WARC collection from the gs://cpe-lyon/LIFRANUM bucket into a local folder.
# Only "repo-ecritures-num" is fetched; the other sub-collections are left commented out.
DIR="LIFRANUM"
!mkdir -p $DIR

#!gsutil -m cp -r gs://cpe-lyon/LIFRANUM/autre $DIR
#!gsutil -m cp -r gs://cpe-lyon/LIFRANUM/cartoweb $DIR
#!gsutil -m cp -r gs://cpe-lyon/LIFRANUM/lifranum-method $DIR
!gsutil -m cp -r gs://cpe-lyon/LIFRANUM/repo-ecritures-num $DIR

In [None]:
# Star import from the `aut` package. The cells below use WebArchive, remove_html and
# remove_http_header, which are not imported anywhere else in the visible notebook
# (presumably they come from here — TODO confirm and import them explicitly).
from aut import *

# Glob matching the WARC files of the downloaded collection.
WARCs_path = "LIFRANUM/repo-ecritures-num/*.warc*"

## Récupération des données

### Métadonnées

In [None]:
# All archive records that were served with HTTP 200; "content" and "url" are renamed
# so they do not clash with the same-named columns of the web-pages table in the join.
df_all = (
    WebArchive(sc, sqlContext, WARCs_path)
    .all()
    .filter(col('http_status_code') == 200)
    .withColumnRenamed('content', 'content2')
    .withColumnRenamed('url', 'url2')
)

### Contenu de la page

In [None]:
# Web pages of the archive whose detected language is French, cached for reuse.
df_webpages = (
    WebArchive(sc, sqlContext, WARCs_path)
    .webpages()
    .filter(col('language') == 'fr')
)
df_webpages.cache()

DataFrame[crawl_date: string, url: string, mime_type_web_server: string, mime_type_tika: string, language: string, content: string]

### Jointure sur l'url des deux tables

In [None]:
# Pair each HTTP-200 record with the French web page that has the same URL.
df_joined = df_all.join(df_webpages, df_all['url2'] == df_webpages['url'])

### Limiting the number of rows to accelerate tests

Set **NUMBER_OF_ROWS** at the start to make it faster.

In [None]:
# Shuffle, then keep NUMBER_OF_ROWS rows. A fixed seed is passed to rand() so the random
# column is not re-drawn differently on every run (the sample is repeatable as long as
# the input data and its partitioning do not change).
SAMPLE_SEED = 42
df_joined = df_joined.orderBy(rand(seed=SAMPLE_SEED)).limit(NUMBER_OF_ROWS)

### Adding an ID to facilitate recognising different entities

In [None]:
# Attach a unique numeric id to every row; it is used later as the graph node identifier.
# NOTE(review): monotonically_increasing_id() yields unique, increasing ids that are not
# guaranteed to be consecutive — do not rely on them being 0..N-1.
df_joined = df_joined.withColumn("id", monotonically_increasing_id())

In [None]:
# Strip the HTTP header and the HTML markup from the raw record, reduce the URL to its
# domain name, and keep only the columns used downstream (id, url, text).
df_text = (
    df_joined
    .withColumn("text", remove_html(remove_http_header("content2")))
    .withColumn("domainName", get_url_domain_name(df_joined["url"]))
    .select("id", "domainName", "text")
    .withColumnRenamed("domainName", "url")
)

In [None]:
df_text.cache()

DataFrame[id: bigint, url: string, text: string]

### Filtrage des données

### Ajout des stopword français

In [None]:
# NLTK's French stop words, extended with every single letter (lower and upper case)
# and digit so that one-character tokens are filtered out as well.
stopwordList = nltk.corpus.stopwords.words('french')
stopwordList.extend("abcdefghijklmnopqrstuvwxyzAZERTYUIOPQSDFGHJKLMWXCVBN1234567890")


### Application d'un tokenizer + remover via un pipeline

In [None]:
# Split the text on runs of non-word characters. The (?U) flag enables Unicode character
# classes in the underlying Java regex: without it \W is ASCII-only, so accented letters
# act as separators and French words get cut in pieces ("décembre" -> "d", "cembre").
regTokernizer = RegexTokenizer(inputCol="text", outputCol="words",pattern="(?U)\\W+",gaps=True)
# Drop the stop words (NLTK French list plus single characters) from the tokens.
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words",stopWords=stopwordList)

#word2Vec = Word2Vec(vectorSize=10, inputCol="filtered_words", outputCol="vector")
#cv = CountVectorizer(inputCol="filtered_words", outputCol="raw_features")
#idf = IDF(inputCol="rawFeatures", outputCol="features")

### Création du pipeline et application sur le dataframe

In [None]:
# Two-stage pipeline: tokenise, then remove stop words (the Word2Vec stage is disabled).
pipeline = Pipeline(stages=[regTokernizer,remover])
# word2Vec
model = pipeline.fit(df_text)

# Adds the "words" and "filtered_words" columns to df_text.
df_transformed = model.transform(df_text)

In [None]:
df_transformed.show(10)

+---+--------------------+--------------------+--------------------+--------------------+
| id|                 url|                text|               words|      filtered_words|
+---+--------------------+--------------------+--------------------+--------------------+
|  0|       retailles.com|3f8 décembre | 20...|[3f8, d, cembre, ...|[3f8, cembre, 201...|
|  1|www.lionel-seppol...|3a  17 Vigie, fév...|[3a, 17, vigie, f...|[3a, 17, vigie, v...|
|  2|yvonpare.blogspot...|353e  Littérature...|[353e, litt, ratu...|[353e, litt, ratu...|
|  3|yvonpare.blogspot...|35be  Littérature...|[35be, litt, ratu...|[35be, litt, ratu...|
|  4|www.lionel-seppol...|16  Vigie, octobr...|[16, vigie, octob...|[16, vigie, octob...|
|  5|lebathyscaphe.blo...|424c Le Bathyscap...|[424c, le, bathys...|[424c, bathyscaph...|
|  6|yvonpare.blogspot...|3534  Littérature...|[3534, litt, ratu...|[3534, litt, ratu...|
|  7|yvonpare.blogspot...|352e  Littérature...|[352e, litt, ratu...|[352e, litt, ratu...|
|  8|lesma

### Transformations additionnelles du DF



*   Vérification que les mots sont bien dans le dictionnaire
*   Analyse des catégories potentielles de chaque terme



In [None]:
# DataFrames are immutable: drop() returns a new frame, so its result has to be assigned
# for the bulky "text" and "words" columns to actually be removed. Column names are used
# (rather than df.text / df.words) so that re-running the cell is a harmless no-op.
df_transformed = df_transformed.drop("text", "words")
df_transformed

DataFrame[id: bigint, url: string, filtered_words: array<string>]

1. Nettoyage avec dictionnaire
2. Comptage des mots
3. Assignation à des categories
4. Ajout de sentiments

In [None]:
# 1. Keep only the words found in the French dictionary (result: one space-separated string).
df_transformed = df_transformed.withColumn("clean_text_in_dict",word_in_dict_udf(concat_ws(" ",df_transformed["filtered_words"])))
#df_transformed = df_transformed.withColumn("categories", categories_text(df_transformed["clean_text_in_dict"]))
#df_transformed = df_transformed.withColumn("most_frequent_word", most_frequent_udf(df_transformed["categories"]))
# 2. Count the dictionary words. "clean_text_in_dict" is a string, so applying len() to it
#    directly (what count_words_udf does) counts characters; split it into words first.
count_clean_words_udf = udf(lambda text: len(text.split()) if text else 0, IntegerType())
df_transformed = df_transformed.withColumn("count_clean_words", count_clean_words_udf(df_transformed["clean_text_in_dict"]))
# 3. Per-theme word counts, computed on the stop-word-filtered tokens.
df_transformed = df_transformed.withColumn("categories", count_theme_udf(df_transformed["filtered_words"]))
# 4. Sentiment score of the dictionary-filtered text.
df_transformed = df_transformed.withColumn("sentiment", sentiment_udf(df_transformed["clean_text_in_dict"]))

df_transformed.cache()

DataFrame[id: bigint, url: string, text: string, words: array<string>, filtered_words: array<string>, clean_text_in_dict: string, count_clean_words: int, categories: array<int>, sentiment: string]

In [None]:
df_transformed.show(10, False)

## Export du code vers Pandas pour affichage

In [None]:
df_export = df_transformed.select("id","url","count_clean_words","categories","sentiment")

In [None]:
df_export.cache()
#df_export.show(10,True)

In [None]:
pandasSentiment = df_export.toPandas()

In [None]:
pandasSentiment.to_csv("result.csv")

In [None]:
# The sentiment column arrives from Spark as strings: convert it to numbers (values that
# cannot be parsed become NaN), then keep a copy rounded to 3 decimals for plotting.
pandasSentiment['sentiment'] = pd.to_numeric(pandasSentiment['sentiment'], errors='coerce')
pandasSentiment["roundSentiment"] = pandasSentiment["sentiment"].round(3)

In [None]:
pandasSentiment.head(20)

In [None]:
pandasSentiment.size

In [None]:
pandasSentimentTContent = pandasSentiment[pandasSentiment["roundSentiment"] > 0.15]
pd.set_option('max_colwidth', None)

In [None]:
#  pandasSentimentTContent.head()

In [None]:
# Histogram of the rounded sentiment scores.
fig = px.histogram(pandasSentiment.sort_values(by="roundSentiment",ascending=True),x="roundSentiment")

# Fixed bins of width 0.03 between -0.4 and 0.5.
fig.update_traces(xbins=dict( # bins used for histogram
        start=-0.4,
        end=0.5,
        size=0.03
    ))

fig.show()

## Similarities

### Création du graphe

In [None]:
dot = graphviz.Digraph('similarities', comment='Subject and emotions')

In [None]:
# `df_sentiment` is not defined anywhere in the visible notebook, and the
# "most_frequent_word" column is no longer produced upstream (its withColumn call is
# commented out), so this cell failed on a fresh kernel. Build the node table from
# df_transformed instead.
# NOTE(review): the most frequent word is taken from "filtered_words" — confirm that this
# is the intended source column.
nodes = (
    df_transformed
    .withColumn("most_frequent_word", most_frequent_udf(df_transformed["filtered_words"]))
    .select("id","url","most_frequent_word","sentiment")
)

In [None]:
nodes.dtypes

In [None]:
nodes = nodes.withColumn("color",convert_to_color(nodes["sentiment"]))

In [None]:
nodes.columns

In [None]:
# One filled node per page, labelled "<url> : <most frequent word>" and coloured with the
# sentiment-derived colour string.
for row in nodes.collect():
  dot.node(str(row.id),row.url+" : "+row.most_frequent_word,color=row.color,style='filled',fillcolor=row.color)

In [None]:
# `df_sentiment` and its "vector" column are not produced anywhere in the visible notebook
# (the Word2Vec stage is commented out of the pipeline), so this cell failed on a fresh
# kernel. Fit Word2Vec here, with the parameters of the commented-out stage, to obtain
# one vector per page.
word2vec_model = Word2Vec(vectorSize=10, inputCol="filtered_words", outputCol="vector").fit(df_transformed)
df = word2vec_model.transform(df_transformed).select("id","vector")
# Second copy with renamed columns so the table can be joined with itself.
df2 = df.withColumnRenamed("id","id2").withColumnRenamed("vector","vector2")

In [None]:
# df holds one "vector" per page id. Pair every page with every other page; using "<"
# instead of "!=" keeps each unordered pair once (id < id2) rather than twice, halving the
# similarity computations. The edge-drawing loop only uses pairs with dst > src anyway.
df = df.join(df2, df["id"] < df2["id2"])

In [None]:
df.show(10,True)

In [None]:
# Calculate the cosine similarity between each pair of vectors
df = df.withColumn("similarity", cosine_similarity_udf("vector", "vector2"))

In [None]:
#df.show(10,True)

In [None]:
# Find the most similar vector for each vector
#df = df.groupBy("i.id").agg({"similarity": "max"})

In [None]:
# Edge table: the pairs whose similarity exceeds the configured threshold, with the
# id columns renamed to src / dst.
edges = (
    df.filter(df["similarity"] > MIN_SIMILARITY_COEF)
    .select("id","id2","similarity")
    .withColumnRenamed("id","src")
    .withColumnRenamed("id2","dst")
)

In [None]:
edges.show(10,True)

In [None]:
# Draw one bidirectional edge per similar pair; the dst > src test skips the mirrored
# (dst, src) row so each pair is drawn only once.
for row in edges.collect():
  if(row.dst > row.src):
    dot.edge(str(row.src),str(row.dst),dir="both")
  # ,label=str(row.similarity)

In [None]:
dot

In [None]:
dot.render(filename='g1')