In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, desc
import pymongo
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Build the Spark session. The MongoDB Spark connector is pulled in through
# spark.jars.packages, and the default read URI points at the github_issues
# database.
spark = (
    SparkSession.builder
    .appName("MongoDB_Analysis")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:10.4.1")
    .config("spark.mongodb.read.connection.uri", "mongodb://mongodb:27017/github_issues")
    .getOrCreate()
)

# Everything that can fail (the MongoDB read and every Spark action) runs
# inside try/finally so the session is always stopped, even when a step raises.
try:
    # Read the closed issues from MongoDB.
    # cache(): this DataFrame feeds four separate actions below (show, count
    # and two aggregations); without caching, each action would re-read the
    # collection from MongoDB.
    df_issues = (
        spark.read.format("mongodb")
        .option("database", "github_issues")
        .option("collection", "closed_issues")
        .load()
        .cache()
    )

    # Show a preview of the data.
    df_issues.show(5)

    # Count the total number of issues.
    issue_count = df_issues.count()
    print(f"Nombre total d'issues: {issue_count}")

    # Number of issues per language, most frequent first.
    # count("issue_id") only counts rows whose issue_id is non-null.
    df_language_count = (
        df_issues.groupBy("language")
        .agg(count("issue_id").alias("count"))
        .orderBy(desc("count"))
    )
    df_language_count.show()

    # Average value of the "duration" column per language, longest first.
    df_duration_avg = (
        df_issues.groupBy("language")
        .agg(avg("duration").alias("avg_duration"))
        .orderBy(desc("avg_duration"))
    )
    df_duration_avg.show()
finally:
    # Stop the Spark session and release its resources.
    spark.stop()


:: loading settings :: url = jar:file:/opt/bitnami/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-29908b01-3a2f-4fa4-9cd0-47c12d616dde;1.0
	confs: [default]
	found org.mongodb.spark#mongo-spark-connector_2.12;10.4.1 in central
	found org.mongodb#mongodb-driver-sync;5.1.4 in central
	[5.1.4] org.mongodb#mongodb-driver-sync;[5.1.1,5.1.99)
	found org.mongodb#bson;5.1.4 in central
	found org.mongodb#mongodb-driver-core;5.1.4 in central
	found org.mongodb#bson-record-codec;5.1.4 in central
downloading https://repo1.maven.org/maven2/org/mongodb/spark/mongo-spark-connector_2.12/10.4.1/mongo-spark-connector_2.12-10.4.1.jar ...
	[SUCCESSFUL ] org.mongodb.spark#mongo-spark-connector_2.12;10.4.1!mongo-spark-connector_2.12.jar (150ms)
downloading https://repo1.maven.org/maven2/org/mongodb/mongodb-driver-sync/5.1.4/mongodb-driver-sync-5.1.4.jar ...
	[SUCCESSFUL ] org.

+--------------------+--------------------+-------------------+-------------------+--------+----------+--------+-----+------+--------------------+
|                 _id|                body|          closed_at|         created_at|duration|  issue_id|language|stars| state|               title|
+--------------------+--------------------+-------------------+-------------------+--------+----------+--------+-----+------+--------------------+
|67b24e4fd9191f330...|libpcap returned ...|2023-02-08 00:00:00|2023-02-06 00:00:00|       1|1573263352|    Rust|21527|closed|Errors occured wh...|
|67b24e4fd9191f330...|thread 'thread_wr...|2023-02-25 00:00:00|2023-01-05 00:00:00|       8|1520037594|    Rust|21527|closed|thread 'thread_wr...|
|67b24e4fd9191f330...|installing with n...|2023-05-24 00:00:00|2022-12-31 00:00:00|       5|1515053158|    Rust|21527|closed|unable to install...|
|67b24e4fd9191f330...|"Open full report...|2023-02-08 00:00:00|2022-12-22 00:00:00|       1|1507660835|    Rust|21527|

                                                                                

Nombre total d'issues: 63126


                                                                                

+----------------+-----+
|        language|count|
+----------------+-----+
|      JavaScript|28369|
|               C|22401|
|      TypeScript| 3806|
|            Dart| 1785|
|            Rust| 1749|
|             MDX| 1577|
|          Svelte| 1174|
|           Swift|  573|
|             Lua|  521|
|            Java|  470|
|Jupyter Notebook|  388|
|          Python|  235|
|                |   78|
+----------------+-----+

+----------------+------------------+
|        language|      avg_duration|
+----------------+------------------+
|                | 4.602564102564102|
|             MDX| 4.585922637920102|
|            Dart| 4.572549019607843|
|          Python|4.5574468085106385|
|            Rust| 4.543739279588336|
|          Svelte| 4.541737649063032|
|      TypeScript|  4.51602732527588|
|      JavaScript| 4.512108287214918|
|Jupyter Notebook| 4.494845360824742|
|               C|4.4901120485692605|
|            Java| 4.446808510638298|
|             Lua| 4.357005758157389|
|   