In [41]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

In [42]:
# Step 1: create (or reuse) a SparkSession bound to the local machine.
spark = (
    SparkSession.builder
    .appName("Read CSV Example")
    .config("spark.driver.host", "localhost")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)

# Step 2: load the training and test CSV files.
# Both files carry a header row; column types are inferred by Spark.
file_path_train = "./combined_data_2018_2023.csv"  # replace with your file path
file_path_test = "./final_data_prep_2024.csv"  # replace with your file path

df_train = spark.read.csv(file_path_train, header=True, inferSchema=True)
df_test = spark.read.csv(file_path_test, header=True, inferSchema=True)

# Preview the first five rows of each dataset.
for preview_title, preview_frame in (("Train", df_train), ("Test", df_test)):
    print(preview_title)
    preview_frame.show(5)

Train
+----+-------+--------------+-------------------+
|year|country|article_amount|subject_area_abbrev|
+----+-------+--------------+-------------------+
|2023|  Spain|          68.0|               ECON|
|2023|  Spain|          68.0|               ECON|
|2023|  India|         190.0|               VETE|
|2023|  India|         190.0|               VETE|
|2023|  India|         190.0|               VETE|
+----+-------+--------------+-------------------+
only showing top 5 rows

Test
+----+-------+--------------+-------------------+
|year|country|article_amount|subject_area_abbrev|
+----+-------+--------------+-------------------+
|2024|   Iran|          50.0|               AGRI|
|2024|   Iran|          50.0|               AGRI|
|2024|   Iran|          50.0|               AGRI|
|2024|   Iran|          50.0|               AGRI|
|2024|   Iran|          50.0|               AGRI|
+----+-------+--------------+-------------------+
only showing top 5 rows



In [43]:
# Report how many distinct values each training column holds.
# Each column is counted separately, so this runs one Spark job per column.
print("2018-2023 : ")
for col_name in df_train.columns:
    distinct_rows = df_train.select(col_name).distinct()
    n_distinct = distinct_rows.count()
    print(f"Unique values in column '{col_name}': {n_distinct}")

2018-2023 : 
Unique values in column 'year': 6
Unique values in column 'country': 175
Unique values in column 'article_amount': 467
Unique values in column 'subject_area_abbrev': 27


In [44]:
# Report how many distinct values each test column holds.
# Each column is counted separately, so this runs one Spark job per column.
print("2024 :")
for col_name in df_test.columns:
    distinct_rows = df_test.select(col_name).distinct()
    n_distinct = distinct_rows.count()
    print(f"Unique values in column '{col_name}': {n_distinct}")

2024 :
Unique values in column 'year': 1
Unique values in column 'country': 18
Unique values in column 'article_amount': 83
Unique values in column 'subject_area_abbrev': 27


In [45]:
# Country classification pipeline.
#
# Trains a random forest on the 2018-2023 data to predict `country` from
# `year`, `article_amount` and `subject_area_abbrev`, then scores it on the
# 2024 data. Relies on `file_path_train` / `file_path_test` being defined by
# an earlier cell.
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Fixed seed so the random forest (bootstrap sampling / feature subsetting)
# yields the same model and metrics on every run.
RF_SEED = 42

# Initialize Spark session (getOrCreate returns the already-running session if there is one)
spark = SparkSession.builder.appName("DataSciencePipeline").getOrCreate()

try:
    # Read training and testing datasets without dropping duplicates
    df_train = spark.read.csv(file_path_train, header=True, inferSchema=True)
    df_test = spark.read.csv(file_path_test, header=True, inferSchema=True)
except Exception as e:
    # Release the session before propagating so a failed load does not leave it running.
    print(f"Error loading files: {e}")
    spark.stop()
    raise

# Print schemas to confirm structure
print("Training Data Schema:")
df_train.printSchema()
print("Testing Data Schema:")
df_test.printSchema()

# Ensure critical columns are present
required_columns = ["year", "article_amount", "subject_area_abbrev", "country"]
missing_columns_train = [name for name in required_columns if name not in df_train.columns]
missing_columns_test = [name for name in required_columns if name not in df_test.columns]

if missing_columns_train or missing_columns_test:
    raise ValueError(f"Missing required columns. Train: {missing_columns_train}, Test: {missing_columns_test}")

# Handle missing values: drop any row with a null in a required column
df_train = df_train.dropna(subset=required_columns)
df_test = df_test.dropna(subset=required_columns)

# Normalize country column (lower-case, trimmed) to avoid mismatches between the two files
df_train = df_train.withColumn("country", F.trim(F.lower(F.col("country"))))
df_test = df_test.withColumn("country", F.trim(F.lower(F.col("country"))))

# Filter datasets to have only common countries between train and test,
# so every test label is one the classifier has seen during training.
common_countries = df_train.select("country").distinct().intersect(df_test.select("country").distinct())
df_train = df_train.join(common_countries, on="country", how="inner")
df_test = df_test.join(common_countries, on="country", how="inner")

# Handle categorical features.
# handleInvalid="skip" drops rows whose category was not seen when the indexer was fitted.
subject_area_indexer = StringIndexer(inputCol="subject_area_abbrev", outputCol="subject_area_indexed", handleInvalid="skip")
country_indexer = StringIndexer(inputCol="country", outputCol="country_indexed", handleInvalid="skip")

# Assemble feature columns
feature_columns = ["year", "article_amount", "subject_area_indexed"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# RandomForestClassifier setup (seeded for reproducible results)
rf = RandomForestClassifier(
    labelCol="country_indexed",
    featuresCol="features",
    probabilityCol="probability",
    seed=RF_SEED,
)

# Create the pipeline
pipeline = Pipeline(stages=[subject_area_indexer, country_indexer, assembler, rf])

# Train the model
model = pipeline.fit(df_train)

# Make predictions. Cached because the result is consumed by show() and by
# four separate evaluator passes below; without caching each one would
# re-run the whole pipeline over the test set.
df_test_predictions = model.transform(df_test).cache()

# Display predictions
df_test_predictions.select(
    "country", "article_amount", "subject_area_abbrev", "features",
    "country_indexed", "prediction", "probability"
).show(truncate=False)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="country_indexed", predictionCol="prediction")

# Calculate accuracy and F1 score
accuracy = evaluator.evaluate(df_test_predictions, {evaluator.metricName: "accuracy"})
print(f"Model Accuracy: {accuracy:.2f}")

f1_score = evaluator.evaluate(df_test_predictions, {evaluator.metricName: "f1"})
print(f"Model F1 Score: {f1_score:.2f}")

# Evaluate precision and recall.
# Explicit display labels: str.capitalize() would lower-case everything after
# the first letter and print "Weightedprecision" / "Weightedrecall".
weighted_metrics = [
    ("weightedPrecision", "Weighted Precision"),
    ("weightedRecall", "Weighted Recall"),
]
for metric, metric_label in weighted_metrics:
    score = evaluator.evaluate(df_test_predictions, {evaluator.metricName: metric})
    print(f"{metric_label}: {score:.2f}")

# Stop the Spark session (also releases the cached predictions).
# NOTE(review): any cell run after this one must create a new session first.
spark.stop()


Training Data Schema:
root
 |-- year: integer (nullable = true)
 |-- country: string (nullable = true)
 |-- article_amount: double (nullable = true)
 |-- subject_area_abbrev: string (nullable = true)

Testing Data Schema:
root
 |-- year: integer (nullable = true)
 |-- country: string (nullable = true)
 |-- article_amount: double (nullable = true)
 |-- subject_area_abbrev: string (nullable = true)

+---------+--------------+-------------------+------------------+---------------+----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|country  |article_amount|subject_area_abbrev|features          |country_indexed|prediction|probability                                    