In [1]:
!pip install matplotlib pyspark



In [2]:
!pip install chardet



In [3]:
!pip install arabic-reshaper python-bidi



In [4]:
import os
import pandas as pd
import numpy as np
import chardet
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from pyspark.sql import SparkSession
from sklearn.linear_model import LinearRegression
from matplotlib import font_manager
import arabic_reshaper
from bidi.algorithm import get_display

In [5]:
# Local Spark session that uses every available core and an 8 GB driver heap.
# The former "spark.executor.extraJavaOptions" GC-logging setting was removed:
# it pointed at the placeholder path /path/to/gc.log, and with a local[*]
# master no separate executor JVM is started, so the option never applied.
spark = (
    SparkSession.builder
    .appName("Article Category Trend Analysis")
    .master("local[*]")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

24/10/15 20:30:39 WARN Utils: Your hostname, codespaces-cdb1ad resolves to a loopback address: 127.0.0.1; using 10.0.1.116 instead (on interface eth0)
24/10/15 20:30:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/15 20:30:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Directory (relative to the notebook) that is scanned for *.csv files.
csv_directory = '../Dataset/'

In [7]:
# Start from a clean slate: no files recorded as failed, no rows loaded yet.
failed_files = list()
all_data = pd.DataFrame()

In [8]:
def detect_encoding(file_path, sample_size=10000):
    """Guess the text encoding of a file from its leading bytes.

    Args:
        file_path: Path of the file to inspect.
        sample_size: Number of leading bytes handed to chardet
            (defaults to 10000).

    Returns:
        The encoding name reported by chardet, or None when chardet cannot
        determine one. A detection of "ascii" is returned as "utf-8".
    """
    with open(file_path, 'rb') as f:
        sample = f.read(sample_size)

    encoding = chardet.detect(sample)['encoding']

    # Only a prefix of the file is sampled, so "ascii" merely means that no
    # non-ASCII byte has appeared yet. UTF-8 decodes every ASCII file
    # identically and keeps working if later rows contain other characters.
    if encoding is not None and encoding.lower() == 'ascii':
        return 'utf-8'
    return encoding

In [9]:
# Load every CSV file found in csv_directory into one DataFrame.
# Each file is first read with its detected encoding; if that fails it is
# retried with ISO-8859-1. A file is recorded in failed_files only when both
# attempts fail, and it is recorded once.
loaded_frames = []
failed_files.clear()  # keep the cell idempotent when it is re-run

# Sorted so that the row order does not depend on directory listing order.
for filename in sorted(os.listdir(csv_directory)):
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(csv_directory, filename)
    encoding = detect_encoding(file_path)
    try:
        temp_data = pd.read_csv(file_path, encoding=encoding, on_bad_lines='skip')  # Skip bad lines
        print(f"Loaded {filename} with encoding {encoding}")
    except Exception as e:
        print(f"Failed to load {filename} with detected encoding {encoding}: {e}")
        try:
            temp_data = pd.read_csv(file_path, encoding='ISO-8859-1', on_bad_lines='skip')
            print(f"Successfully loaded {filename} with fallback encoding ISO-8859-1")
        except Exception as fallback_error:
            print(f"Failed to load {filename} with fallback encoding: {fallback_error}")
            failed_files.append(filename)
            continue

    loaded_frames.append(temp_data)

# Concatenate once instead of growing the DataFrame inside the loop.
if loaded_frames:
    all_data = pd.concat(loaded_frames, ignore_index=True)
else:
    all_data = pd.DataFrame()

print("Data loaded:", all_data.shape)

24/10/15 20:30:55 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


Loaded part-00007-5e194d26-a8d2-4362-a3ba-8db470c839a8-c000.csv with encoding utf-8
Loaded part-00010-5e194d26-a8d2-4362-a3ba-8db470c839a8-c000.csv with encoding utf-8
Loaded part-00009-5e194d26-a8d2-4362-a3ba-8db470c839a8-c000.csv with encoding utf-8
Loaded part-00004-5e194d26-a8d2-4362-a3ba-8db470c839a8-c000.csv with encoding utf-8
Loaded part-00000-5e194d26-a8d2-4362-a3ba-8db470c839a8-c000.csv with encoding utf-8
Loaded part-00001-5e194d26-a8d2-4362-a3ba-8db470c839a8-c000.csv with encoding utf-8
Loaded part-00003-5e194d26-a8d2-4362-a3ba-8db470c839a8-c000.csv with encoding utf-8
Loaded part-00006-5e194d26-a8d2-4362-a3ba-8db470c839a8-c000.csv with encoding utf-8
Loaded part-00008-5e194d26-a8d2-4362-a3ba-8db470c839a8-c000.csv with encoding utf-8
Loaded part-00005-5e194d26-a8d2-4362-a3ba-8db470c839a8-c000.csv with encoding utf-8
Loaded part-00002-5e194d26-a8d2-4362-a3ba-8db470c839a8-c000.csv with encoding utf-8
Loaded part-00012-5e194d26-a8d2-4362-a3ba-8db470c839a8-c000.csv with encodin

In [10]:
# Count articles per (year, categories) pair.
# Fail immediately when the input lacks the needed columns; otherwise the
# next cell would only fail later with an undefined `category_counts`.
required_columns = ['crawl_date', 'categories']
missing_columns = [column for column in required_columns if column not in all_data.columns]
if missing_columns:
    raise ValueError(f"Required columns are missing: {missing_columns}")

# Unparseable dates become NaT (errors='coerce'), which yields a NaN year.
all_data['crawl_date'] = pd.to_datetime(all_data['crawl_date'], errors='coerce')
all_data['year'] = all_data['crawl_date'].dt.year

# groupby ignores rows with a missing key; drop them explicitly so the loss
# is reported and `year` can be stored as an integer.
dated_articles = all_data.dropna(subset=['year', 'categories'])
ignored_rows = len(all_data) - len(dated_articles)
if ignored_rows:
    print(f"Ignoring {ignored_rows} rows without a parseable crawl_date or without categories.")

category_counts = (
    dated_articles
    .assign(year=dated_articles['year'].astype(int))
    .groupby(['year', 'categories'])
    .size()
    .reset_index(name='count')
)

In [11]:
# Fit one linear trend (article count vs. year) per category string and sum
# the predicted counts over the forecast horizon.
FORECAST_YEARS = 5  # number of years after the last observed year to predict

if category_counts.empty:
    raise ValueError("category_counts is empty; there is nothing to forecast.")

# Use one shared forecast window for every category. Anchoring the window to
# each category's own last year would make the totals cover different years.
last_observed_year = int(category_counts['year'].max())
future_years = np.arange(
    last_observed_year + 1, last_observed_year + 1 + FORECAST_YEARS
).reshape(-1, 1)

predictions = []

# sort=False keeps categories in order of first appearance.
for category, category_data in category_counts.groupby('categories', sort=False):
    X = category_data['year'].to_numpy().reshape(-1, 1)
    y = category_data['count'].to_numpy()

    model = LinearRegression()
    model.fit(X, y)

    # A falling trend can extrapolate below zero; an article count cannot.
    future_predictions = np.clip(model.predict(future_years), 0, None)

    predictions.append((category, future_predictions.sum()))

predictions_df = pd.DataFrame(predictions, columns=['category', 'predicted_count'])


In [None]:
# Split each ';'-separated category string and give every single category its
# own row. predictions_df itself is left untouched so that re-running this
# cell does not try to split values that are already lists.
predictions_exploded = (
    predictions_df
    .assign(category=predictions_df['category'].str.split(';'))
    .explode('category')
)

def reshape_arabic_text(text):
    """Return `text` passed through arabic_reshaper.reshape and then get_display.

    If either step raises AssertionError, a message is printed and the
    input is returned unchanged.
    """
    try:
        reshaped = arabic_reshaper.reshape(text)
        display_text = get_display(reshaped)
    except AssertionError:
        print(f"Error reshaping text: {text}")
        display_text = text
    return display_text

# Rank single categories by their total predicted article count and plot the
# top ones. The ranking sums `predicted_count`; counting rows instead would
# only measure how many category combinations mention a label.
top_n = 10

# Work on a copy with normalised labels so this cell can be re-run safely.
labelled_predictions = predictions_exploded.assign(
    category=predictions_exploded['category'].astype(str).str.strip()
)

# A separate name is used so the year/category counts built earlier survive.
top_categories = (
    labelled_predictions
    .groupby('category')['predicted_count']
    .sum()
    .nlargest(top_n)
    .reset_index()
)

# Only the labels that are actually drawn need reshaping for display.
top_categories['category'] = top_categories['category'].apply(reshape_arabic_text)

plt.rcParams['font.family'] = 'Arial'

plt.figure(figsize=(12, 8))
sns.set_style('whitegrid')

barplot = sns.barplot(x='predicted_count', y='category', data=top_categories, hue='category', palette='viridis', dodge=False, legend=False)

plt.title('Predicted Top 10 Article Counts by Category for the 5 Upcoming Years', fontsize=16, color='darkblue', weight='bold')
plt.xlabel('Predicted Article Count', fontsize=14, color='darkblue')
plt.ylabel('Categories', fontsize=14, color='darkblue')

# Write each bar's value just past its right end.
for index, value in enumerate(top_categories['predicted_count']):
    barplot.text(value + 0.5, index, f'{value:.0f}', color='black', ha='left', va='center', fontsize=12)

plt.tight_layout()
plt.show()

print("Plotting complete. The predicted article counts per category have been visualized.")