In [29]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os
from functools import reduce
import re
import json
import numpy as np
import faiss
from textblob import TextBlob

from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, col, count, avg, sum as spark_sum, when
from pyspark.sql.functions import lit, to_date, col, concat_ws

from sentence_transformers import SentenceTransformer
import nltk
nltk.download('vader_lexicon') # Download vader_lexicon
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/sherlockpi/nltk_data...


# Data Transformation

## 1. Load into Spark

Start a Spark session for data transformation

In [2]:
# Create the Spark session (or reuse one that is already running) for all
# transformations in this notebook.
spark = (
    SparkSession.builder
    .appName("UbisoftDataTransform")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/06 15:24:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Load the various parquet files into Spark:

In [4]:
# Read each raw parquet dataset into its own Spark DataFrame.
# The targets on the left line up one-to-one with the paths below.
(
    df_stock,
    df_news,
    df_steam_reviews,
    df_reddit_posts,
    df_reddit_comments,
) = [
    spark.read.parquet(path)
    for path in (
        "data/ubisoft_stock.parquet",
        "data/ubisoft_news.parquet",
        "data/steam_reviews.parquet",
        "data/acshadows_reddit_posts.parquet",
        "data/acshadows_reddit_comments.parquet",
    )
]

## 2. Fact Table: daily_summary

Standardise the datetime column of every source into a common `date` column so the sources can be joined and aggregated by day later.

In [5]:
# Every source gets a `date` column produced by to_date(), so all of them
# share the same join / group-by key.

# Stock prices: derived from the "Date" column.
df_stock = df_stock.withColumn("date", to_date("Date"))

# News: the existing `date` column is parsed with an explicit yyyy-MM-dd pattern.
df_news = df_news.withColumn("date", to_date("date", "yyyy-MM-dd"))

# Steam reviews: derived from review_date.
df_steam_reviews = df_steam_reviews.withColumn("date", to_date("review_date"))

# Reddit posts: derived from created_date.
df_reddit_posts = df_reddit_posts.withColumn("date", to_date("created_date"))

# Reddit comments: derived from comment_created_date.
df_reddit_comments = df_reddit_comments.withColumn("date", to_date("comment_created_date"))

Aggregate each data source by date.

In [6]:
# Stock: no aggregation is applied here, only the needed columns are projected.
df_stock_daily = df_stock.select(["date", "Open", "Close", "Volume"])

# Steam reviews per day: number of reviews, mean playtime and share of
# positive votes. percent_positive is positives / total, i.e. a fraction
# (not scaled to 0-100).
df_reviews_daily = (
    df_steam_reviews
    .groupBy("date")
    .agg(
        count("*").alias("num_reviews"),
        avg("playtime_hours").alias("avg_playtime_hours"),
        (
            spark_sum(when(col("voted_up") == True, 1).otherwise(0)) / count("*")
        ).alias("percent_positive"),
    )
)

# Reddit posts per day: number of posts and mean score.
df_reddit_posts_daily = (
    df_reddit_posts
    .groupBy("date")
    .agg(
        count("*").alias("num_reddit_posts"),
        avg("score").alias("avg_reddit_score"),
    )
)

# Reddit comments per day: number of comments.
df_reddit_comments_daily = (
    df_reddit_comments
    .groupBy("date")
    .agg(count("*").alias("num_reddit_comments"))
)

# News per day: number of articles.
df_news_daily = (
    df_news
    .groupBy("date")
    .agg(count("*").alias("num_news_articles"))
)

Join DataFrames: join all these daily aggregates on the common key (date) to produce a unified view (the fact table).

In [7]:
# Daily frames to combine; the first one seeds the result and every other
# frame is full-outer-joined onto it so no date from any source is lost.
dfs = [df_stock_daily, df_reviews_daily, df_reddit_posts_daily, df_reddit_comments_daily, df_news_daily]

df_unified = dfs[0]
for daily_df in dfs[1:]:
    df_unified = df_unified.join(daily_df, on="date", how="full")

# Project to the designed schema: the stock columns get a `stock_` prefixed
# name, every other column keeps the name it was aggregated under.
df_unified = df_unified.select(
    "date",
    *(
        col(source_name).alias(target_name)
        for source_name, target_name in (
            ("Open", "stock_open"),
            ("Close", "stock_close"),
            ("Volume", "stock_volume"),
        )
    ),
    "num_reviews",
    "avg_playtime_hours",
    "percent_positive",
    "num_reddit_posts",
    "avg_reddit_score",
    "num_reddit_comments",
    "num_news_articles",
)

In [12]:
# Preview the 10 most recent dates (rows are sorted by date, newest first)
# without truncating cell values, then print the resulting schema.
df_unified.orderBy(col("date").desc()).show(10, truncate=False)
df_unified.printSchema()

# Persist the fact table as parquet for further analysis.
# mode("overwrite") replaces any existing output at this path.
df_unified.write.mode("overwrite").parquet("data/unified_dataset.parquet")

                                                                                

+----------+------------------+------------------+------------+-----------+------------------+------------------+----------------+------------------+-------------------+-----------------+
|date      |stock_open        |stock_close       |stock_volume|num_reviews|avg_playtime_hours|percent_positive  |num_reddit_posts|avg_reddit_score  |num_reddit_comments|num_news_articles|
+----------+------------------+------------------+------------+-----------+------------------+------------------+----------------+------------------+-------------------+-----------------+
|2025-04-05|NULL              |NULL              |NULL        |34         |38.470588235294116|0.7647058823529411|NULL            |NULL              |103                |NULL             |
|2025-04-04|10.194999694824219|9.54800033569336  |1626113     |120        |38.11424999999999 |0.825             |NULL            |NULL              |601                |NULL             |
|2025-04-03|10.399999618530273|10.345000267028809|765150    

## 3. Text Table: textual_context (for NLP / RAG)

Create a unified text table.

In [9]:
# Each source is filtered to rows that actually have text and then projected
# onto the shared schema: date, source, content, id, url.

# Steam reviews: the review body is the content; no URL is available.
df_steam_text = (
    df_steam_reviews
    .where(col("review").isNotNull())
    .select(
        to_date(col("review_date")).alias("date"),
        lit("steam_review").alias("source"),
        col("review").alias("content"),
        col("review_id").alias("id"),
        lit(None).cast("string").alias("url"),
    )
)

# Reddit posts: title and body are combined into one content string,
# separated by a newline. A post is kept when either of the two is present.
df_reddit_posts_text = (
    df_reddit_posts
    .where(col("title").isNotNull() | col("selftext").isNotNull())
    .select(
        to_date(col("created_date")).alias("date"),
        lit("reddit_post").alias("source"),
        concat_ws("\n", col("title"), col("selftext")).alias("content"),
        col("id").alias("id"),
        col("url"),
    )
)

# Reddit comments: the comment body is the content; no URL is available.
df_reddit_comments_text = (
    df_reddit_comments
    .where(col("comment_body").isNotNull())
    .select(
        to_date(col("comment_created_date")).alias("date"),
        lit("reddit_comment").alias("source"),
        col("comment_body").alias("content"),
        col("comment_id").alias("id"),
        lit(None).cast("string").alias("url"),
    )
)

# Ubisoft news: the headline serves as both content and id, and every row
# gets the same site-level URL.
# NOTE(review): a headline is presumably not guaranteed unique — consider a hash or a real key.
df_news_text = (
    df_news
    .where(col("headline").isNotNull())
    .select(
        to_date(col("date")).alias("date"),
        lit("ubisoft_news").alias("source"),
        col("headline").alias("content"),
        col("headline").alias("id"),
        lit("https://news.ubisoft.com/en-gb/").alias("url"),
    )
)

In [None]:
# Stack the four per-source text tables into one table. unionByName matches
# columns by name, so each input only needs the same set of column names
# (date, source, content, id, url), not the same column order.
df_textual_context = (
    df_steam_text
    .unionByName(df_reddit_posts_text)
    .unionByName(df_reddit_comments_text)
    .unionByName(df_news_text)
)

# Preview: first 5 rows with long strings cut at 80 characters, then the schema.
df_textual_context.show(5, truncate=80)
df_textual_context.printSchema()

# Persist the text table as parquet; mode("overwrite") replaces existing output.
df_textual_context.write.mode("overwrite").parquet("data/textual_context.parquet")

+----------+------------+--------------------------------------------------------------------------------+---------+----+
|      date|      source|                                                                         content|       id| url|
+----------+------------+--------------------------------------------------------------------------------+---------+----+
|2025-04-05|steam_review|                                                                        Its fun.|192017461|NULL|
|2025-04-05|steam_review|                                                    "same shit different toilet"|192017163|NULL|
|2025-04-05|steam_review|I was a huge ac fan all the way back from AC 2. But some how starting from th...|192017026|NULL|
|2025-04-05|steam_review|very good, Assassins creed in Japan has been a long awaited installment in th...|192016829|NULL|
|2025-04-05|steam_review|                                                                 a piece of shit|192016602|NULL|
+----------+------------

Preprocess the text for the RAG workflow: clean it, split it into chunks, and generate embeddings.

In [None]:
# Convert the Spark text table into a pandas DataFrame for the Python-side
# cleaning, chunking and NLP steps below.
# NOTE(review): toPandas() collects all rows onto the driver — confirm the
# table fits in local memory before running on larger data.
df = df_textual_context.toPandas()

# Define helper functions for text cleaning and chunking
def clean_text(text):
    """Return `text` lowercased, with each whitespace run collapsed to a single space and both ends trimmed."""
    lowered = text.lower()
    return re.sub(r'\s+', ' ', lowered).strip()

def chunk_text(text, chunk_size=100, overlap=20):
    """
    Split text into overlapping chunks of at most `chunk_size` words.

    Words are obtained by simple whitespace tokenization. Each chunk starts
    `chunk_size - overlap` words after the previous one, so consecutive
    chunks share `overlap` words for context continuity.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk; must be at least 1.
        overlap: Number of words shared by consecutive chunks; must be
            smaller than `chunk_size`. A negative value makes the window
            advance by more than `chunk_size`, skipping words between chunks.

    Returns:
        A list of chunk strings. Text with at most `chunk_size` words is
        returned unchanged as a single-element list (its original whitespace
        is kept); longer text yields chunks whose words are joined by single
        spaces.

    Raises:
        ValueError: If `chunk_size` is below 1, or if `overlap` is not
            smaller than `chunk_size` (the window could never move forward,
            so chunking would not terminate).
    """
    # Validate up front so bad parameters fail immediately instead of only
    # on inputs that happen to be longer than one chunk.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    words = text.split()
    if len(words) <= chunk_size:
        return [text]

    # Distance between the start of one chunk and the start of the next;
    # guaranteed positive by the validation above.
    step = chunk_size - overlap

    chunks = []
    for start in range(0, len(words), step):
        end = min(start + chunk_size, len(words))
        chunks.append(" ".join(words[start:end]))
        if end == len(words):
            # The last word is covered; a further window would only repeat
            # the tail of this chunk.
            break
    return chunks

                                                                                

In [None]:
# Normalise the raw content; anything that is not a string becomes "".
df['clean_content'] = df['content'].apply(
    lambda raw: clean_text(raw) if isinstance(raw, str) else ""
)

# Split every cleaned text into a list of word chunks (100 words, 20 shared).
df['chunks'] = df['clean_content'].apply(
    lambda cleaned: chunk_text(cleaned, chunk_size=100, overlap=20)
)

# One row per chunk: explode repeats the other columns for every list item,
# the index is renumbered, and the exploded column is renamed to text_chunk.
df_chunks = (
    df.explode('chunks')
      .reset_index(drop=True)
      .rename(columns={'chunks': 'text_chunk'})
)

In [None]:
# Generate an embedding for every text chunk with a pre-trained
# SentenceTransformer model (choose a model that suits your needs).
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all chunks in a single batched call instead of invoking the model
# once per row through Series.apply; the model then processes many chunks
# per forward pass.
chunk_embeddings = model.encode(
    df_chunks['text_chunk'].tolist(),
    batch_size=64,
    show_progress_bar=True,
)

# Store each vector as a plain Python list, one per row, aligned on the
# existing index of df_chunks.
df_chunks['embedding'] = pd.Series(chunk_embeddings.tolist(), index=df_chunks.index)

Additional NLP feature engineering.

In [27]:
# Function to compute TextBlob-based sentiment and objectivity scores
def compute_textblob_sentiment(text):
    """
    Score `text` with TextBlob.

    Returns a 3-tuple:
      - polarity: sentiment score between -1 (negative) and 1 (positive)
      - subjectivity: score between 0 (objective) and 1 (subjective)
      - objectivity: computed as 1 - subjectivity
    """
    sentiment = TextBlob(text).sentiment
    subjectivity = sentiment.subjectivity
    return sentiment.polarity, subjectivity, 1 - subjectivity

# Score every chunk with TextBlob and store the three values as new columns.
# The returned tuple is wrapped in a pd.Series so that `apply` yields one
# column per value; those columns are assigned to the three names on the
# left in the order polarity, subjectivity, objectivity.
df_chunks[['polarity', 'subjectivity', 'objectivity']] = df_chunks['text_chunk'].apply(
    lambda x: pd.Series(compute_textblob_sentiment(x))
)

In [30]:
# Initialize one VADER sentiment analyzer; compute_vader_scores below reads
# it as a module-level name rather than creating its own instance.
sia = SentimentIntensityAnalyzer()

# Function to compute VADER sentiment scores
def compute_vader_scores(text):
    """
    Score `text` with the shared VADER analyzer `sia`.

    Returns a pandas Series built from the analyzer's score mapping:
      - neg: Negative sentiment score
      - neu: Neutral sentiment score
      - pos: Positive sentiment score
      - compound: Normalized compound score (overall sentiment)
    """
    scores = sia.polarity_scores(text)
    return pd.Series(scores)

# Score every chunk with VADER and store the four scores as new columns.
# NOTE(review): the target names are matched to the returned scores by
# position — this assumes polarity_scores yields neg, neu, pos, compound in
# that order; confirm against the installed nltk version.
df_chunks[['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']] = df_chunks['text_chunk'].apply(
    compute_vader_scores
)

In [32]:
# Summarise the chunk table (column names, non-null counts, dtypes), then
# print the first rows of the chunk text next to the TextBlob and VADER
# sentiment features.
df_chunks.info()
print(df_chunks[['text_chunk', 'polarity', 'subjectivity', 'objectivity', 
                 'vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']].head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20607 entries, 0 to 20606
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            20607 non-null  object 
 1   source          20607 non-null  object 
 2   content         20607 non-null  object 
 3   id              20607 non-null  object 
 4   url             942 non-null    object 
 5   clean_content   20607 non-null  object 
 6   text_chunk      20607 non-null  object 
 7   embedding       20607 non-null  object 
 8   polarity        20607 non-null  float64
 9   subjectivity    20607 non-null  float64
 10  objectivity     20607 non-null  float64
 11  vader_neg       20607 non-null  float64
 12  vader_neu       20607 non-null  float64
 13  vader_pos       20607 non-null  float64
 14  vader_compound  20607 non-null  float64
dtypes: float64(7), object(8)
memory usage: 2.4+ MB
                                          text_chunk  polarity  subjectivi

FAISS Index

In [33]:
# Stack the per-chunk embedding lists into one float32 matrix, one row per chunk.
embeddings = np.asarray(df_chunks['embedding'].tolist(), dtype="float32")
embedding_dim = embeddings.shape[1]

# Build an L2-distance FAISS index over all chunk vectors and save it to disk
# for later similarity search.
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)
faiss.write_index(index, "faiss_index.index")

# Save the metadata of every chunk next to the index. Records are taken from
# df_chunks in row order, the same order the vectors were added in.
# default=str stringifies values the json module cannot encode natively.
metadata = df_chunks[['date', 'source', 'id', 'url', 'text_chunk']].to_dict(orient='records')
with open("faiss_metadata.json", "w") as f:
    f.write(json.dumps(metadata, default=str))

In [36]:
!pip freeze

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


absl-py==2.1.0
accelerate==1.5.2
aiohappyeyeballs==2.6.1
aiohttp==3.11.14
aiosignal==1.3.2
annotated-types==0.7.0
anyio==4.9.0
appnope==0.1.3
asttokens==2.4.0
astunparse==1.6.3
attrs==25.3.0
backcall==0.2.0
beautifulsoup4==4.13.3
certifi==2023.7.22
charset-normalizer==3.4.1
click==8.1.8
cloudpickle==3.1.1
comm==0.1.4
contourpy==1.3.1
cycler==0.12.1
datasets==3.4.1
debugpy==1.8.0
decorator==5.1.1
dill==0.3.8
distro==1.9.0
dotenv==0.9.9
emoji==2.14.1
et_xmlfile==2.0.0
executing==2.0.0
faiss-cpu==1.10.0
filelock==3.18.0
flatbuffers==25.2.10
fonttools==4.56.0
frozendict==2.4.6
frozenlist==1.5.0
fsspec==2024.12.0
gast==0.6.0
google-pasta==0.2.0
grpcio==1.71.0
h11==0.14.0
h5py==3.13.0
httpcore==1.0.7
httpx==0.28.1
huggingface-hub==0.29.3
idna==3.10
ipykernel==6.25.2
ipython==8.16.1
jedi==0.19.1
Jinja2==3.1.6
jiter==0.9.0
joblib==1.4.2
jupyter_client==8.4.0
jupyter_core==5.4.0
keras==3.9.0
kiwisolver==1.4.8
langdetect==1.0.9
libclang==18.1.1
llvmlite==0.44.0
Markdown==3.7
markdown-it-py==3.0.