# H&M Fashion Data — Verification Notebook

This notebook verifies that the three raw H&M CSV datasets (articles, customers, and transactions) are accessible and load correctly with PySpark.

In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session for this notebook: local master using
# "local[*]", driver memory configured as "2g". Afterwards restrict Spark's
# logging to WARN and report the Spark version that was picked up.
session_builder = SparkSession.builder.appName("HM-Data-Verification").master("local[*]")
session_builder = session_builder.config("spark.driver.memory", "2g")
spark = session_builder.getOrCreate()

spark_context = spark.sparkContext
spark_context.setLogLevel("WARN")
print("Spark session created:", spark.version)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/25 13:05:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark session created: 3.2.0


## 1. Articles

In [2]:
# Read the raw articles CSV (first row is the header) and let Spark infer the
# column types, then report row count, column count, column names and schema.
# NOTE(review): inferSchema makes Spark read the file an extra time to work out
# the types, and article_id / product_code are inferred as integers — confirm
# that numeric IDs are what downstream code expects.
articles = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/app/data/raw/articles.csv")
)
articles_row_count = articles.count()  # triggers a full scan of the file
articles_columns = articles.columns
print(f"Rows: {articles_row_count:,}  |  Columns: {len(articles_columns)}")
print(f"Columns: {articles_columns}")
articles.printSchema()

                                                                                

Rows: 105,542  |  Columns: 25
Columns: ['article_id', 'product_code', 'prod_name', 'product_type_no', 'product_type_name', 'product_group_name', 'graphical_appearance_no', 'graphical_appearance_name', 'colour_group_code', 'colour_group_name', 'perceived_colour_value_id', 'perceived_colour_value_name', 'perceived_colour_master_id', 'perceived_colour_master_name', 'department_no', 'department_name', 'index_code', 'index_name', 'index_group_no', 'index_group_name', 'section_no', 'section_name', 'garment_group_no', 'garment_group_name', 'detail_desc']
root
 |-- article_id: integer (nullable = true)
 |-- product_code: integer (nullable = true)
 |-- prod_name: string (nullable = true)
 |-- product_type_no: integer (nullable = true)
 |-- product_type_name: string (nullable = true)
 |-- product_group_name: string (nullable = true)
 |-- graphical_appearance_no: integer (nullable = true)
 |-- graphical_appearance_name: string (nullable = true)
 |-- colour_group_code: integer (nullable = true)
 |

In [3]:
# Preview the first five article rows with full (untruncated) cell contents.
articles.show(n=5, truncate=False)

+----------+------------+-----------------+---------------+-----------------+------------------+-----------------------+-------------------------+-----------------+-----------------+-------------------------+---------------------------+--------------------------+----------------------------+-------------+---------------+----------+----------------+--------------+----------------+----------+----------------------+----------------+------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|article_id|product_code|prod_name        |product_type_no|product_type_name|product_group_name|graphical_appearance_no|graphical_appearance_name|colour_group_code|colour_group_name|perceived_colour_value_id|perceived_colour_value_name|perceived_colour_master_id|perceived_colour_master_name|department_no|

## 2. Customers

In [4]:
# Read the raw customers CSV (first row is the header) with inferred column
# types, then report row count, column count, column names and schema.
# NOTE(review): inferSchema costs an extra read of the file; FN and Active are
# inferred as double — confirm that is the intended type for these flags.
customers = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/app/data/raw/customers.csv")
)
customers_row_count = customers.count()  # triggers a full scan of the file
customers_columns = customers.columns
print(f"Rows: {customers_row_count:,}  |  Columns: {len(customers_columns)}")
print(f"Columns: {customers_columns}")
customers.printSchema()

[Stage 8:====>                                                    (1 + 11) / 12]

Rows: 1,371,980  |  Columns: 7
Columns: ['customer_id', 'FN', 'Active', 'club_member_status', 'fashion_news_frequency', 'age', 'postal_code']
root
 |-- customer_id: string (nullable = true)
 |-- FN: double (nullable = true)
 |-- Active: double (nullable = true)
 |-- club_member_status: string (nullable = true)
 |-- fashion_news_frequency: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- postal_code: string (nullable = true)



                                                                                

In [5]:
# Preview the first five customer rows with full (untruncated) cell contents.
customers.show(n=5, truncate=False)

+----------------------------------------------------------------+----+------+------------------+----------------------+---+----------------------------------------------------------------+
|customer_id                                                     |FN  |Active|club_member_status|fashion_news_frequency|age|postal_code                                                     |
+----------------------------------------------------------------+----+------+------------------+----------------------+---+----------------------------------------------------------------+
|00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657|null|null  |ACTIVE            |NONE                  |49 |52043ee2162cf5aa7ee79974281641c6f11a68d276429a91f8ca0d4b6efa8100|
|0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa|null|null  |ACTIVE            |NONE                  |25 |2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93f4c830291c32bc3057|
|000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8f

## 3. Transactions

In [6]:
# Read the raw transactions CSV (first row is the header) with inferred column
# types, then report row count, column count, column names and schema.
# NOTE(review): inferSchema costs an extra read of this large file, and t_dat
# is inferred as a string rather than a date — confirm whether an explicit
# schema would be preferable here.
transactions = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/app/data/raw/transactions_train.csv")
)
transactions_row_count = transactions.count()  # triggers a full scan of the file
transactions_columns = transactions.columns
print(f"Rows: {transactions_row_count:,}  |  Columns: {len(transactions_columns)}")
print(f"Columns: {transactions_columns}")
transactions.printSchema()



Rows: 31,788,324  |  Columns: 5
Columns: ['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id']
root
 |-- t_dat: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- article_id: integer (nullable = true)
 |-- price: double (nullable = true)
 |-- sales_channel_id: integer (nullable = true)



                                                                                

In [7]:
# Preview the first five transaction rows with full (untruncated) cell contents.
transactions.show(n=5, truncate=False)

+----------+----------------------------------------------------------------+----------+--------------------+----------------+
|t_dat     |customer_id                                                     |article_id|price               |sales_channel_id|
+----------+----------------------------------------------------------------+----------+--------------------+----------------+
|2018-09-20|000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318|663713001 |0.050830508474576264|2               |
|2018-09-20|000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318|541518023 |0.03049152542372881 |2               |
|2018-09-20|00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2|505221004 |0.01523728813559322 |2               |
|2018-09-20|00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2|685687003 |0.016932203389830508|2               |
|2018-09-20|00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2|685687004 |0.016932203389830508|2 

## 4. Summary

In [8]:
# Final verification report: one line per dataset with its row and column
# counts, followed by an overall verdict.
#
# The success banner is printed only when every dataset actually has rows and
# columns. A dataset with no rows or no columns is marked with ✗ and the cell
# raises, so the notebook cannot finish on a false "success".
summary_rule = "=" * 50
summary_datasets = [
    ("articles", articles),
    ("customers", customers),
    ("transactions", transactions),
]

print(summary_rule)
print("  VERIFICATION SUMMARY")
print(summary_rule)

failed_datasets = []
for name, df in summary_datasets:
    n_rows = df.count()  # Spark action: scans the dataset again
    n_cols = len(df.columns)
    is_ok = n_rows > 0 and n_cols > 0
    if not is_ok:
        failed_datasets.append(name)
    mark = "✓" if is_ok else "✗"
    print(f"  {mark}  {name:15s}  rows={n_rows:>12,}  cols={n_cols}")
print(summary_rule)

if failed_datasets:
    raise RuntimeError(
        "Verification failed: empty dataset(s): " + ", ".join(failed_datasets)
    )
print("  All datasets loaded successfully!")

  VERIFICATION SUMMARY
  ✓  articles         rows=     105,542  cols=25


                                                                                

  ✓  customers        rows=   1,371,980  cols=7




  ✓  transactions     rows=  31,788,324  cols=5
  All datasets loaded successfully!


                                                                                

In [9]:
# Shut down the Spark session now that verification is complete, then confirm.
stop_message = "Spark session stopped."
spark.stop()
print(stop_message)

Spark session stopped.
