# EDA on the MIND Dataset

The dataset used in this project can be found on Kaggle: [MIND: Microsoft News Recommendation Dataset](https://www.kaggle.com/datasets/arashnic/mind-news-dataset)

In [None]:
import pandas as pd
import numpy as np

# Both files are read as header-less TSVs, so the column names are supplied here.
# quoting=3 is csv.QUOTE_NONE: quote characters are treated as ordinary text.
col_names = [
    "impression_id",
    "user_id",
    "time",
    "history",
    "impressions",
]
behaviors = pd.read_csv(
    "behaviors.tsv",
    sep="\t",
    header=None,
    names=col_names,
    quoting=3,
)

col_names = [
    "id",
    "category",
    "subcategory",
    "title",
    "abstract",
    "link",
    "title_entities",
    "abstract_entities",
]
news = pd.read_csv(
    "news.tsv",
    sep="\t",
    header=None,
    names=col_names,
    quoting=3,
)


def load_vec_file(path):
    """Read a whitespace-delimited embedding file into a ``{key: vector}`` dict.

    Every usable line consists of a key followed by its float components.
    Surrounding whitespace is stripped before splitting. Lines that yield two
    or fewer tokens (for example blank lines) are ignored. When a key appears
    more than once, the last occurrence overwrites the earlier ones.

    Args:
        path: Location of the UTF-8 encoded text file to read.

    Returns:
        dict mapping each key (str) to a 1-D NumPy array of floats.

    Raises:
        ValueError: If a component of a kept line cannot be parsed as a float.
    """
    vectors = {}
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            tokens = raw_line.strip().split()
            # Only lines with a key and at least two components are kept.
            if len(tokens) > 2:
                key, *components = tokens
                vectors[key] = np.array([float(value) for value in components])
    return vectors

# Load the entity and relation embedding vectors, keyed by their identifiers.
entity_embeddings = load_vec_file(path="entity_embedding.vec")
relation_embeddings = load_vec_file(path="relation_embedding.vec")

# News and behaviors

In [10]:
# Preview the first five rows of the behaviors table.
behaviors.head(n=5)

Unnamed: 0,impression_id,user_id,time,history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [11]:
# Preview the first five rows of the news table.
news.head(n=5)

Unnamed: 0,id,category,subcategory,title,abstract,link,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


First, let's check both tables for missing values and remove the rows that contain them.

In [17]:
# Missing values in the news table: per-column counts, then the grand total.
news_missing = news.isnull().sum()
print(news_missing)
print("Total missing:", news_missing.sum())

id                      0
category                0
subcategory             0
title                   0
abstract             2666
link                    0
title_entities          0
abstract_entities       0
dtype: int64
Total missing: 2666


In [30]:
# Drop the articles that have no abstract, then re-check the missing counts.
news = news.dropna(subset=['abstract'])
news_missing_after = news.isnull().sum()
print(news_missing_after)
print("Total missing:", news_missing_after.sum())

id                   0
category             0
subcategory          0
title                0
abstract             0
link                 0
title_entities       0
abstract_entities    0
dtype: int64
Total missing: 0


In [29]:
# Missing values in the behaviors table: per-column counts, then the grand total.
behaviors_missing = behaviors.isnull().sum()
print(behaviors_missing)
print("Total missing:", behaviors_missing.sum())

impression_id       0
user_id             0
time                0
history          3238
impressions         0
dtype: int64
Total missing: 3238


In [31]:
# Drop the rows that have no history, then re-check the missing counts.
behaviors = behaviors.dropna(subset=['history'])
behaviors_missing_after = behaviors.isnull().sum()
print(behaviors_missing_after)
print("Total missing:", behaviors_missing_after.sum())

impression_id    0
user_id          0
time             0
history          0
impressions      0
dtype: int64
Total missing: 0


# Entities and Relations

In [13]:
# Show the first (key, vector) pair stored in each embedding dictionary.
entity_sample = list(entity_embeddings.items())[:1]
print("Entity example:", entity_sample)
relation_sample = list(relation_embeddings.items())[:1]
print("Relation example:", relation_sample)

Entity example: [('Q41', array([-0.063388, -0.181451,  0.057501, -0.091254, -0.076217, -0.052525,
        0.0505  , -0.224871, -0.018145,  0.030722,  0.064276,  0.073063,
        0.039489,  0.159404, -0.128784,  0.016325,  0.026797,  0.13709 ,
        0.001849, -0.059103,  0.012091,  0.045418,  0.000591,  0.211337,
       -0.034093, -0.074582,  0.014004, -0.099355,  0.170144,  0.109376,
       -0.014797,  0.071172,  0.080375,  0.045563, -0.046462,  0.070108,
        0.015413, -0.020874, -0.170324, -0.00113 ,  0.05981 ,  0.054342,
        0.027358, -0.028995, -0.224508,  0.066281, -0.200006,  0.018186,
        0.082396,  0.167178, -0.136239,  0.055134, -0.080195, -0.00146 ,
        0.031078, -0.017084, -0.091176, -0.036916,  0.124642, -0.098185,
       -0.054836,  0.152483, -0.053712,  0.092816, -0.112044, -0.072247,
       -0.114896, -0.036541, -0.186339, -0.16061 ,  0.037342, -0.133474,
        0.11008 ,  0.070678, -0.005586, -0.046667, -0.07201 ,  0.086424,
        0.026165,  0.03056