In [1]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [2]:
news_col = ['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entity', 'abstract_entity']
behaviors_col = ['impression_id', 'user', 'time', 'clicked_news', 'impressions']

news_df = pd.read_csv('./train/train_news.tsv', sep='\t', names=news_col, index_col='news_id')
behav_df = pd.read_csv('./train/train_behaviors.tsv', sep='\t', names=behaviors_col, index_col='impression_id')
entity_vec = pd.read_csv('./train/train_entity_embedding.vec', sep='\t', 
                         names=['WikidataId'] + list(range(100)), 
                         usecols=list(range(101)),
                         index_col='WikidataId')

In [3]:
# check columns with missing values
print('==== Columns w/ Missing Values ====')
print('[News Data]')
for col in news_df.columns:
    if news_df[col].isna().any():
        print(col)
        
print('\n[Behaviors Data]')
for col in behav_df.columns:
    if behav_df[col].isna().any():
        print(col)

==== Columns w/ Missing Values ====
[News Data]
abstract
title_entity
abstract_entity

[Behaviors Data]


In [4]:
# 資料沒有異常，單純缺值
news_df[news_df['abstract'].isna()]

Unnamed: 0_level_0,category,subcategory,title,abstract,url,title_entity,abstract_entity
news_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
N603780,health,medical,Dispose of unwanted prescription drugs during ...,,https://assets.msn.com/labs/mind/AAISxPN.html,"[{""Label"": ""Drug Enforcement Administration"", ...",[]
N40392,health,cardio,Check Houston traffic map for current road con...,,https://assets.msn.com/labs/mind/AAEKnO1.html,"[{""Label"": ""Houston"", ""Type"": ""G"", ""WikidataId...",[]
N67055,finance,finance-insurance,The States Where Dogs Put the Biggest Bite on ...,,https://assets.msn.com/labs/mind/AABrOCk.html,[],[]
N493460,sports,football_ncaa,Rutgers throws for 1 yard in 35-0 loss at Indiana,,https://assets.msn.com/labs/mind/AAIGv2m.html,"[{""Label"": ""Rutgers Scarlet Knights football"",...",[]
N96749,lifestyle,lifestylebuzz,"Mom with schizophrenia, 6-year-old daughter mi...",,https://assets.msn.com/labs/mind/AAJfTsZ.html,"[{""Label"": ""Queens"", ""Type"": ""G"", ""WikidataId""...",[]
...,...,...,...,...,...,...,...
N819583,sports,football_nfl,Week 10 Game Balls: Few Bright Spots in Colts ...,,https://assets.msn.com/labs/mind/BBWz39w.html,"[{""Label"": ""Bright spots on Ceres"", ""Type"": ""U...",[]
N139372,news,newsus,Winter homeless shelter remains unopened after...,,https://assets.msn.com/labs/mind/BBWzAPK.html,[],[]
N430180,lifestyle,lifestylefamily,Family says 13-year-old Broadway star died fro...,,https://assets.msn.com/labs/mind/BBWzQYV.html,"[{""Label"": ""Broadway theatre"", ""Type"": ""F"", ""W...",[]
N522314,autos,autossports,Best Sports Car Deals for October,,https://assets.msn.com/labs/mind/BBy5rVe.html,"[{""Label"": ""Peugeot RCZ"", ""Type"": ""V"", ""Wikida...",[]


In [6]:
# 顯然是原始資料沒有切好
# [Case 1] N773033,N793536,N319440: url跟title_entity都被併到abstract
# [Case 2] N519444,N828921,N714065: abstract被併到title
news_df[news_df['abstract_entity'].isna()]

Unnamed: 0_level_0,category,subcategory,title,abstract,url,title_entity,abstract_entity
news_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
N773033,lifestyle,lifestylefamilyandrelationships,The 50 Most Common Last Names in America,What's in a name?\thttps://assets.msn.com/labs...,[],,
N519444,finance,finance-video,The Price You Pay: The spiraling cost of colle...,https://assets.msn.com/labs/mind/AAIKUGl.html,[],[],
N793536,sports,football_nfl,Baker Mayfield injury update: 'No doubt' Brown...,He'll be in better shape physically and he'll ...,"[{""Label"": ""Freddie Kitchens"", ""Type"": ""P"", ""W...",,
N828921,news,newsus,The security has been stepped up': Forest Park...,https://assets.msn.com/labs/mind/AAJncPt.html,"[{""Label"": ""Forest Park\u2013DeBaliviere stati...","[{""Label"": ""St. Louis"", ""Type"": ""G"", ""Wikidata...",
N714065,news,newsus,[We] walk dogs and pick up poo for free | Why ...,https://assets.msn.com/labs/mind/AAJUQ4w.html,"[{""Label"": ""Animal control service"", ""Type"": ""...","[{""Label"": ""Animal control service"", ""Type"": ""...",
N319440,video,news,Transcripts highlight Sean Hannity's political...,We're seeing really disturbing things coming o...,"[{""Label"": ""Sean Hannity"", ""Type"": ""P"", ""Wikid...",,


In [7]:
# 處理 case 1
anomaly_ids = ['N773033','N793536','N319440']
anomaly_cols = ['abstract', 'url', 'title_entity', 'abstract_entity']
anomaly = news_df.loc[anomaly_ids]['abstract'].str.split('\t')
for news_id in anomaly.index:
    anomaly[news_id].append(news_df.loc[news_id, 'url'])
    
# pd.DataFrame(anomaly.tolist(), columns=anomaly_cols, index=anomaly_ids)
news_df.loc[anomaly_ids, anomaly_cols] = pd.DataFrame(anomaly.tolist(), columns=anomaly_cols, index=anomaly_ids)

In [8]:
# 處理 case 2
anomaly_ids = ['N519444','N828921','N714065']
anomaly_cols = ['title', 'abstract', 'url', 'title_entity', 'abstract_entity']

# abstract後的資料都往後移一格
news_df.loc[anomaly_ids, anomaly_cols[2:]] = news_df.loc[anomaly_ids, anomaly_cols[1:4]].values

anomaly = news_df.loc[anomaly_ids]['title'].str.split('\t')
news_df.loc[anomaly_ids, ['title', 'abstract']] = pd.DataFrame(anomaly.tolist(), columns=['title', 'abstract'], index=anomaly_ids)

In [9]:
news_df.loc[['N773033','N793536','N319440', 'N519444','N828921','N714065']]

Unnamed: 0_level_0,category,subcategory,title,abstract,url,title_entity,abstract_entity
news_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
N773033,lifestyle,lifestylefamilyandrelationships,The 50 Most Common Last Names in America,What's in a name?,https://assets.msn.com/labs/mind/AAGO6bg.html,"[{Label"": ""Surname"", ""Type"": ""C"", ""WikidataId""...",[]
N793536,sports,football_nfl,Baker Mayfield injury update: 'No doubt' Brown...,He'll be in better shape physically and he'll ...,https://assets.msn.com/labs/mind/AAIPbTZ.html,"[{Label"": ""Baker Mayfield"", ""Type"": ""P"", ""Wiki...","[{""Label"": ""Freddie Kitchens"", ""Type"": ""P"", ""W..."
N319440,video,news,Transcripts highlight Sean Hannity's political...,We're seeing really disturbing things coming o...,https://assets.msn.com/labs/mind/BBWymjv.html,"[{Label"": ""Sean Hannity"", ""Type"": ""P"", ""Wikida...","[{""Label"": ""Sean Hannity"", ""Type"": ""P"", ""Wikid..."
N519444,finance,finance-video,The Price You Pay: The spiraling cost of college,"In our new series, The Price You Pay,"" we look...",https://assets.msn.com/labs/mind/AAIKUGl.html,[],[]
N828921,news,newsus,The security has been stepped up': Forest Park...,ST. LOUIS (KMOV.com) -- Metro security in one ...,https://assets.msn.com/labs/mind/AAJncPt.html,"[{""Label"": ""Forest Park\u2013DeBaliviere stati...","[{""Label"": ""St. Louis"", ""Type"": ""G"", ""Wikidata..."
N714065,news,newsus,[We] walk dogs and pick up poo for free | Why ...,A damning audit from July into the St. Louis C...,https://assets.msn.com/labs/mind/AAJUQ4w.html,"[{""Label"": ""Animal control service"", ""Type"": ""...","[{""Label"": ""Animal control service"", ""Type"": ""..."


In [11]:
news_df.to_csv('train/fixed_train_news.tsv', sep='\t')