In [1]:
import gzip
import json
import os
import pickle
from document_preprocessor import RegexTokenizer
from indexing import Indexer, IndexType, BasicInvertedIndex
from ranker import *

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
DATA_PATH = 'data/'  # TODO: Set this to the path to your data folder
CACHE_PATH = '__pycache__/'  # Set this to the path of the cache folder

BEAUTY_PATH = DATA_PATH + 'meta_All_Beauty.jsonl.gz'
FASHION_PATH = DATA_PATH + 'meta_Amazon_Fashion.jsonl.gz'
COMBINE_PATH = DATA_PATH + 'Beauty_and_Fashion.jsonl.gz'
STOPWORD_PATH = DATA_PATH + 'stopwords.txt'
MAIN_INDEX = 'main_index'
TITLE_INDEX = 'title_index'
N_DOC_NEEDED = 50
DOCID_TO_TITLE_PATH = CACHE_PATH + 'docid_to_title.pkl'
DOCID_TO_LINK_PATH = CACHE_PATH + 'docid_to_link.pkl'
DOCID_TO_IMAGE_PATH = CACHE_PATH + 'docid_to_image.pkl'
DOCID_TO_ASIN_PATH = CACHE_PATH + 'docid_to_asin.pkl'
EDGELIST_PATH = DATA_PATH + 'edgelist.csv.gz'
NETWORK_STATS_PATH = DATA_PATH + 'network_stats.csv'
DOCID_TO_DESC_PATH = DATA_PATH + 'docid_to_desc.pkl'
DOCID_TO_PRICE_PATH = CACHE_PATH + 'docid_to_price.pkl'
DOCID_TO_RATING_PATH = CACHE_PATH + 'docid_to_rating.pkl'

In [4]:
# Load stopwords
stopwords = set()
with open(STOPWORD_PATH, 'r') as f:
    for line in f:
        stopwords.add(line.strip())
        
print('Loaded', len(stopwords), 'stopwords.')

Loaded 543 stopwords.


In [None]:
# Load two categories' items into one dataset
item_cnt = 0
keys_to_keep = ["main_category", "title", "average_rating", "rating_number", "price", "images", "details", "bought_together"]

def process_dataset(input_path, output_file, item_cnt):
    with gzip.open(input_path, 'rt') as infile:
        for line in infile:
            data = json.loads(line)
            if data['description'] == [] and data['features'] == []:
                continue
            item_cnt += 1
            filtered_data = {key:data[key] for key in keys_to_keep if key in data}
            filtered_data['docid'] = item_cnt
            filtered_data['description'] = " ".join(data['features'] + data['description'])
            filtered_data['link'] = "https://www.amazon.com/dp/" + data['parent_asin']
            
            ecofriendly_keywords = [sustainable, organic, biodegradable, recyclable, compostable, recycled, non-toxic, 
                                    renewable, plant-based, vegan, low-impact, zero-waste, green, cruelty-free, FSC-certified, 
                                    carbon-neutral, Energy Star, Fair Trade, eco-conscious, climate-positive, upcycled, 
                                    responsibly sourced, energy-efficient, plastic-free, pesticide-free, natural, ethical, eco-label, 
                                    water-saving, low-carbon, toxin-free, green-certified, eco-safe]
            nonfriendly_keywords = []
            
            output_file.write(json.dumps(filtered_data) + '\n')
            
    return item_cnt
if not os.path.exists(COMBINE_PATH):
    with gzip.open(COMBINE_PATH, 'wt') as outfile:
        item_cnt = process_dataset(BEAUTY_PATH, outfile, item_cnt)
        N_BEAUTY = item_cnt
        item_cnt = process_dataset(FASHION_PATH, outfile, item_cnt)
        N_FASHION = item_cnt - N_BEAUTY
        
    print(f'Added {item_cnt} items in total to {COMBINE_PATH} from both Beauty and Fashion.')

Added 493293 items in total to data/Beauty_and_Fashion.jsonl.gz from both Beauty and Fashion.


In [5]:
print('Loading indexes...')
preprocessor = RegexTokenizer('\w+')
import time
start_time = time.time()
if not os.path.exists(MAIN_INDEX):
    main_index = Indexer.create_index(
        IndexType.BasicInvertedIndex, COMBINE_PATH, preprocessor,
        stopwords, 3, text_key='description', max_docs=493293
    )
    main_index.save(MAIN_INDEX)
else:
    main_index = BasicInvertedIndex()
    main_index.load(MAIN_INDEX)
print(time.time() - start_time)    


Loading indexes...


TypeError: unhashable type: 'list'

In [None]:
start_time = time.time()
if not os.path.exists(TITLE_INDEX):
    title_index = Indexer.create_index(
        IndexType.BasicInvertedIndex, COMBINE_PATH, preprocessor, 
        stopwords, 2, max_docs=493293,
        text_key='title'
    )
    title_index.save(TITLE_INDEX)
else:
    title_index = BasicInvertedIndex()
    title_index.load(TITLE_INDEX)
print(time.time() - start_time) 

19043.13917684555


In [6]:
import gzip
import pickle
import tqdm
if not os.path.exists(DOCID_TO_TITLE_PATH):
    docid_to_title = {}
    with gzip.open(COMBINE_PATH, mode = 'rt', newline = '') as f:
        for line in f:
            data = json.loads(line)
            docid_to_title[data['docid']] = data['title']
    pickle.dump(docid_to_title,
                open(DOCID_TO_TITLE_PATH, 'wb')
    )
else:
    docid_to_title = pickle.load(open(DOCID_TO_TITLE_PATH, 'rb'))

In [12]:
if not os.path.exists(DOCID_TO_LINK_PATH):
    docid_to_link = {}
    with gzip.open(COMBINE_PATH, mode = 'rt', newline = '') as f:
        for line in f:
            data = json.loads(line)
            docid_to_link[data['docid']] = data['link']
    pickle.dump(docid_to_link,
                open(DOCID_TO_LINK_PATH, 'wb')
    )
else:
    docid_to_link = pickle.load(open(DOCID_TO_LINK_PATH, 'rb'))

In [13]:
docid_to_link[1]

'https://www.amazon.com/dp/B07NGFDN6G'

In [34]:
with gzip.open(COMBINE_PATH, mode = 'rt', newline = '') as f:
    for line in f:
        data = json.loads(line)
        if data['docid'] == 466888:
            print(data['images'])

[{'thumb': 'https://m.media-amazon.com/images/I/41PrrfdmXiL._AC_US40_.jpg', 'large': 'https://m.media-amazon.com/images/I/41PrrfdmXiL._AC_.jpg', 'variant': 'MAIN', 'hi_res': 'https://m.media-amazon.com/images/I/710UfRPEckS._AC_UL1500_.jpg'}, {'thumb': 'https://m.media-amazon.com/images/I/51qnfppvOyL._AC_US40_.jpg', 'large': 'https://m.media-amazon.com/images/I/51qnfppvOyL._AC_.jpg', 'variant': 'FRNT', 'hi_res': None}, {'thumb': 'https://m.media-amazon.com/images/I/411wkSB7WPL._AC_US40_.jpg', 'large': 'https://m.media-amazon.com/images/I/411wkSB7WPL._AC_.jpg', 'variant': 'BACK', 'hi_res': None}, {'thumb': 'https://m.media-amazon.com/images/I/41u7G2n-CaL._AC_US40_.jpg', 'large': 'https://m.media-amazon.com/images/I/41u7G2n-CaL._AC_.jpg', 'variant': 'BOTT', 'hi_res': 'https://m.media-amazon.com/images/I/71V8WrLg6KS._AC_UL1500_.jpg'}, {'thumb': 'https://m.media-amazon.com/images/I/41tRjL2vzGL._AC_US40_.jpg', 'large': 'https://m.media-amazon.com/images/I/41tRjL2vzGL._AC_.jpg', 'variant': 'T

In [44]:
import requests
from tqdm import tqdm
# if not os.path.exists(DOCID_TO_IMAGE_PATH):
docid_to_image = {}
with gzip.open(COMBINE_PATH, mode = 'rt', newline = '') as f:
    for line in tqdm(f):
        data = json.loads(line)
        if data['docid'] == 466888:
            docid_to_image[data['docid']] = data['images'][0]['hi_res']
        elif len(data['images']) > 0 and data['images'][0]['variant'] != 'FSCH':
            # try:
            #     response = requests.get(data['images'][0]['large'], stream=True)
            #     response.raise_for_status()
            #     docid_to_image[data['docid']] = data['images'][0]['large']
            # except requests.exceptions.RequestException as e:
            #     print(f"Error fetching image from 'large': {e}")
                
            #     try:
            #         response = requests.get(data['images'][0]['hi_res'], stream=True)
            #         response.raise_for_status()
            #         docid_to_image[data['docid']] = data['images'][0]['hi_res']
            #     except requests.exceptions.RequestException as e:
            #         print(f"Error fetching image from 'hi_res': {e}")
            #         docid_to_image[data['docid']] = None
            docid_to_image[data['docid']] = data['images'][0]['large']
        else:
            docid_to_image[data['docid']] = ""
pickle.dump(docid_to_image,
            open(DOCID_TO_IMAGE_PATH, 'wb')
)
# else:
#     docid_to_image = pickle.load(open(DOCID_TO_IMAGE_PATH, 'rb'))

0it [00:00, ?it/s]

493293it [00:16, 30581.47it/s]


In [47]:
docid_to_image[5174]

'https://m.media-amazon.com/images/I/41PiajdC5oL.jpg'

In [12]:
if not os.path.exists(DOCID_TO_ASIN_PATH):
    docid_to_asin = {}
    with gzip.open(COMBINE_PATH, mode = 'rt', newline = '') as f:
        for line in f:
            data = json.loads(line)
            docid_to_asin[data['docid']] = data['link'].split("www.amazon.com/dp/", 1)[1]
    pickle.dump(docid_to_asin,
                open(DOCID_TO_ASIN_PATH, 'wb')
    )
else:
    docid_to_asin = pickle.load(open(DOCID_TO_ASIN_PATH, 'rb'))

In [24]:
if not os.path.exists(DOCID_TO_DESC_PATH):
    docid_to_desc = {}
    with gzip.open(COMBINE_PATH, mode = 'rt', newline = '') as f:
        for line in f:
            data = json.loads(line)
            docid_to_desc[data['docid']] = data['description']
    pickle.dump(docid_to_desc,
                open(DOCID_TO_DESC_PATH, 'wb')
    )
else:
    docid_to_desc = pickle.load(open(DOCID_TO_DESC_PATH, 'rb'))

In [25]:
docid_to_desc

{1: 'Material: 304 Stainless Steel; Brass tip Lengths Available: 88mm, 93mm, 98mm Accepts cartridge needles with vice style tattoo machines Works perfectly with Precision Disposable Soft Cartridge Grips Price per one bag of 10 plungers The Precision Plunger Bars are designed to work seamlessly with the\xa0Precision Disposable 1. 25" Contoured Soft Cartridge Grips\xa0and the\xa0Precision Disposable 1" Textured Soft Cartridge Grips\xa0to drive cartridge needles with vice style or standard tattoo machine setups. These plunger bars are manufactured from 304 Stainless Steel and feature a brass tip. The plungers are sold in a bag of ten in your choice of 88mm, 93mm, or 98mm length.',
 2: "The false toenails are durable with perfect length. You have the option to wear them long or clip them short, easy to trim and file them to in any length and shape you like. ABS is kind of green enviromental material, and makes the nails durable, breathable, light even no pressure on your own nails. Fit wel

In [3]:
if not os.path.exists(DOCID_TO_PRICE_PATH):
    docid_to_price = {}
    with gzip.open(COMBINE_PATH, mode = 'rt', newline = '') as f:
        for line in f:
            data = json.loads(line)
            docid_to_price[data['docid']] = data['price']
    pickle.dump(docid_to_price,
                open(DOCID_TO_PRICE_PATH, 'wb')
    )
else:
    docid_to_price = pickle.load(open(DOCID_TO_PRICE_PATH, 'rb'))

In [5]:
price_count = 0
for id, price in docid_to_price.items():
    if price is not None:
        price_count += 1
price_count

48602

In [7]:
if not os.path.exists(DOCID_TO_RATING_PATH):
    docid_to_rating = {}
    with gzip.open(COMBINE_PATH, mode = 'rt', newline = '') as f:
        for line in f:
            data = json.loads(line)
            docid_to_rating[data['docid']] = data['average_rating']
    pickle.dump(docid_to_rating,
                open(DOCID_TO_RATING_PATH, 'wb')
    )
else:
    docid_to_rating = pickle.load(open(DOCID_TO_RATING_PATH, 'rb'))

In [8]:
rating_count = 0
for id, rating in docid_to_rating.items():
    if rating is not None:
        rating_count += 1
rating_count

493293

In [11]:
from transformers import pipeline

# Load a pre-trained sentiment analysis model
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# List of product descriptions
# descriptions = [
#     "This product is made from 100% recycled materials and is fully biodegradable.",
#     "Packaged in single-use plastics and contains harmful chemicals.",
#     "Crafted with sustainably sourced organic cotton and environmentally friendly dyes.",
#     "Energy-intensive production methods are used to manufacture this product.",
#     "Designed to be reusable and significantly reduce waste in landfills."
# ]
descriptions = [desc for docid, desc in docid_to_desc.items() if docid < 20]

# Analyze sentiment for each description
results = []
for desc in descriptions:
    sentiment = sentiment_analyzer(desc)
    label = sentiment[0]['label']  # 'POSITIVE' or 'NEGATIVE'
    score = sentiment[0]['score']  # Confidence score
    eco_friendly = label == "POSITIVE"  # Classify as eco-friendly if sentiment is positive
    results.append({
        "description": desc,
        "sentiment": label,
        "confidence": score,
        "eco_friendly": eco_friendly
    })

# Display results
for result in results:
    print(f"Description: {result['description']}")
    print(f"  Sentiment: {result['sentiment']} (Confidence: {result['confidence']:.2f})")
    print(f"  Eco-Friendly: {result['eco_friendly']}")
    print()

Description: Material: 304 Stainless Steel; Brass tip Lengths Available: 88mm, 93mm, 98mm Accepts cartridge needles with vice style tattoo machines Works perfectly with Precision Disposable Soft Cartridge Grips Price per one bag of 10 plungers The Precision Plunger Bars are designed to work seamlessly with the Precision Disposable 1. 25" Contoured Soft Cartridge Grips and the Precision Disposable 1" Textured Soft Cartridge Grips to drive cartridge needles with vice style or standard tattoo machine setups. These plunger bars are manufactured from 304 Stainless Steel and feature a brass tip. The plungers are sold in a bag of ten in your choice of 88mm, 93mm, or 98mm length.
  Sentiment: POSITIVE (Confidence: 0.99)
  Eco-Friendly: True

Description: The false toenails are durable with perfect length. You have the option to wear them long or clip them short, easy to trim and file them to in any length and shape you like. ABS is kind of green enviromental material, and makes the nails durab

In [39]:
ranker = Ranker(main_index, preprocessor, stopwords, BM25(main_index))

import pandas as pd
import requests
import random
beauty_queries = ["Hydrating face serum",
"Organic lip balm",
"Sunscreen spf 50",
"Matte foundation",
"Hair repair oil",
"Anti-aging night cream for sensitive skin",
"Cruelty-free makeup set",
"Gentle facial cleanser with natural ingredients",
"Long-lasting waterproof mascara",
"Shampoo and conditioner set for curly hair",
]
fashion_queries = ["Maxi dress",
"Crop top",
"V-neck t-shirt",
"Gray baggy jeans",
"Wool scarf",
"Running shoes with cushions",
"Lightweight travel backpack",
"High-waisted leggings with pockets",
"Casual blazer for men in slim fit style",
"Kids’ winter coat waterproof",
]

def check_amazon_item_exists(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    time.sleep(random.uniform(2, 5))
    response = requests.get(url, headers=headers)

    # Check if the page exists based on the HTTP status code
    if response.status_code == 200:
        if "currently unavailable" in response.text or "couldn't find that page" in response.text:
            return False
        else:
            return True
    elif response.status_code == 404:
        return False
    else:
        return False

# for beauty_query in beauty_queries:
#     doc_lst = ranker.query(beauty_query)[:N_DOC_NEEDED]
#     df = pd.DataFrame(columns=['query','title','docid','link','rel'])
#     for i in range(len(doc_lst)):
#         df.loc[i] = [beauty_query, docid_to_title[doc_lst[i][0]], doc_lst[i][0], docid_to_link[doc_lst[i][0]], None]
#     df.to_csv(beauty_query+'.csv', index=False)
    
for beauty_query in beauty_queries:
    doc_lst = ranker.query(beauty_query)
    df = pd.DataFrame(columns=['query', 'title', 'docid', 'link', 'rel'])

    valid_docs_count = 0
    for i in range(len(doc_lst)):
        docid = doc_lst[i][0]
        url = docid_to_link[docid]
        
        # Check if the URL is valid
        if check_amazon_item_exists(url):
            # Add the document to the DataFrame if URL is valid
            df.loc[valid_docs_count] = [beauty_query, docid_to_title[docid], docid, url, None]
            valid_docs_count += 1
            
            # Stop if we reach the required number of valid docs
            if valid_docs_count >= N_DOC_NEEDED:
                break

    # Save to CSV after collecting enough valid documents
    df.to_csv(f"{beauty_query}.csv", index=False)

# for fashion_query in fashion_queries:
#     doc_lst = ranker.query(fashion_query)[:N_DOC_NEEDED]
#     df = pd.DataFrame(columns=['query','title','docid','link','rel'])
#     for i in range(len(doc_lst)):
#         df.loc[i] = [fashion_query, docid_to_title[doc_lst[i][0]], doc_lst[i][0], docid_to_link[doc_lst[i][0]], None]
#     df.to_csv(fashion_query+'.csv', index=False)
    
for fashion_query in fashion_queries:
    doc_lst = ranker.query(fashion_query)
    df = pd.DataFrame(columns=['query', 'title', 'docid', 'link', 'rel'])

    valid_docs_count = 0
    for i in range(len(doc_lst)):
        docid = doc_lst[i][0]
        url = docid_to_link[docid]
        
        # Check if the URL is valid
        if check_amazon_item_exists(url):
            # Add the document to the DataFrame if URL is valid
            df.loc[valid_docs_count] = [fashion_query, docid_to_title[docid], docid, url, None]
            valid_docs_count += 1
            
            # Stop if we reach the required number of valid docs
            if valid_docs_count >= N_DOC_NEEDED:
                break

    # Save to CSV after collecting enough valid documents
    df.to_csv(f"{fashion_query}.csv", index=False)

In [13]:
from sentence_transformers import SentenceTransformer, util
from PIL import Image
import requests

In [17]:
model = SentenceTransformer('clip-ViT-B-32')

def get_image(url):
    return Image.open(requests.get(url, stream=True).raw)

doc1_img_emb = model.encode(get_image('https://m.media-amazon.com/images/I/31TgqAZ8kQL.jpg'))

text_emb = model.encode('plunger bars')

cos_scores = util.cos_sim(doc1_img_emb, text_emb)
print(cos_scores)

tensor([[0.2760]])


In [3]:
import pandas as pd
folder_path = 'annotated_files/'
all_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')]
combined_df = pd.concat([pd.read_csv(file) for file in all_files], ignore_index=True)

In [5]:
combined_df = combined_df.iloc[:,:-1]
combined_df

Unnamed: 0,query,title,docid,link,rel
0,V-neck t-shirt,"Vedolay Women Short Sleeve Tops, Womens Leopar...",150313,https://www.amazon.com/dp/B086C7WLCY,3.0
1,V-neck t-shirt,Haola Women's Summer Loose Fit V Neck Short Sl...,149693,https://www.amazon.com/dp/B083QZZKG4,4.0
2,V-neck t-shirt,Haola Women's Summer Loose Fit V Neck Short Sl...,326228,https://www.amazon.com/dp/B083R1YTPS,4.0
3,V-neck t-shirt,Tommy Hilfiger Women's Short Sleeve V-Neck T-S...,70531,https://www.amazon.com/dp/B08P2QXKQM,5.0
4,V-neck t-shirt,Grimm Storytime Is Over Ladies Junior Fit V-Ne...,466056,https://www.amazon.com/dp/B00NF16LMM,5.0
...,...,...,...,...,...
2998,Hair repair oil,UNA Hair Food Jojoba Oil Hair Treatment 34oz(1...,22061,https://www.amazon.com/dp/B002KM8C7O,3.0
2999,Hair repair oil,"K I.C.O.N. I.C.O.N. India Curl Cream, Wave and...",14938,https://www.amazon.com/dp/B00DGXXPF0,3.0
3000,Hair repair oil,"Briogeo Don't Despair, Repair! Overnight Repai...",10478,https://www.amazon.com/dp/B01JQCTES6,3.0
3001,Hair repair oil,Knight's Exclusives Hair & Body Oil 4oz,20244,https://www.amazon.com/dp/B079NRRBJB,3.0


In [8]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(combined_df, test_size=1 - 0.7, random_state=650)
train_df.to_csv('data/training_set.csv', index=False)
test_df.to_csv('data/testing_set.csv', index=False)

In [13]:
KEYWORD_LABELED_PATH = DATA_PATH + 'eco_keyword_labeled.jsonl.gz'
SENTIMENT_LABELED_PATH = DATA_PATH + 'sentiment_labeled.jsonl.gz'

In [6]:
count = 0
keyword_labeled_dict = {}
with gzip.open(KEYWORD_LABELED_PATH, mode = 'rt', newline = '') as f:
    for line in f:
        data = json.loads(line)
        count += 1
        if count > 20000:
            break
        keyword_labeled_dict[data['docid']] = data['eco_friendly']

In [18]:
count = 0
sentiment_labeled_dict = {}
with gzip.open(SENTIMENT_LABELED_PATH, mode = 'rt', newline = '') as f:
    for line in f:
        data = json.loads(line)
        count += 1
        if count > 20000:
            break
        sentiment_labeled_dict[data['docid']] = data['eco_friendly']

In [10]:
keyword_labeled_dict

{1: False,
 2: True,
 3: False,
 4: False,
 5: False,
 6: True,
 7: True,
 8: False,
 9: False,
 10: False,
 11: True,
 12: False,
 13: False,
 14: False,
 15: False,
 16: True,
 17: True,
 18: False,
 19: False,
 20: False,
 21: False,
 22: True,
 23: False,
 24: False,
 25: False,
 26: True,
 27: False,
 28: False,
 29: False,
 30: False,
 31: False,
 32: True,
 33: False,
 34: False,
 35: False,
 36: False,
 37: False,
 38: False,
 39: False,
 40: True,
 41: True,
 42: False,
 43: False,
 44: False,
 45: False,
 46: False,
 47: True,
 48: True,
 49: False,
 50: True,
 51: False,
 52: True,
 53: True,
 54: True,
 55: False,
 56: True,
 57: False,
 58: True,
 59: False,
 60: False,
 61: False,
 62: False,
 63: False,
 64: False,
 65: False,
 66: False,
 67: False,
 68: True,
 69: False,
 70: False,
 71: True,
 72: False,
 73: False,
 74: False,
 75: False,
 76: False,
 77: False,
 78: False,
 79: False,
 80: False,
 81: False,
 82: False,
 83: False,
 84: False,
 85: False,
 86: True,

In [19]:
sentiment_labeled_dict

{1: False,
 2: True,
 3: False,
 4: False,
 5: False,
 6: True,
 7: False,
 8: False,
 9: False,
 10: False,
 11: True,
 12: False,
 13: True,
 14: False,
 15: True,
 16: True,
 17: True,
 18: False,
 19: False,
 20: False,
 21: False,
 22: True,
 23: False,
 24: False,
 25: False,
 26: True,
 27: False,
 28: False,
 29: False,
 30: False,
 31: False,
 32: False,
 33: False,
 34: False,
 35: False,
 36: False,
 37: False,
 38: False,
 39: False,
 40: True,
 41: False,
 42: False,
 43: False,
 44: False,
 45: False,
 46: False,
 47: True,
 48: False,
 49: False,
 50: True,
 51: False,
 52: True,
 53: True,
 54: True,
 55: False,
 56: True,
 57: False,
 58: False,
 59: False,
 60: False,
 61: False,
 62: False,
 63: False,
 64: False,
 65: False,
 66: False,
 67: False,
 68: False,
 69: False,
 70: False,
 71: True,
 72: False,
 73: False,
 74: False,
 75: False,
 76: False,
 77: False,
 78: False,
 79: False,
 80: False,
 81: False,
 82: False,
 83: False,
 84: False,
 85: False,
 86: F

In [22]:
difference_lst = []
for i in range(1,20001):
    if keyword_labeled_dict[i] != sentiment_labeled_dict[i]:
        difference_lst.append((i, keyword_labeled_dict[i], sentiment_labeled_dict[i]))
        
print(difference_lst)

[(7, True, False), (13, False, True), (15, False, True), (32, True, False), (41, True, False), (48, True, False), (58, True, False), (68, True, False), (86, True, False), (91, True, False), (92, True, False), (99, True, False), (106, False, True), (123, False, True), (133, True, False), (134, True, False), (141, True, False), (146, True, False), (158, True, False), (165, False, True), (169, True, False), (184, True, False), (185, False, True), (186, True, False), (187, True, False), (201, False, True), (202, False, True), (220, True, False), (236, True, False), (240, True, False), (244, True, False), (245, True, False), (250, False, True), (265, True, False), (281, True, False), (293, True, False), (294, True, False), (318, False, True), (322, True, False), (326, True, False), (344, True, False), (345, True, False), (375, True, False), (386, False, True), (391, False, True), (407, True, False), (443, False, True), (444, True, False), (461, True, False), (462, False, True), (467, True, 

In [30]:
docid_to_desc[41]

'THE SECRET TO SHINE – Do you fill with envy every time you see people that seem blessed with beautiful hair? It’s time to take control of your strands! Get the commercial-worthy hair that turns heads, by treating your hair to the tress-taming magic of Osensia’s Argan Oil and Keratin hair treatment mask. Watch as those eyes of envy turn to you, and decide whether you want to let them in on the Osensia secret. GIVE YOUR HAIR WHAT IT CRAVES – Every day your hair deals with a damaging onslaught from hot tools to environmental pollutants, so you shouldn’t be surprised that it lacks the luster you dream about. Luckily, with Osensia’s repair conditioner treatment you can restore, strengthen, and protect for longer, thicker, faster growing locks. Discover the hair you were born to have. NATURALLY STUNNING STRANDS – Tired of spending top dollar on salon brands that don’t live up to their promises? We packed Osensia’s Sulfate Free, Paraben Free intensive conditioning formula full of some of nat