In [38]:
import pandas as pd

# Lift pandas' display limits: passing None removes the cap on the number of
# rows shown, the number of columns shown, and the width of each column's text.
for display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.max_colwidth',
):
    pd.set_option(display_option, None)

# Read the dataset from the working directory and show its (rows, columns).
df = pd.read_csv('./Laptrack_1.csv')
df.shape

(4551, 23)

In [39]:
# List the column labels of the loaded frame.
df.columns

Index(['brand', 'laptop_model_name', 'laptop_model_number', 'processor_brand',
       'processor_model', 'storage_type', 'operating_system',
       'display_resolution', 'extracted_rating', 'battery_life_hours_upto',
       'price', 'stock', 'time_of_extraction', 'url', 'source',
       'storage_capacity_gb', 'display_size_inches', 'ram_gb', 'no_of_reviews',
       'laptop_dimensions', 'laptop_weight_pounds', 'image_src', 'title'],
      dtype='object')

In [40]:
# Distinct brand values, each converted to str so that a missing value
# (shown as 'nan') can be sorted together with the real names.
sorted(str(brand) for brand in df['brand'].unique())

['ACEMAGIC',
 'ANPCOWER',
 'ASUS',
 'AXL',
 'Acer',
 'AimCare',
 'Akocrsiy',
 'Alienware',
 'ApoloSign',
 'Apple',
 'Avita',
 'BiTECOOL',
 'CHUWI',
 'Colorful',
 'Coolby',
 'Dell',
 'ECOHERO',
 'FUNYET',
 'GIGABYTE',
 'Gateway',
 'Getac',
 'Gina Joyfurno',
 'HP',
 'IJKKJI',
 'INHONLAP',
 'Infinix',
 'KAIGERR',
 'KOOFORWAY',
 'LETSUNG',
 'LG',
 'Lapbook',
 'Lenovo',
 'MSI',
 'Maxsignage',
 'Microsoft',
 'Molegar',
 'NIAKUN',
 'Naclud',
 'Nmybwo',
 'Oemgenuine',
 'Panasonic',
 'Primebook',
 'Razer',
 'SAINTDISE',
 'SAIWAN',
 'SAMSUNG',
 'SGIN',
 'THKGRCE',
 'TPV',
 'Thomson',
 'ULTIMUS',
 'VGKE',
 'Vaio',
 'WINGS',
 'WIPEMIK',
 'ZEBRONICS',
 'ZENAERO',
 'ZHAOHUIXIN',
 'Zuleisy',
 'ist computers',
 'jumper',
 'nan',
 'realme',
 'walker']

# spaCy NER (named-entity recognition)

In [41]:
import spacy
from spacy import displacy

In [42]:
# Load the spaCy pipeline named 'en_core_web_lg' and echo the resulting object.
nlp = spacy.load('en_core_web_lg')
nlp

<spacy.lang.en.English at 0x14b920e50>

In [43]:
# Run the pipeline on a sample sentence and echo the resulting Doc.
# NOTE(review): 'Donal' looks like a typo for 'Donald'; the entity output
# further down still reports it as an entity, so it is left as-is here.
doc = nlp('Donal Trump was the President of USA')
doc

Donal Trump was the President of USA

In [44]:
# Show the Python type of the object the pipeline returned.
type(doc)

spacy.tokens.doc.Doc

In [45]:
# Entities the pipeline recognised in the sample sentence.
doc.ents

(Donal Trump, USA)

In [46]:
# First recognised entity together with its Python type.
doc.ents[0],type(doc.ents[0])

(Donal Trump, spacy.tokens.span.Span)

In [47]:
# Render the sample sentence with its entities highlighted (displaCy "ent" style).
displacy.render(doc,style="ent",jupyter=True)

In [48]:
# Title text of the first row.
df['title'].head(1)

0    TPV 15.6" Laptop Computer (Intel Core i5 / 16GB RAM/ 512GB SSD), MS Office 2024, FHD Display with 100% sRGB Color Gamut, Windows 11 Pro Notebook PC with Dual Band Wi-Fi, Webcam (Silver)
Name: title, dtype: object

In [49]:
# Run the pipeline on the title of the row at position 10 and render the
# entities it finds.
displacy.render(nlp(df.iloc[10]['title']),style="ent",jupyter=True)

In [50]:
# Number of rows whose brand is missing.
df['brand'].isna().sum()

np.int64(1)

In [51]:
# URL(s) of the rows that have no brand. Despite its name, `title` holds the
# 'url' column, not the 'title' column.
brand_is_missing = df['brand'].isna()
title = df.loc[brand_is_missing, 'url']
title

3553    https://www.bestbuy.com/site/15-6-full-hd-touch-screen-laptop-intel-core-i7-16gb-memory-512gb-ssd-natural-silver/6587202.p?skuId=6587202
Name: url, dtype: object

In [55]:
# Model numbers of the first five rows whose source is Amazon.
df.loc[df['source'] == 'Amazon', 'laptop_model_number'].head()

0    AceBook_Silver
1           660 G11
2                HP
3        82R400EMUS
4       G614JV-AS74
Name: laptop_model_number, dtype: object

In [73]:
import pandas as pd
from collections import defaultdict


def _group_by_substring(models):
    """Group positions whose model strings contain one another.

    Walks ``models`` in order. Every non-blank entry that has not been grouped
    yet becomes the anchor of a new group and collects each later,
    still-ungrouped, non-blank entry that is a substring of the anchor or that
    contains the anchor as a substring. Matching is against the anchor only
    (it is not transitive), so the result depends on row order.

    Args:
        models: sequence of normalised strings; '' marks a missing value.

    Returns:
        dict mapping "group_<n>" (n starts at 1, in anchor order) to the sorted
        list of positions in that group. Groups are disjoint, and blank entries
        belong to no group.
    """
    groups = {}
    assigned = set()
    total = len(models)
    for anchor_pos in range(total):
        anchor = models[anchor_pos]
        # '' is a substring of every string, so a blank entry would match
        # everything; blanks are therefore never anchors and never members.
        if not anchor or anchor_pos in assigned:
            continue
        members = [anchor_pos]
        # Every non-blank position before the anchor is already assigned
        # (it was either an anchor itself or collected by one), so only later
        # positions need to be scanned.
        for other_pos in range(anchor_pos + 1, total):
            other = models[other_pos]
            if not other or other_pos in assigned:
                continue
            if anchor in other or other in anchor:
                members.append(other_pos)
        assigned.update(members)
        groups[f"group_{len(groups) + 1}"] = sorted(members)
    return groups


# Step 1: Clean the column (missing -> '', trimmed, lower-cased). astype(str)
# keeps the substring test valid if any value is not already a string.
df = df.copy()
df['laptop_model_number_temp'] = (
    df['laptop_model_number'].fillna('').astype(str).str.strip().str.lower()
)

# Step 2: Check for substrings and group (positions, not index labels)
groups = _group_by_substring(df['laptop_model_number_temp'].tolist())

# Step 3: Map groups back to the DataFrame. Labels are built positionally and
# assigned in one go, so this does not depend on df having a default index.
# Rows with a blank model number keep group None.
group_labels = [None] * len(df)
for group_name, positions in groups.items():
    for position in positions:
        group_labels[position] = group_name
df['group'] = group_labels

# View the DataFrame with groups
df.head(5)

Unnamed: 0,brand,laptop_model_name,laptop_model_number,processor_brand,processor_model,storage_type,operating_system,display_resolution,extracted_rating,battery_life_hours_upto,price,stock,time_of_extraction,url,source,storage_capacity_gb,display_size_inches,ram_gb,no_of_reviews,laptop_dimensions,laptop_weight_pounds,image_src,title,laptop_model_number_temp,group
0,TPV,AceBook,AceBook_Silver,Intel,Core i5,SSD,Windows 11 Pro,1920 x 1080 Pixels,4.4,5.0,309.99,True,2024-12-05 12:15:37,https://www.amazon.com/sspa/click?ie=UTF8&spc=MToyNTU2ODExODc2NTY2MDA3OjE3MjY5OTc5MTM6c3BfYXRmX2Jyb3dzZTozMDAzMzE1MDUzNjc5MDI6OjA6Og&url=%2FTPV-Computer-Display-Windows-Notebook%2Fdp%2FB0D87RK5Q8%2Fref%3Dsr_1_2_sspa%3Fdib%3DeyJ2IjoiMSJ9.Mxv-LfaT1mRTkqi6GWEFXxFggO64cMc5a5WQAxAoDYKDc12AZYR8P_ulvGvs2fWDJ7_Nm3Q_vhpmjYCsv0OJPJs6Bo1FRX66cFxFfjDS5M6onhimzcAeCOZ3ganbR1ztxCB3tNO3H2yyijUubD6xTB3G5UxB2MqPQQaHrdLyLai29xSPy1hZkKf5Sm2MjOm9tSgk53w2mGq_T8vokhTRQYuN1uCwbBymaj5IEXp_6tzKZ-DZ8lOcjCmWHFWVn2fkST3y58q3_y3AxTbeKQumI-hyzZmMa7tonKGyaYVia00.eFMZpiqa4BbfF8ul13G1oFWFR-jOJGFy_-1DtQHxgmY%26dib_tag%3Dse%26qid%3D1726997913%26s%3Dpc%26sr%3D1-2-spons%26sp_csd%3Dd2lkZ2V0TmFtZT1zcF9hdGZfYnJvd3Nl%26psc%3D1,Amazon,512,15.6,16,52,14.09 x 8.97 x 0.86 inches,3.63,https://m.media-amazon.com/images/I/818cXcaog9L.__AC_SX300_SY300_QL70_FMwebp_.jpg,"TPV 15.6"" Laptop Computer (Intel Core i5 / 16GB RAM/ 512GB SSD), MS Office 2024, FHD Display with 100% sRGB Color Gamut, Windows 11 Pro Notebook PC with Dual Band Wi-Fi, Webcam (Silver)",acebook_silver,group_1
1,HP,Elitebook,660 G11,Intel,Core i7,SSD,Windows 11 Pro,1920 x 1200 Pixels,5.0,,1089.0,True,2024-12-05 12:15:41,https://www.amazon.com/sspa/click?ie=UTF8&spc=MToyNTU2ODExODc2NTY2MDA3OjE3MjY5OTc5MTM6c3BfYXRmX2Jyb3dzZTozMDAzODcyNTQ1NzUwMDI6OjA6Og&url=%2FHP-Elitebook-660-G11-Fingerprint%2Fdp%2FB0DFWDQS11%2Fref%3Dsr_1_3_sspa%3Fdib%3DeyJ2IjoiMSJ9.Mxv-LfaT1mRTkqi6GWEFXxFggO64cMc5a5WQAxAoDYKDc12AZYR8P_ulvGvs2fWDJ7_Nm3Q_vhpmjYCsv0OJPJs6Bo1FRX66cFxFfjDS5M6onhimzcAeCOZ3ganbR1ztxCB3tNO3H2yyijUubD6xTB3G5UxB2MqPQQaHrdLyLai29xSPy1hZkKf5Sm2MjOm9tSgk53w2mGq_T8vokhTRQYuN1uCwbBymaj5IEXp_6tzKZ-DZ8lOcjCmWHFWVn2fkST3y58q3_y3AxTbeKQumI-hyzZmMa7tonKGyaYVia00.eFMZpiqa4BbfF8ul13G1oFWFR-jOJGFy_-1DtQHxgmY%26dib_tag%3Dse%26qid%3D1726997913%26s%3Dpc%26sr%3D1-3-spons%26sp_csd%3Dd2lkZ2V0TmFtZT1zcF9hdGZfYnJvd3Nl%26psc%3D1,Amazon,2048,16.0,32,6,14.14 x 9.20 x 0.78 inches,3.85,https://m.media-amazon.com/images/I/71nSy8Uh8iL.__AC_SX300_SY300_QL70_FMwebp_.jpg,"HP Elitebook 660 G11 16"" FHD+ Business AI Laptop Computer, Intel Ultra 7 155U (Beat i7-1355U), 32GB DDR5 RAM, 1TB PCIe SSD, WiFi 6E, Backlit Keyboard, Fingerprint Reader, Windows 11 Pro Vent-HEA",660 g11,group_2
2,HP,HP Laptop,HP,Intel,Celeron N,SSD,Windows 11 Home,1366 x 768 Pixels,4.1,11.0,265.99,True,2024-12-05 12:15:53,https://www.amazon.com/HP-Students-Business-Quad-Core-Storage/dp/B0B2D77YB8/ref=sr_1_6?dib=eyJ2IjoiMSJ9.Mxv-LfaT1mRTkqi6GWEFXxFggO64cMc5a5WQAxAoDYKDc12AZYR8P_ulvGvs2fWDJ7_Nm3Q_vhpmjYCsv0OJPJs6Bo1FRX66cFxFfjDS5M6onhimzcAeCOZ3ganbR1ztxCB3tNO3H2yyijUubD6xTB3G5UxB2MqPQQaHrdLyLai29xSPy1hZkKf5Sm2MjOm9tSgk53w2mGq_T8vokhTRQYuN1uCwbBymaj5IEXp_6tzKZ-DZ8lOcjCmWHFWVn2fkST3y58q3_y3AxTbeKQumI-hyzZmMa7tonKGyaYVia00.eFMZpiqa4BbfF8ul13G1oFWFR-jOJGFy_-1DtQHxgmY&dib_tag=se&qid=1726997913&s=pc&sr=1-6,Amazon,64,14.0,8,878,8.86 x 12.76 x 0.71 inches,3.24,https://m.media-amazon.com/images/I/81divYKpeTL.__AC_SY300_SX300_QL70_FMwebp_.jpg,"HP 14"" Ultral Light Laptop for Students and Business, Intel Quad-Core, 8GB RAM, 192GB Storage(64GB eMMC+128GB Ghost Manta SD Card), 1 Year Office 365, USB C, Win 11 S",hp,group_3
3,Lenovo,"Ideapad 1 15"" - 82R400EMUS",82R400EMUS,AMD,Ryzen 5,SSD,Windows 11 Home,1920x1080 Pixels,4.4,10.0,379.0,True,2024-12-05 12:15:57,https://www.amazon.com/Lenovo-IdeaPad-Ryzen5-5500U-1920x1080-Storage/dp/B0CJB5N9BQ/ref=sr_1_7?dib=eyJ2IjoiMSJ9.Mxv-LfaT1mRTkqi6GWEFXxFggO64cMc5a5WQAxAoDYKDc12AZYR8P_ulvGvs2fWDJ7_Nm3Q_vhpmjYCsv0OJPJs6Bo1FRX66cFxFfjDS5M6onhimzcAeCOZ3ganbR1ztxCB3tNO3H2yyijUubD6xTB3G5UxB2MqPQQaHrdLyLai29xSPy1hZkKf5Sm2MjOm9tSgk53w2mGq_T8vokhTRQYuN1uCwbBymaj5IEXp_6tzKZ-DZ8lOcjCmWHFWVn2fkST3y58q3_y3AxTbeKQumI-hyzZmMa7tonKGyaYVia00.eFMZpiqa4BbfF8ul13G1oFWFR-jOJGFy_-1DtQHxgmY&dib_tag=se&qid=1726997913&s=pc&sr=1-7,Amazon,512,15.6,8,639,9.29 x 14.17 x 0.70 inches,3.52,https://m.media-amazon.com/images/I/51h3oOo7XnL.__AC_SX300_SY300_QL70_FMwebp_.jpg,"Lenovo IdeaPad 1 Laptop, 15.6” FHD Display, AMD Ryzen 5 5500U, 8GB RAM, 512GB SSD, Windows 11 Home, 720p Camera w/Privacy Shutter, Smart Noise Cancelling, Cloud Grey",82r400emus,group_4
4,ASUS,Strix G16,G614JV-AS74,Intel,Core i7,SSD,Windows 11 Home,1366 x 768 pixels,4.3,,1267.46,True,2024-12-05 12:16:21,https://www.amazon.com/ASUS-ROG-Strix-Gaming-Laptop/dp/B0CRDCXRK2/ref=sr_1_8?dib=eyJ2IjoiMSJ9.Mxv-LfaT1mRTkqi6GWEFXxFggO64cMc5a5WQAxAoDYKDc12AZYR8P_ulvGvs2fWDJ7_Nm3Q_vhpmjYCsv0OJPJs6Bo1FRX66cFxFfjDS5M6onhimzcAeCOZ3ganbR1ztxCB3tNO3H2yyijUubD6xTB3G5UxB2MqPQQaHrdLyLai29xSPy1hZkKf5Sm2MjOm9tSgk53w2mGq_T8vokhTRQYuN1uCwbBymaj5IEXp_6tzKZ-DZ8lOcjCmWHFWVn2fkST3y58q3_y3AxTbeKQumI-hyzZmMa7tonKGyaYVia00.eFMZpiqa4BbfF8ul13G1oFWFR-jOJGFy_-1DtQHxgmY&dib_tag=se&qid=1726997913&s=pc&sr=1-8,Amazon,2048,16.0,16,473,13.94 x 0.89 x 10.39 inches,5.51,https://m.media-amazon.com/images/I/81GrCeuCzxL.__AC_SY300_SX300_QL70_FMwebp_.jpg,"ASUS ROG Strix G16 (2024) Gaming Laptop, 16” 16:10 FHD 165Hz Display, NVIDIA® GeForce RTX™ 4060, Intel Core i7-13650HX, 16GB DDR5, 1TB PCIe Gen4 SSD, Wi-Fi 6E, Windows 11, G614JV-AS74",g614jv-as74,group_5


In [64]:
# Number of groups produced by the substring grouping.
len(groups)

1623

In [74]:
# Inspect one group of a chosen size.
TARGET_GROUP_SIZE = 5   # size of the groups to look for
GROUP_POSITION = 1      # zero-based position among the groups of that size

# Rebuild the per-group row counts here so the cell runs on a fresh kernel
# instead of relying on a `group_sizes` left over from a cell that is not in
# the notebook. value_counts() skips rows whose group is missing.
# NOTE(review): the original `group_sizes` was computed somewhere not shown;
# if it was ordered differently, GROUP_POSITION may select a different group.
group_sizes = df['group'].value_counts()
matching_groups = group_sizes[group_sizes == TARGET_GROUP_SIZE]
group_with_size_3 = matching_groups  # legacy name kept for any later cell that reads it

# Compare the length with the requested position (rather than only testing
# for emptiness) so that a single matching group does not raise IndexError.
if len(matching_groups) > GROUP_POSITION:
    group_name = matching_groups.index[GROUP_POSITION]  # Get the name of the group
    group_size = matching_groups.iloc[GROUP_POSITION]   # Get the size of the group

    # Extract rows belonging to the selected group
    group_rows = df[df['group'] == group_name]

    # Output the results
    print(f"Group with size {TARGET_GROUP_SIZE}: {group_name}")
    print(f"Group size: {group_size}")
    print(f"Rows in the group with size {TARGET_GROUP_SIZE}:")
    print(group_rows)
else:
    print(
        f"Fewer than {GROUP_POSITION + 1} group(s) have size {TARGET_GROUP_SIZE}."
    )

Group with size 3: group_129
Group size: 5
Rows in the group with size 3:
       brand  \
222   Lenovo   
1254  Lenovo   
1505  Lenovo   
2537  Lenovo   
4299  Lenovo   

                                                             laptop_model_name  \
222                                                                       21JT   
1254                                                        ThinkPad E16 Gen 1   
1505                                                                      21JT   
2537                                                        ThinkPad E16 Gen 1   
4299  ThinkPad E16 Gen 1 16" Laptop - AMD Ryzen 5 with 16GB memory - 256GB SSD   

     laptop_model_number processor_brand processor_model storage_type  \
222                 21JT             AMD         Ryzen 5          SSD   
1254          21JT001PUS             AMD         Ryzen 5          SSD   
1505                21JT             AMD         Ryzen 5          SSD   
2537          21JT001PUS             AMD    

In [71]:
# Run the pipeline on the title of the row at position 164 and render the
# entities it finds.
displacy.render(nlp(df.iloc[164]['title']),style="ent",jupyter=True)