# Data Preparation

In [42]:
import nltk 
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings("ignore")
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Data Collection

In [43]:
metadata = pd.read_csv('devices_configuration.csv')
metadata.head()

Unnamed: 0,Company,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,OS
0,Acer,Aspire 3,Laptop,15.6,1366x768,Intel Core i3 7130U 2.7GHz,4GB,1TB HDD,Linux
1,Acer,Aspire A515-51G,Laptop,15.6,IPS Panel Full HD 1920x1080,Intel Core i5 8250U 1.6GHz,4GB,256GB SSD,Windows
2,Acer,Aspire A515-51G,Laptop,15.6,IPS Panel Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,8GB,256GB SSD,Windows
3,Acer,Aspire 3,Laptop,15.6,1366x768,Intel Core i3 7100U 2.4GHz,4GB,1TB HDD,Windows
4,Acer,Aspire E5-475,Laptop,14.0,1366x768,Intel Core i3 6006U 2GHz,8GB,1TB HDD,Windows


In [44]:
#summarize of the data structure and content
metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1924 entries, 0 to 1923
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Company      1924 non-null   object 
 1   Model Name   1924 non-null   object 
 2   Category     1924 non-null   object 
 3   Screen Size  1924 non-null   float64
 4   Screen       1924 non-null   object 
 5   CPU          1924 non-null   object 
 6   RAM          1924 non-null   object 
 7   Storage      1924 non-null   object 
 8   OS           1924 non-null   object 
dtypes: float64(1), object(8)
memory usage: 135.4+ KB


In [45]:
#print dimensions of metadata, returns rows and columns
print('Rows x Columns : ', metadata.shape[0], 'x', metadata.shape[1])
#print all the columns name
print('\nFeatures : ', metadata.columns.tolist())
#print number of unique values
print('\nUnique vlaues:')
print(metadata.nunique())

Rows x Columns :  1924 x 9

Features :  ['Company', 'Model Name', 'Category', 'Screen Size', 'Screen', 'CPU', 'RAM', 'Storage', 'OS']

Unique vlaues:
Company          38
Model Name     1295
Category          7
Screen Size     100
Screen          110
CPU             369
RAM              16
Storage          57
OS               72
dtype: int64


# Data Preprocessing

In [46]:
metadata = metadata[['Company','Model Name','Category','CPU','Storage','OS']]
metadata

Unnamed: 0,Company,Model Name,Category,CPU,Storage,OS
0,Acer,Aspire 3,Laptop,Intel Core i3 7130U 2.7GHz,1TB HDD,Linux
1,Acer,Aspire A515-51G,Laptop,Intel Core i5 8250U 1.6GHz,256GB SSD,Windows
2,Acer,Aspire A515-51G,Laptop,Intel Core i7 8550U 1.8GHz,256GB SSD,Windows
3,Acer,Aspire 3,Laptop,Intel Core i3 7100U 2.4GHz,1TB HDD,Windows
4,Acer,Aspire E5-475,Laptop,Intel Core i3 6006U 2GHz,1TB HDD,Windows
...,...,...,...,...,...,...
1919,Samsung,Samsung Galaxy Tab S5e Tablet,Tablet,Qualcomm Snapdragon 670,64GB,Android 9.0
1920,Samsung,Samsung Galaxy Tab S4 SM-T830 Tablet,Tablet,Qualcomm Snapdragon 835,64GB,Android 8.1
1921,Samsung,Samsung Galaxy Tab A 10.1,Tablet,Exynos 7904,32GB,Android 9.0
1922,Swipe,Swipe Slice Tablet,Tablet,MediaTek MT8312,4GB,Android 4.2.2


In [47]:
#Splitting Strings in each rows into lists
metadata['Company'] = metadata['Company'].map(lambda x: x.split(' '))
metadata['Category'] = metadata['Category'].map(lambda x: x.split(' '))
metadata['CPU'] = metadata['CPU'].map(lambda x: x.split(' ')[:3])
metadata['Storage'] = metadata['Storage'].map(lambda x: x.split(' ')[:3])
metadata['OS'] = metadata['OS'].map(lambda x: x.split(' '))

#converting to lowercase and removing spaces
for index, row in metadata.iterrows():
    row['Company'] = [x.lower().replace(' ','') for x in row['Company']]
    row['Category'] = [x.lower().replace(' ','') for x in row['Category']]
    row['CPU'] = [x.lower().replace(' ','') for x in row['CPU']]
    row['Storage'] = [x.lower().replace(' ','') for x in row['Storage']]
    row['OS'] = [x.lower().replace(' ','') for x in row['OS']]
    
metadata

Unnamed: 0,Company,Model Name,Category,CPU,Storage,OS
0,[Acer],Aspire 3,[Laptop],"[Intel, Core, i3]","[1TB, HDD]",[Linux]
1,[Acer],Aspire A515-51G,[Laptop],"[Intel, Core, i5]","[256GB, SSD]",[Windows]
2,[Acer],Aspire A515-51G,[Laptop],"[Intel, Core, i7]","[256GB, SSD]",[Windows]
3,[Acer],Aspire 3,[Laptop],"[Intel, Core, i3]","[1TB, HDD]",[Windows]
4,[Acer],Aspire E5-475,[Laptop],"[Intel, Core, i3]","[1TB, HDD]",[Windows]
...,...,...,...,...,...,...
1919,[Samsung],Samsung Galaxy Tab S5e Tablet,[Tablet],"[Qualcomm, Snapdragon, 670]",[64GB],"[Android, 9.0]"
1920,[Samsung],Samsung Galaxy Tab S4 SM-T830 Tablet,[Tablet],"[Qualcomm, Snapdragon, 835]",[64GB],"[Android, 8.1]"
1921,[Samsung],Samsung Galaxy Tab A 10.1,[Tablet],"[Exynos, 7904]",[32GB],"[Android, 9.0]"
1922,[Swipe],Swipe Slice Tablet,[Tablet],"[MediaTek, MT8312]",[4GB],"[Android, 4.2.2]"


In [48]:
# Define the columns to concatenate
columns = ['Company', 'Category', 'CPU', 'Storage', 'OS']

# Create the 'Bag_of_words' column by concatenating specified columns
metadata['Bag_of_words'] = metadata[columns].apply(
    lambda row: ' '.join([' '.join(col) for col in row if col]).strip(),
    axis=1
)

# Select only the 'Model Name' and 'Bag_of_words' columns
metadata = metadata[['Model Name', 'Bag_of_words']]

# Display the updated DataFrame
metadata

Unnamed: 0,Model Name,Bag_of_words
0,Aspire 3,Acer Laptop Intel Core i3 1TB HDD Linux
1,Aspire A515-51G,Acer Laptop Intel Core i5 256GB SSD Windows
2,Aspire A515-51G,Acer Laptop Intel Core i7 256GB SSD Windows
3,Aspire 3,Acer Laptop Intel Core i3 1TB HDD Windows
4,Aspire E5-475,Acer Laptop Intel Core i3 1TB HDD Windows
...,...,...
1919,Samsung Galaxy Tab S5e Tablet,Samsung Tablet Qualcomm Snapdragon 670 64GB An...
1920,Samsung Galaxy Tab S4 SM-T830 Tablet,Samsung Tablet Qualcomm Snapdragon 835 64GB An...
1921,Samsung Galaxy Tab A 10.1,Samsung Tablet Exynos 7904 32GB Android 9.0
1922,Swipe Slice Tablet,Swipe Tablet MediaTek MT8312 4GB Android 4.2.2


In [49]:
#convert a collection of text documents into a matrix of token couts 
count=CountVectorizer()
#learns the vocabulary and converts the text data into a matrix of token counts
count_matrix=count.fit_transform(metadata['Bag_of_words'])
count_matrix

<1924x377 sparse matrix of type '<class 'numpy.int64'>'
	with 15355 stored elements in Compressed Sparse Row format>

In [50]:
#calculate cosine similarity
#Use it to find items that are similiar
cosine_sim = cosine_similarity(count_matrix, count_matrix)
print(cosine_sim)

[[1.         0.5        0.5        ... 0.         0.         0.13363062]
 [0.5        1.         0.875      ... 0.         0.         0.26726124]
 [0.5        0.875      1.         ... 0.         0.         0.26726124]
 ...
 [0.         0.         0.         ... 1.         0.33333333 0.3086067 ]
 [0.         0.         0.         ... 0.33333333 1.         0.3086067 ]
 [0.13363062 0.26726124 0.26726124 ... 0.3086067  0.3086067  1.        ]]


In [53]:
#contain the model names as it values and will inherit the index
indices = pd.Series(metadata['Model Name'])
indices[:5]

0           Aspire 3
1    Aspire A515-51G
2    Aspire A515-51G
3           Aspire 3
4      Aspire E5-475
Name: Model Name, dtype: object

In [56]:
#two argument, one of title of item for which you want, the other is find the similar matrix
def recommend(title,cosine_sim = cosine_sim):
    recommended_devices=[]
    #find the index of the provided 'title' in the 'indices'
    idx=indices[indices == title].index[0] 
    #calculate similarity scores
    score_series=pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    #get top 15 similar items
    top_15_indices=list(score_series.iloc[1:16].index)     
    for i in top_15_indices: 
        recommended_devices.append(list(metadata['Model Name'])[i])
    
    return recommended_devices

In [58]:
#call the recommend function with title 
recommend('Aspire 3')

['Aspire A315-51',
 'Aspire 5',
 'Aspire E5-576G',
 'Inspiron 3567',
 'Aspire E5-575',
 'Vostro 3568',
 'Vostro 3568',
 'Aspire E5-774G',
 'E5 774G',
 'Aspire ES1-572',
 'Inspiron 3567',
 'Aspire A315-51',
 'Aspire 7',
 'Aspire A515-51G-32MX',
 'Spin 3']

In [60]:
#Input Part
#Prompt the user for a title 
title = input('Please enter the title: ')

#Use the title in your function
recommended_devices = recommend(title)

#Print the list of recommended devices
print(f"Recommended devices for '{title}':'")
for device in recommended_devices:
    print(device)

Please enter the title: Aspire 3
Recommended devices for 'Aspire 3':'
Aspire A315-51
Aspire 5
Aspire E5-576G
Inspiron 3567
Aspire E5-575
Vostro 3568
Vostro 3568
Aspire E5-774G
E5 774G
Aspire ES1-572
Inspiron 3567
Aspire A315-51
Aspire 7
Aspire A515-51G-32MX
Spin 3
