In [1]:
from sklearn.metrics import jaccard_score
from scipy.spatial.distance import pdist, squareform
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the Gunpla data; the path is relative to the notebook's working directory.
gunpla_df = pd.read_csv('gunpla.csv')

In [4]:
# Preview the first rows to see which columns were loaded.
gunpla_df.head()

Unnamed: 0,bandai_id,title,price,url,jan_code,release_date,category,series,item_type,manufacturer,item_size_and_weight,year_of_release,month_of_release,item_size,weight,month_year,length_cm,breadth_cm,height_cm,area_cm2
0,ACS0005,1/100 G (Gundam) Robot Antenna,28.7,https://www.hlj.com/1-100-scale-g-gundam-robot...,2000008000000.0,2003-10-06,Gundam,,Detailing Kits/Accessories,Acu Stion,8.2 x 6 x 0.2 cm / 10g,2003.0,October,8.2 x 6 x 0.2 cm,10g,October 2003,8.2,6.0,0.2,9.84
1,BAN01301,1/144 GM Cannon,11.71,https://www.hlj.com/1-144-scale-gm-cannon-ban0...,4902425000000.0,1994-01-01,Gundam,Gundam M.S.V.,Other Gundam Kits,Bandai,14.3 x 20 x 4.7 cm / 130g,1994.0,January,14.3 x 20 x 4.7 cm,130g,January 1994,14.3,20.0,4.7,1344.2
2,BAN01302,1/1200 Magellan,14.64,https://www.hlj.com/1-1200-scale-magellan-ban0...,4573103000000.0,1994-01-01,Gundam,Gundam 0079,Other Gundam Kits,Bandai,18 x 25.5 x 5 cm / 150g,1994.0,January,18 x 25.5 x 5 cm,150g,January 1994,18.0,25.5,5.0,2295.0
3,BAN01303,1/144 Zaku II High Mobility Type,14.64,https://www.hlj.com/1-144-scale-zaku-ii-high-m...,4902425000000.0,1994-01-01,Gundam,Gundam M.S.V.,Other Gundam Kits,Bandai,18 x 25.5 x 5 cm / 180g,1994.0,January,18 x 25.5 x 5 cm,180g,January 1994,18.0,25.5,5.0,2295.0
4,BAN01304,1/144 Zaku Cannon,14.64,https://www.hlj.com/1-144-scale-zaku-cannon-ba...,4902425000000.0,1994-01-01,Gundam,Gundam M.S.V.,Other Gundam Kits,Bandai,18 x 25.5 x 5 cm / 180g,1994.0,January,18 x 25.5 x 5 cm,180g,January 1994,18.0,25.5,5.0,2295.0


In [5]:
# Count missing values per column to decide which fields need cleaning.
gunpla_df.isnull().sum()

bandai_id                 0
title                     6
price                     6
url                       0
jan_code                155
release_date              6
category                  6
series                   19
item_type                 6
manufacturer              6
item_size_and_weight    197
year_of_release           6
month_of_release          6
item_size               197
weight                  197
month_year                6
length_cm               197
breadth_cm              197
height_cm               197
area_cm2                197
dtype: int64

In [6]:
# Remove rows that have no title; titles later become the row labels of the
# feature and similarity frames.
gunpla_df = gunpla_df.dropna(subset=['title'])

In [7]:
# Rows with no series listed fall back to the generic 'Gundam' label.
# DataFrame.fillna with a per-column mapping returns a new frame, so nothing is
# assigned into the frame produced by the earlier dropna (which can raise
# SettingWithCopyWarning), and only the 'series' column is filled.
gunpla_df = gunpla_df.fillna({'series': 'Gundam'})

In [8]:
# Discard the jan_code column.
gunpla_df = gunpla_df.drop(columns=['jan_code'])

In [9]:
# Keep only the descriptive columns used for the recommender.
# .copy() makes the selection an independent frame: later cells add new columns
# to gunpla_df, and assigning into a bare column selection raises
# SettingWithCopyWarning.
gunpla_df = gunpla_df[[
    'title', 'category', 'release_date',
    'series', 'item_type', 'manufacturer',
]].copy()

In [10]:
# Confirm that only the selected columns remain.
gunpla_df.head()

Unnamed: 0,title,category,release_date,series,item_type,manufacturer
0,1/100 G (Gundam) Robot Antenna,Gundam,2003-10-06,Gundam,Detailing Kits/Accessories,Acu Stion
1,1/144 GM Cannon,Gundam,1994-01-01,Gundam M.S.V.,Other Gundam Kits,Bandai
2,1/1200 Magellan,Gundam,1994-01-01,Gundam 0079,Other Gundam Kits,Bandai
3,1/144 Zaku II High Mobility Type,Gundam,1994-01-01,Gundam M.S.V.,Other Gundam Kits,Bandai
4,1/144 Zaku Cannon,Gundam,1994-01-01,Gundam M.S.V.,Other Gundam Kits,Bandai


In [11]:
# Parse the release dates and keep only the calendar year.
release_dates = pd.to_datetime(gunpla_df['release_date'])
gunpla_df['year_of_release'] = release_dates.dt.year

In [12]:
# Build one text field per kit: "<series> <item_type> <year_of_release>".
# The three columns are cast to str once and concatenated column-wise, which
# yields the same strings as joining each row in a Python lambda but without a
# per-row function call.
feature_parts = gunpla_df[['series', 'item_type', 'year_of_release']].astype(str)
gunpla_df['feature_combine'] = (
    feature_parts['series']
    + ' ' + feature_parts['item_type']
    + ' ' + feature_parts['year_of_release']
)

In [13]:
# List the distinct combined feature strings to sanity-check their format.
gunpla_df['feature_combine'].unique()

array(['Gundam Detailing Kits/Accessories 2003',
       'Gundam M.S.V. Other Gundam Kits 1994',
       'Gundam 0079 Other Gundam Kits 1994',
       'Gundam 0079 Other Gundam Kits 1999',
       'Gundam M.S.V. Other Gundam Kits 1985',
       'Zeta Gundam Other Gundam Kits 1994',
       'Zeta Gundam Other Gundam Kits 2006',
       'Zeta Gundam Other Gundam Kits 1997',
       'Gundam ZZ Other Gundam Kits 1994',
       'Gundam ZZ Other Gundam Kits 1997',
       'Gundam 0079 Other Gundam Kits 1998',
       "Char's Counterattack Other Gundam Kits 1994",
       'Builders Parts Detailing Kits/Accessories 2015',
       'SD Gundam High Grade Kits 1994',
       'Gundam Sentinel Other Gundam Kits 1994',
       'Gundam 0080 High Grade Kits 1994',
       'SD Gundam High Grade Kits 2002', 'SD Gundam High Grade Kits 2001',
       'SD Gundam High Grade Kits 2003', 'SD Gundam High Grade Kits 2000',
       'Gundam 0079 High Grade Kits 1994',
       'Zeta Gundam High Grade Kits 1994',
       'SD Gundam Hig

In [13]:
# Vectorise the combined feature strings with TF-IDF.
# stop_words='english' drops common English words; ngram_range=(2,3) means only
# two- and three-word phrases become features (no single words).
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(2,3))
tfidf_features = tfidf.fit_transform(gunpla_df['feature_combine'])
# Labels of the extracted n-gram features, in column order.
feature_names = tfidf.get_feature_names_out()

In [14]:
# Dense TF-IDF matrix as a labelled frame: rows are kit titles, columns are n-grams.
gunpla_recs = pd.DataFrame(
    data=tfidf_features.toarray(),
    index=gunpla_df['title'],
    columns=feature_names,
)

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of rows of the TF-IDF frame.
cosine_similarity_array = cosine_similarity(gunpla_recs)
# Rows are labelled by title. No columns= is passed, so the columns keep the
# default integer labels, i.e. the row position of the compared kit in gunpla_recs.
cosine_df = pd.DataFrame(cosine_similarity_array, index = gunpla_recs.index)

In [16]:
# Sanity check: the similarity matrix should be square (one row and one column per kit).
cosine_df.shape

(3482, 3482)

In [17]:
import pickle
# Persist the similarity matrix to disk in the working directory.
# NOTE(review): the file is named gundam_recs.pkl but stores cosine_df, not
# gunpla_recs — confirm this is what whoever loads it expects.
with open('gundam_recs.pkl','wb') as f:
    pickle.dump(cosine_df,f)

In [18]:
# TF-IDF features of a single kit, reshaped to the (1, n_features) layout
# that cosine_similarity expects for one sample.
aegis_row = gunpla_recs.loc['1/100 Aegis Gundam']
aegis_row.values.reshape(1, -1)

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [19]:
# Sanity check: a kit compared with itself should give a similarity of 1.
unicorn_vector = gunpla_recs.loc["1/144 RG RX-0 Unicorn Gundam"].values.reshape(1, -1)
cosine_similarity(unicorn_vector, unicorn_vector)

array([[1.]])

In [20]:
# Preview the similarity matrix: rows are titles, columns are integer positions.
cosine_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,3472,3473,3474,3475,3476,3477,3478,3479,3480,3481
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1/100 G (Gundam) Robot Antenna,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1/144 GM Cannon,0.0,1.0,0.432812,1.0,1.0,0.432812,0.070156,0.070156,1.0,1.0,...,0.430333,0.430333,0.0,0.0,0.363856,0.430333,0.430333,0.0,0.036895,0.036895
1/1200 Magellan,0.0,0.432812,1.0,0.432812,0.432812,1.0,0.6315,0.6315,0.432812,0.432812,...,0.064597,0.064597,0.0,0.0,0.054618,0.064597,0.064597,0.0,0.028253,0.028253
1/144 Zaku II High Mobility Type,0.0,1.0,0.432812,1.0,1.0,0.432812,0.070156,0.070156,1.0,1.0,...,0.430333,0.430333,0.0,0.0,0.363856,0.430333,0.430333,0.0,0.036895,0.036895
1/144 Zaku Cannon,0.0,1.0,0.432812,1.0,1.0,0.432812,0.070156,0.070156,1.0,1.0,...,0.430333,0.430333,0.0,0.0,0.363856,0.430333,0.430333,0.0,0.036895,0.036895


In [21]:
# Top five kits most similar to the query kit.
# The query is excluded by its row position instead of by discarding the first
# sorted entry: kits with identical feature text also score 1.0, and the stable
# sort keeps the earliest such kit first, so slicing [1:6] could throw away a
# genuine match and return the query kit itself.
query_title = "1/144 RG NU Gundam"
# assumes the title is unique in cosine_df.index, so get_loc returns one integer
query_position = cosine_df.index.get_loc(query_title)
sr = list(enumerate(cosine_df.loc[query_title]))
sr_sorted = sorted(sr, key=lambda pair: pair[1], reverse=True)
item_indices = [position for position, _ in sr_sorted if position != query_position][:5]
cosine_df.iloc[item_indices].index

Index(['1/144 RG Nu Gundam Fin Funnel Effect Set', '1/144 RG Sazabi',
       '1/144 RG Hi-Nu Gundam', '1/144 HG Gundam G40 (Industrial Design Ver.)',
       '1/100 Hi-Resolution Model God Gundam'],
      dtype='object', name='title')

In [22]:
# Similarity rows for every kit except the two example kits.
new_cos_df = cosine_df.drop(
    index=["1/144 RG NU Gundam", "1/144 RG RX-0 Unicorn Gundam"],
)
# cosine_similarity(new_cos_df,new_vals)

In [23]:
# Average the similarity rows of the two chosen kits into a single
# one-row profile vector.
chosen_rows = cosine_df.loc[["1/144 RG NU Gundam", "1/144 RG RX-0 Unicorn Gundam"]]
user_vals = chosen_rows.mean(axis=0).values.reshape(1, -1)

In [24]:
# Candidate pool: every row of the similarity matrix except the two chosen kits.
non_user_gunpla = cosine_df.drop(
    index=["1/144 RG NU Gundam", "1/144 RG RX-0 Unicorn Gundam"],
)

In [25]:
# Cosine similarity between the averaged profile and each remaining row of cosine_df.
# Both inputs are rows of the similarity matrix, so this compares similarity
# profiles rather than the TF-IDF vectors themselves.
new_sim = cosine_similarity(user_vals, non_user_gunpla)

In [26]:
# Label each score with its kit title; new_sim has a single row, so its first
# row holds one score per candidate.
new_sim_df = pd.DataFrame(
    {'Similarity Score': new_sim[0]},
    index=non_user_gunpla.index,
)

In [27]:
# Show the ten highest-scoring candidates.
ranked_scores = new_sim_df.sort_values(by='Similarity Score', ascending=False)
ranked_scores.iloc[:10]

Unnamed: 0_level_0,Similarity Score
title,Unnamed: 1_level_1
"1/144 RG Unicorn Gundam (Premium ""Unicorn Mode"" Box)",0.873189
1/144 RG Unicorn Gundam (Bande Dessinee Ver.),0.763531
1/144 RG Unicorn Gundam 02 Banshee Norn,0.763531
1/144 RG Full Armor Unicorn Gundam,0.763531
"1/144 RG Unicorn Gundam 02 Banshee Norn (Premium ""Unicorn Mode"" Box)",0.763531
1/144 RG Nu Gundam Fin Funnel Effect Set,0.758872
1/144 RG MS-06R-2 Johnny Ridden Custom Zaku II,0.681944
1/144 RG Sazabi,0.628984
1/144 HGUC Narrative Gundam C-Packs,0.609391
1/144 HGUC Gustav Karl (Unicorn Ver.),0.609391


In [28]:
# gunpla_sim = pd.crosstab(gunpla_df['title'], gunpla_df['item_type'])

In [29]:
# lupus = gunpla_sim.loc["1/144 HG Gundam Vidar"].values
# bael = gunpla_sim.loc["1/144 HG Gundam Bael"].values

In [30]:
# jaccard_dist = pdist(gunpla_sim.values, metric='jaccard')
# jaccard_dist

In [31]:
# square_jaccard = squareform(jaccard_dist)

In [32]:
# jaccard_sim = 1 - square_jaccard

In [33]:
# distance_df = pd.DataFrame(jaccard_sim, index = gunpla_sim.index, columns = gunpla_sim.index)

In [34]:
# distance_df["1/144 HG Gundam Vidar"]["1/144 RG God Gundam"]