In [29]:
#imports
import pandas as pd
import texthero as hero
from texthero import preprocessing
from texthero import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

In [30]:
# Read the CSV file 'music_review.csv' from the './Data/' directory into a DataFrame named 'music_review2'
music_review2 = pd.read_csv('./Data/music_review.csv')

In [31]:
music_review2.head()

Unnamed: 0.1,Unnamed: 0,overall,reviewerID,asin,reviewText
0,0,5,A1ZCPG3D3HGRSS,1388703,This is a great cd full of worship favorites!!...
1,1,5,AC2PL52NKPL29,1388703,"So creative! Love his music - the words, the ..."
2,2,5,A1SUZXBDZSDQ3A,1388703,"Keith Green, gone far to early in his carreer,..."
3,3,5,A3A0W7FZXM0IZW,1388703,Keith Green had his special comedy style of Ch...
4,4,5,A12R54MKO17TW0,1388703,Keith Green / So you wanna go back to Egypt......


grouby by asin and join review text 

In [32]:
# Convert 'reviewText' column to strings
music_review2['reviewText'] = music_review2['reviewText'].astype(str)

# Group by 'asin' and join review text
grouped_reviews = music_review2.groupby('asin')['reviewText'].agg(lambda x: ' '.join(x))

# Convert the result back to a DataFrame
grouped_reviews_df = pd.DataFrame(grouped_reviews).reset_index()


In [33]:
grouped_reviews_df.head()

Unnamed: 0,asin,reviewText
0,1377647,"If you're looking for a meditative, contemplat..."
1,1388703,This is a great cd full of worship favorites!!...
2,1526146,"This is music from my younger years that I, as..."
3,1527134,"Don Francisco's ""Early Works"" are filled with ..."
4,1529145,"Discovering older Christian music, inspiration..."


In [34]:
grouped_reviews_df.shape

(456811, 2)

In [35]:
grouped_reviews_df

Unnamed: 0,asin,reviewText
0,0001377647,"If you're looking for a meditative, contemplat..."
1,0001388703,This is a great cd full of worship favorites!!...
2,0001526146,"This is music from my younger years that I, as..."
3,0001527134,"Don Francisco's ""Early Works"" are filled with ..."
4,0001529145,"Discovering older Christian music, inspiration..."
...,...,...
456806,B01HJ91RWE,Love this group!
456807,B01HJ91TDQ,"This was the song as I've heard it on T.V., do..."
456808,B01HJ91VJ8,"This is a beautiful, worshipful song that glor..."
456809,B01HJ91WOW,"Awesome Love, love, love it Love It,Anytime..."


In [36]:
import string

def clean_review(df):
    # Define NLTK stopwords and Porter Stemmer
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    
    # Define custom preprocessing pipeline
    def custom_pipeline(text):
        # Lowercase
        text = text.lower()
        
        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        
        # Tokenize
        tokens = word_tokenize(text)
        
        # Remove stopwords and apply stemming
        tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
        
        # Join tokens back into text
        text = ' '.join(tokens)
        
        return text
    
    # Apply custom preprocessing pipeline to 'reviewText' column
    df['clean_text'] = df['reviewText'].apply(custom_pipeline)
    
    return df

In [37]:
clean_review(grouped_reviews_df)

Unnamed: 0,asin,reviewText,clean_text
0,0001377647,"If you're looking for a meditative, contemplat...",your look medit contempl tape perfect one bar ...
1,0001388703,This is a great cd full of worship favorites!!...,great cd full worship favorit time great keith...
2,0001526146,"This is music from my younger years that I, as...",music younger year musician use quit often chu...
3,0001527134,"Don Francisco's ""Early Works"" are filled with ...",francisco earli work fill uniqu sens passion l...
4,0001529145,"Discovering older Christian music, inspiration...",discov older christian music inspir beauti gif...
...,...,...,...
456806,B01HJ91RWE,Love this group!,love group
456807,B01HJ91TDQ,"This was the song as I've heard it on T.V., do...",song ive heard tv wish longer
456808,B01HJ91VJ8,"This is a beautiful, worshipful song that glor...",beauti worship song glorifi lord cant get enou...
456809,B01HJ91WOW,"Awesome Love, love, love it Love It,Anytime...",awesom love love love love itanytim


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the review text using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)  # Limiting to top 5000 features

tfidf_matrix = tfidf_vectorizer.fit_transform(grouped_reviews_df['clean_text'])

# Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Concatenate the 'asin' column with the TF-IDF DataFrame
tfidf_df = pd.concat([grouped_reviews_df['asin'], tfidf_df], axis=1)

In [39]:
tfidf_df.head()

Unnamed: 0,asin,10,100,11,12,13,14,15,16,17,...,wrote,ye,yeah,year,york,youll,young,youth,youtub,youv
0,1377647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.067305,0.0,0.0,0.0,0.0,0.0,0.0
1,1388703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.05493,0.0,0.0,0.025288,0.0,0.0,0.0
2,1526146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.032802,0.0,0.0,0.09856,0.0,0.0,0.054448,0.0,0.0,0.0
3,1527134,0.0,0.0,0.0,0.0,0.0,0.0,0.035368,0.0,0.0,...,0.0,0.0,0.0,0.18971,0.0,0.028909,0.0,0.0,0.0,0.034821
4,1529145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
music_meta = pd.read_csv('./Data/music_meta.csv')
music_meta.drop(columns =['Unnamed: 0'], inplace=True)

In [41]:
music_meta.head()

Unnamed: 0,description,title,brand,asin,style
0,Unknown,Master Collection Volume One,John Michael Talbot,1377647,Audio CD
1,Unknown,Hymns Collection: Hymns 1 &amp; 2,Second Chapter of Acts,1529145,Audio CD
2,Unknown,Early Works - Don Francisco,Don Francisco,1527134,Audio CD
3,Unknown,So You Wanna Go Back to Egypt,Keith Green,1388703,Audio CD
4,"[""1. Losing Game 2. I Can't Wait 3. Didn't He ...",Early Works - Dallas Holm,Dallas Holm,1526146,Audio CD


In [42]:
tfidf_df.set_index('asin', inplace=True)
music_meta.set_index('asin', inplace=True)

In [44]:
content_model = tfidf_df.join(music_meta['style'], on='asin', rsuffix='_music_meta')

In [46]:
content_model.head()

Unnamed: 0_level_0,10,100,11,12,13,14,15,16,17,20,...,ye,yeah,year,york,youll,young,youth,youtub,youv,style_music_meta
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1377647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.067305,0.0,0.0,0.0,0.0,0.0,0.0,Audio CD
1388703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.05493,0.0,0.0,0.025288,0.0,0.0,0.0,Audio CD
1526146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.09856,0.0,0.0,0.054448,0.0,0.0,0.0,Audio CD
1527134,0.0,0.0,0.0,0.0,0.0,0.0,0.035368,0.0,0.0,0.034105,...,0.0,0.0,0.18971,0.0,0.028909,0.0,0.0,0.0,0.034821,Audio CD
1529145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Audio CD


In [None]:
content_model = pd.get_dummies(content_model, columns=['style_music_meta'])


In [None]:
content_model.info()