In [12]:
%%time

import pandas as pd
import gcsfs
fs = gcsfs.GCSFileSystem()

path = f'gs://amazon-home-and-kitchen/full_train_data_txt_processed.csv'

columns_to_read = ['main_category', 'title_x', 'rating', 'processed_text']
df = pd.read_csv(path, na_values=['—']
                ,usecols=columns_to_read 
                ,nrows=250000)

df.head()


CPU times: user 20.1 s, sys: 1.43 s, total: 21.5 s
Wall time: 42.1 s


Unnamed: 0,main_category,title_x,rating,processed_text
0,Amazon Home,jersey seating 2 x Vinyl Air Lift Adjustable S...,5,nice
1,Amazon Home,jersey seating 2 x Vinyl Air Lift Adjustable S...,5,chair beautiful bought 4 black color stylish c...
2,Amazon Home,jersey seating 2 x Vinyl Air Lift Adjustable S...,1,horible build quality stool dont lock place si...
3,Amazon Home,jersey seating 2 x Vinyl Air Lift Adjustable S...,5,great quality especially pricebr br ive swivel...
4,Amazon Home,jersey seating 2 x Vinyl Air Lift Adjustable S...,5,great price worked perfectly small space


In [13]:
df.shape

(250000, 4)

In [14]:
df['review_id'] = df.index 
df = df.dropna(subset=['processed_text'])

In [5]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt') 

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def custom_tokenizer(text):
        # Remove numbers and unwanted words (like stopwords)
    tokens = re.findall(r'\b[a-zA-Z]+\b', text.lower())  # Only keep alphabetic words
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(filtered_tokens)

def get_top_words(row):
        top_words = row.nlargest(3)  # Get the top 3 words based on TF-IDF score
        return top_words.index.tolist() if len(top_words) == 3 else [None, None, None]

def get_topics(df):

    dfc = df.copy()
    # Custom tokenizer function to filter out numbers and stopwords


    # Step 1: Preprocess `processed_text` by removing low-quality words
    dfc.loc[:, 'filtered_text'] = dfc['processed_text'].apply(custom_tokenizer)

    # Step 2: Define the TF-IDF vectorizer and fit it to the filtered text
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    tfidf_matrix = tfidf_vectorizer.fit_transform(dfc['filtered_text'])

    # Step 3: Create a DataFrame of the TF-IDF scores
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

    # Step 4: Define a function to get the top 3 words for each review


    # Step 5: Apply the function to each row to get the top 3 words
    dfc[['topic_1', 'topic_2', 'topic_3']] = tfidf_df.apply(get_top_words, axis=1, result_type="expand")

    # Step 6: Filter out rows where any of the top topics are None
    # dfc = dfc.dropna(subset=['topic_1', 'topic_2', 'topic_3'])
    final_df = dfc[['review_id', 'main_category', 'title_x', 'rating', 'filtered_text', 'topic_1', 'topic_2', 'topic_3']]
    
    return final_df


In [16]:
%%time

final = get_topics(df)
final.shape

CPU times: user 5min 39s, sys: 13.3 s, total: 5min 52s
Wall time: 5min 52s


(249624, 8)

In [20]:
final.head()

Unnamed: 0,review_id,main_category,title_x,rating,filtered_text,topic_1,topic_2,topic_3
0,0,Amazon Home,jersey seating 2 x Vinyl Air Lift Adjustable S...,5,nice,nice,aa,aaa
1,1,Amazon Home,jersey seating 2 x Vinyl Air Lift Adjustable S...,5,chair beautiful bought black color stylish con...,contemporary,stylish,solid
2,2,Amazon Home,jersey seating 2 x Vinyl Air Lift Adjustable S...,1,horible build quality stool dont lock place si...,sit,full,compress
3,3,Amazon Home,jersey seating 2 x Vinyl Air Lift Adjustable S...,5,great quality especially pricebr br ive swivel...,swivel,stool,making
4,4,Amazon Home,jersey seating 2 x Vinyl Air Lift Adjustable S...,5,great price worked perfectly small space,worked,space,perfectly


In [21]:

from google.cloud import storage
bucket_name = 'amazon-home-and-kitchen'
destination_blob_name = 'topic_2024-11-13.csv'

# Save DataFrame as CSV locally first
final.to_csv('/tmp/topic_2024-11-13.csv', index=False)

# Initialize a GCS client and upload wwthe file
client = storage.Client()
bucket = client.bucket(bucket_name)
blob = bucket.blob(destination_blob_name)
blob.upload_from_filename('/tmp/topic_2024-11-13.csv')