In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv("data/flipkart_com-ecommerce_sample.csv")

In [3]:
import re
# Define stop words list
stop_words = set([
    'the', 'is', 'in', 'and', 'to', 'a', 'with', 'of', 'for', 'on', 'as', 'by', 'at', 'from', 'or', 'an'
    # Add more common stop words as needed
])

# Preprocess text function
def preprocess_text(text, stop_words = stop_words):
    if isinstance(text, str):
        text = text.lower()  # Convert to lower case
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        text = ' '.join(word for word in text.split() if word not in stop_words)  # Remove stop words
    else:
        text = ''  # Handle non-string values
    return text

# Preprocess category function
def preprocess_category(category_text, stop_words=stop_words):
    return preprocess_text(category_text, stop_words)


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Preprocess text data (assuming preprocess_text and preprocess_category functions exist)
data['processed_description'] = data['description'].apply(preprocess_text)
data['processed_category'] = data['product_category_tree'].apply(preprocess_category)

# Combine description and category into a single text field
data['combined_text'] = data['processed_description'] + ' ' + data['processed_category']


In [6]:
# Vectorize the combined text
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['combined_text'])

In [7]:
def vectorize_query(query, vectorizer):
    query = preprocess_text(query, stop_words)
    return vectorizer.transform([query])

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

def search_ml(query, X, vectorizer):
    query_vector = vectorize_query(query, vectorizer)
    similarities = cosine_similarity(query_vector, X).flatten()
    return similarities


In [9]:
def get_top_results(similarities, top_n=10):
    ranked_indices = similarities.argsort()[-top_n:][::-1]
    return ranked_indices

# Retrieve and display product details
def display_top_results(indices, data, top_n=10):
    results_df = data.loc[indices[:top_n]]
    results_df.reset_index(drop=True, inplace=True)  # Reset index to sequential
    
    return results_df[['product_name', 'product_url', 'retail_price', 'discounted_price']]


In [10]:
# Example search query
query = "Knife"
similarities = search_ml(query, X, vectorizer)
top_indices = get_top_results(similarities, top_n=10)

# Retrieve and display product details
top_results = display_top_results(top_indices, data, top_n=10)
top_results

Unnamed: 0,product_name,product_url,retail_price,discounted_price
0,Hariom enterprises Stainless Steel Knife,http://www.flipkart.com/hariom-enterprises-sta...,1499.0,450.0
1,Tia by Ten on Ten Fashion Women's Push-up Bra,http://www.flipkart.com/tia-ten-fashion-women-...,1199.0,270.0
2,Imported Disposable Waxing Strips With Spatula...,http://www.flipkart.com/imported-disposable-wa...,599.0,499.0
3,Tirupati Knife Rocker Pizza Cutter,http://www.flipkart.com/tirupati-knife-rocker-...,1018.0,649.0
4,Pedrini Steel Waiters Corkscrew,http://www.flipkart.com/pedrini-steel-waiters-...,300.0,300.0
5,PeepalComm Multi Purpose 11 Function Multi Uti...,http://www.flipkart.com/peepalcomm-multi-purpo...,599.0,349.0
6,Stealodeal Grand Harvest 11 Function Multi Uti...,http://www.flipkart.com/stealodeal-grand-harve...,499.0,299.0
7,Checkered Chef Steel All-Purpose Scissor,http://www.flipkart.com/checkered-chef-steel-a...,2260.0,1921.0
8,www.thepaper.asia Floral Flower Art Canvas Pen...,http://www.flipkart.com/www-thepaper-asia-flor...,375.0,280.0
9,ETTI Glass Necklace,http://www.flipkart.com/etti-glass-necklace/p/...,1000.0,199.0


## Fine tuning

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Example of fine-tuning parameters
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.85,  # Ignore terms that appear in more than 85% of documents
    min_df=2,     # Include terms that appear in at least 2 documents
    ngram_range=(1, 2),  # Use unigrams and bigrams
    max_features=5000    # Limit to the top 5000 features
)
X = vectorizer.fit_transform(data['combined_text'])


In [12]:
from gensim.models import Word2Vec

# Train your own Word2Vec model
sentences = [desc.split() for desc in data['processed_description']]
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)


In [13]:
import numpy as np

def vectorize_text(text, model):
    words = text.split()
    vectors = [model.wv[word] for word in words if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# Vectorize documents
doc_vectors = np.array([vectorize_text(desc, model) for desc in data['processed_description']])

# Vectorize query
def vectorize_query(query, model):
    return vectorize_text(preprocess_text(query, stop_words), model)

# Compute similarity
from sklearn.metrics.pairwise import cosine_similarity

def search_word2vec(query, doc_vectors, model):
    query_vector = vectorize_query(query, model)
    similarities = cosine_similarity([query_vector], doc_vectors).flatten()
    return similarities


In [14]:
import numpy as np

# Load GloVe embeddings
def load_glove_model(file_path):
    glove_model = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            word = parts[0]
            vector = np.array(parts[1:], dtype=np.float32)
            glove_model[word] = vector
    return glove_model

glove_model = load_glove_model('path/to/glove.txt')

def vectorize_text_glove(text, glove_model):
    words = text.split()
    vectors = [glove_model.get(word, np.zeros(50)) for word in words]  # assuming 50 dimensions
    return np.mean(vectors, axis=0) if vectors else np.zeros(50)

# Vectorize documents
doc_vectors_glove = np.array([vectorize_text_glove(desc, glove_model) for desc in data['processed_description']])

# Vectorize query
def vectorize_query_glove(query, glove_model):
    return vectorize_text_glove(preprocess_text(query, stop_words), glove_model)

# Compute similarity
def search_glove(query, doc_vectors_glove, glove_model):
    query_vector = vectorize_query_glove(query, glove_model)
    similarities = cosine_similarity([query_vector], doc_vectors_glove).flatten()
    return similarities


FileNotFoundError: [Errno 2] No such file or directory: 'path/to/glove.txt'

In [None]:
from gensim.models import Word2Vec
import numpy as np

# Load your preprocessed text data
data['processed_description'] = data['description'].apply(preprocess_text)

# Prepare sentences for training Word2Vec model
sentences = data['processed_description'].apply(lambda x: x.split()).tolist()

# Train the Word2Vec model
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Save the model
model.save("word2vec_model")


In [None]:
def vectorize_text(text, model):
    """
    Convert text to a vector using Word2Vec embeddings.
    
    Parameters:
    - text: A string containing the processed text.
    - model: Trained Word2Vec model.
    
    Returns:
    - A numpy array representing the average vector of the words in the text.
    """
    words = text.split()
    vectors = [model.wv[word] for word in words if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# Vectorize all product descriptions
doc_vectors = np.array([vectorize_text(desc, model) for desc in data['processed_description']])

def vectorize_query(query, model, stop_words):
    """
    Vectorize the search query after preprocessing it.
    
    Parameters:
    - query: Search query string.
    - model: Trained Word2Vec model.
    - stop_words: List of stop words to exclude.
    
    Returns:
    - A numpy array representing the average vector of the words in the query.
    """
    processed_query = preprocess_text(query, stop_words)
    return vectorize_text(processed_query, model)


In [None]:
# Define stop words
stop_words = set(['for', 'the', 'and', 'to', 'in', 'on', 'with', 'a', 'of', 'is', 'it', 'by', 'at', 'as', 'an', 'this', 'that'])

# Sample query
query = "shampoo for dog"

# Search for relevant products
ranked_indices, similarity_scores = search_word2vec(query, doc_vectors, model, data, stop_words)

# Retrieve top 10 product details
def display_top_results(ranked_indices, similarity_scores, data, top_n=10):
    """
    Display the top search results for a given query.
    
    Parameters:
    - ranked_indices: Indices of ranked product descriptions.
    - similarity_scores: Corresponding similarity scores.
    - data: DataFrame containing product details.
    - top_n: Number of top results to display.
    
    Returns:
    - DataFrame containing top N products sorted by similarity score.
    """
    results_df = data.iloc[ranked_indices[:top_n]]
    results_df['similarity_score'] = similarity_scores[:top_n]
    return results_df[['product_name', 'product_category', 'product_url', 'retail_price', 'discounted_price', 'similarity_score']]

# Display the top results
top_results = display_top_results(ranked_indices, similarity_scores, data, top_n=10)
top_results


In [None]:
# Preprocessing function
def preprocess_text(text, stop_words):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        text = ' '.join(word for word in text.split() if word not in stop_words)
    else:
        text = ''  # Handle non-string values
    return text

# Define stop words
stop_words = set(['for', 'the', 'and', 'to', 'in', 'on', 'with', 'a', 'of', 'is', 'it', 'by', 'at', 'as', 'an', 'this', 'that'])

# Apply preprocessing to both descriptions and categories
data['processed_description'] = data['description'].apply(lambda x: preprocess_text(x, stop_words))
data['processed_category'] = data['product_category_tree'].apply(lambda x: preprocess_text(x, stop_words))

# Prepare sentences for training Word2Vec model
sentences = data['processed_description'].apply(lambda x: x.split()).tolist()
sentences += data['processed_category'].apply(lambda x: x.split()).tolist()

# Train the Word2Vec model
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Save the model
model.save("word2vec_model")


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Vectorize product names
vectorizer = TfidfVectorizer(stop_words='english')
product_vectors = vectorizer.fit_transform(data['product_name'])

def ml_search(search_term, data):
    # Vectorize search term
    search_vector = vectorizer.transform([search_term])
    
    # Compute cosine similarity
    similarities = cosine_similarity(search_vector, product_vectors).flatten()
    
    # Get top matching products
    top_indices = similarities.argsort()[-10:][::-1]
    results = data.iloc[top_indices]
    
    return results[['product_name', 'brand', 'retail_price', 'overall_rating']]

# Example search
search_term = "wireless earphones"
results = ml_search(search_term, data)
results.head()


ValueError: np.nan is an invalid document, expected byte or unicode string.

In [16]:
# Check for missing values
missing_values = data.isnull().sum()
print("Missing Values:\n", missing_values)

# Fill or remove missing values
data['product_rating'].fillna(data['product_rating'].median(), inplace=True)
data['overall_rating'].fillna(data['overall_rating'].median(), inplace=True)
data['description'].fillna('', inplace=True)
data['product_specifications'].fillna('', inplace=True)

# Remove duplicates
data.drop_duplicates(inplace=True)

# Convert data types if necessary
data['retail_price'] = data['retail_price'].astype(float)
data['discounted_price'] = data['discounted_price'].astype(float)


Missing Values:
 uniq_id                       2
crawl_timestamp               2
product_url                   2
product_name                  2
product_category_tree         2
pid                           2
retail_price                 80
discounted_price             80
image                         5
is_FK_Advantage_product       2
description                   4
product_rating                2
overall_rating                2
brand                      5866
product_specifications       16
processed_description         0
processed_category            0
combined_text                 0
dtype: int64


TypeError: could not convert string to float: 'No rating available'

In [17]:
import numpy as np

# Fill missing values in text fields with empty strings
text_columns = ['product_name', 'description', 'product_category_tree', 'brand']
data[text_columns] = data[text_columns].fillna('')

# Remove duplicates
data.drop_duplicates(inplace=True)

# Clean text fields
import re

def clean_text(text):
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text

data['product_name'] = data['product_name'].apply(clean_text)
data['description'] = data['description'].apply(clean_text)
data['product_category_tree'] = data['product_category_tree'].apply(clean_text)


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine relevant text fields into a single column for vectorization
data['text'] = data['product_name'] + ' ' + data['description'] + ' ' + data['product_category_tree']

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(data['text'])

# Display the shape of the TF-IDF matrix
print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (20001, 5000)


In [24]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

data['tokens'] = data['text'].apply(word_tokenize)

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=data['tokens'], vector_size=100, window=5, min_count=1, workers=4)

# Function to get Word2Vec vectors for a product
def get_word2vec_vector(text):
    tokens = word_tokenize(text)
    vectors = [word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv]
    if len(vectors) == 0:
        # Handle case where no tokens are in the vocabulary
        return np.zeros(word2vec_model.vector_size)
    vector = np.mean(vectors, axis=0)
    return vector

# Apply the function to get Word2Vec vectors
word2vec_vectors = np.array([get_word2vec_vector(text) for text in data['text']])

# Display the shape of the Word2Vec vectors
print("Word2Vec vectors shape:", word2vec_vectors.shape)

Word2Vec vectors shape: (20001, 100)


In [30]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_word2vec_vector(text, model):
    tokens = word_tokenize(text)
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if len(vectors) == 0:
        # Handle case where no tokens are in the vocabulary
        return np.zeros(model.vector_size)
    vector = np.mean(vectors, axis=0)
    return vector

def search_products_word2vec(query, embeddings, model, data, top_n=10):
    # Convert the query to a vector using the Word2Vec model
    query_embedding = get_word2vec_vector(query, model).reshape(1, -1)

    # Calculate cosine similarities between the query and product embeddings
    similarities = cosine_similarity(query_embedding, embeddings).flatten()

    # Get the indices of the top N most similar products
    top_indices = similarities.argsort()[-top_n:][::-1]

    # Return the top N most similar products
    return data.iloc[top_indices][['product_name', 'brand', 'retail_price', 'overall_rating', 'product_url']]

# Example usage
search_term = "car"
results_word2vec = search_products_word2vec(search_term, word2vec_vectors, word2vec_model, data)
results_word2vec

Unnamed: 0,product_name,brand,retail_price,overall_rating,product_url
10286,himmlisch stmpl1525 car armrestbeige car armrest,HIMMLISCH,5999.0,No rating available,http://www.flipkart.com/himmlisch-stmpl1525-ca...
1510,accessoreez crtrtbl052 cup holder car tray table,ACCESSOREEZ,1200.0,No rating available,http://www.flipkart.com/accessoreez-crtrtbl052...
8113,vetra e00ty19 multifunction car tray for marut...,Vetra,799.0,No rating available,http://www.flipkart.com/vetra-e00ty19-multifun...
8094,vetra e00ty112 multifunction car tray for maru...,Vetra,799.0,No rating available,http://www.flipkart.com/vetra-e00ty112-multifu...
8123,vetra e00ty09 multifunction car tray for marut...,Vetra,799.0,No rating available,http://www.flipkart.com/vetra-e00ty09-multifun...
8114,vetra e00ty43 multifunction car tray for marut...,Vetra,799.0,No rating available,http://www.flipkart.com/vetra-e00ty43-multifun...
8104,vetra e00ty27 multifunction car tray for hyund...,Vetra,799.0,No rating available,http://www.flipkart.com/vetra-e00ty27-multifun...
8116,vetra e00ty104 multifunction car tray for tata...,Vetra,799.0,No rating available,http://www.flipkart.com/vetra-e00ty104-multifu...
8098,vetra e00ty71 multifunction car tray for hyund...,Vetra,799.0,No rating available,http://www.flipkart.com/vetra-e00ty71-multifun...
7515,packy poda car mat maruti alto,Packy Poda,2000.0,No rating available,http://www.flipkart.com/packy-poda-car-mat-mar...


False


AssertionError: Torch not compiled with CUDA enabled

In [6]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# Load the BERT model and tokenizer
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model = bert_model.to(device)

def get_bert_embeddings(texts, model, tokenizer, batch_size=64):
    embeddings = []
    for start in range(0, len(texts), batch_size):
        end = start + batch_size
        batch_texts = texts[start:end]
        # Tokenize texts and move tensors to GPU if available
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt')
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        # Take the mean of the token embeddings (CLS token can be used too)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.append(batch_embeddings)
    return np.vstack(embeddings)  # Stack arrays vertically

# Example usage
texts = ["This is a sample sentence.", "Here is another one."]
bert_embeddings = get_bert_embeddings(texts, bert_model, tokenizer)
print(bert_embeddings.shape)


KeyboardInterrupt: 

In [8]:
import torch
print(torch.cuda.is_available())  # Should print True if CUDA is available
print(torch.cuda.current_device())  # Prints the current device index
print(torch.cuda.get_device_name(torch.cuda.current_device()))  # Prints the name of the current device

False


AssertionError: Torch not compiled with CUDA enabled

In [6]:
import torch
torch.__version__

'1.13.1+cpu'