<a href="https://colab.research.google.com/github/Thavatharani06/231801180_PODS/blob/main/ex4(b).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1 & 2: Setup
!pip install spacy scikit-learn pandas --quiet
!python -m spacy download en_core_web_sm

import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load spaCy model
nlp = spacy.load("en_core_web_sm")


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m77.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Step 3: Load and Prepare Dataset

print("\nSTEP 3: Load and Prepare Dataset")

# Upper bound on the number of reviews carried into the later steps.
MAX_RECORDS = 1000

data = {
    'Text': [
        "Absolutely wonderful - silky and sexy and comfortable.",
        "Love this dress! it's sooo pretty.",
        "I had to return it - the fit was just not right.",
        "Terrible quality. Do not recommend.",
        "Fast shipping and good packaging, but the product is bad.",
        "The color is not the same as shown in the picture."
    ]
}
# Keep the untouched frame under its own name so re-running this cell (or
# inspecting the raw input later) never sees a half-cleaned dataset.
raw_df = pd.DataFrame(data)
print("Original Dataset:\n", raw_df, "\n")

# Step 3a: Remove missing records
# Step 3b: Limit records
# Done as one non-mutating chain; the trailing .copy() makes `df` an independent
# frame, so adding columns to it later never writes through a slice of `raw_df`.
df = raw_df.dropna(subset=['Text']).head(MAX_RECORDS).copy()
print("Cleaned Dataset (after removing missing):\n", df, "\n")



STEP 3: Load and Prepare Dataset
Original Dataset:
                                                 Text
0  Absolutely wonderful - silky and sexy and comf...
1                 Love this dress! it's sooo pretty.
2   I had to return it - the fit was just not right.
3                Terrible quality. Do not recommend.
4  Fast shipping and good packaging, but the prod...
5  The color is not the same as shown in the pict... 

Cleaned Dataset (after removing missing):
                                                 Text
0  Absolutely wonderful - silky and sexy and comf...
1                 Love this dress! it's sooo pretty.
2   I had to return it - the fit was just not right.
3                Terrible quality. Do not recommend.
4  Fast shipping and good packaging, but the prod...
5  The color is not the same as shown in the pict... 



In [None]:
# Step 4: Preprocess Text Using spaCy
# The banner below marks where this step's output starts in the cell output.

print("STEP 4: Text Preprocessing using spaCy")

def spacy_preprocess(text, verbose=True):
    """Reduce one piece of text to a space-separated string of lemmas.

    The text is lowercased, tokenized with the module-level spaCy pipeline
    ``nlp``, stripped of non-alphabetic tokens and stop words, lemmatized, and
    joined back into a single string.

    Args:
        text: The string to clean.
        verbose: When True (the default, matching the original behavior) every
            intermediate step is printed. Pass False when mapping the function
            over many rows to avoid flooding the cell output.

    Returns:
        The cleaned string; empty if no token survives the filters.
    """
    def trace(*parts):
        # Single switch for all step-by-step output.
        if verbose:
            print(*parts)

    trace(f"\nOriginal Text: {text}")

    # a. Lowercase
    text = text.lower()
    trace("a. Lowercased:", text)

    # b. Tokenize
    doc = nlp(text)
    trace("b. Tokens:", [token.text for token in doc])

    # c. Filter out non-alpha
    tokens = [token for token in doc if token.is_alpha]
    trace("c. Alphabetic Tokens:", [t.text for t in tokens])

    # d. Remove stopwords
    tokens = [token for token in tokens if not token.is_stop]
    trace("d. Stopword Removed:", [t.text for t in tokens])

    # e. Lemmatize
    lemmas = [token.lemma_ for token in tokens]
    trace("e. Lemmatized Tokens:", lemmas)

    # f. Join into final string
    cleaned = ' '.join(lemmas)
    trace("f. Final Cleaned String:", cleaned)

    return cleaned

# Run the cleaning routine over every review and keep the result next to the
# original text in a new 'cleaned' column.
cleaned_series = df['Text'].map(spacy_preprocess)
df['cleaned'] = cleaned_series

# Show raw and cleaned text side by side.
before_after = df[['Text', 'cleaned']]
print("\nFinal Cleaned Dataset:\n", before_after, "\n")




STEP 4: Text Preprocessing using spaCy

Original Text: Absolutely wonderful - silky and sexy and comfortable.
a. Lowercased: absolutely wonderful - silky and sexy and comfortable.
b. Tokens: ['absolutely', 'wonderful', '-', 'silky', 'and', 'sexy', 'and', 'comfortable', '.']
c. Alphabetic Tokens: ['absolutely', 'wonderful', 'silky', 'and', 'sexy', 'and', 'comfortable']
d. Stopword Removed: ['absolutely', 'wonderful', 'silky', 'sexy', 'comfortable']
e. Lemmatized Tokens: ['absolutely', 'wonderful', 'silky', 'sexy', 'comfortable']
f. Final Cleaned String: absolutely wonderful silky sexy comfortable

Original Text: Love this dress! it's sooo pretty.
a. Lowercased: love this dress! it's sooo pretty.
b. Tokens: ['love', 'this', 'dress', '!', 'it', "'s", 'sooo', 'pretty', '.']
c. Alphabetic Tokens: ['love', 'this', 'dress', 'it', 'sooo', 'pretty']
d. Stopword Removed: ['love', 'dress', 'sooo', 'pretty']
e. Lemmatized Tokens: ['love', 'dress', 'sooo', 'pretty']
f. Final Cleaned String: love dr

In [None]:
# Step 5: Convert to TF-IDF

print("STEP 5: Convert Reviews to TF-IDF Vectors")

# Fit the vectorizer on the cleaned reviews and encode them in the same call.
# `vectorizer` and `tfidf_matrix` are reused later to score queries.
corpus = df['cleaned']
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# Report the matrix dimensions and the learned vocabulary.
matrix_shape = tfidf_matrix.shape
feature_names = vectorizer.get_feature_names_out()
print("TF-IDF Matrix Shape:", matrix_shape)
print("TF-IDF Feature Names:", feature_names, "\n")



STEP 5: Convert Reviews to TF-IDF Vectors
TF-IDF Matrix Shape: (6, 24)
TF-IDF Feature Names: ['absolutely' 'bad' 'color' 'comfortable' 'dress' 'fast' 'fit' 'good'
 'love' 'packaging' 'picture' 'pretty' 'product' 'quality' 'recommend'
 'return' 'right' 'sexy' 'shipping' 'show' 'silky' 'sooo' 'terrible'
 'wonderful'] 



In [None]:
# STEP 6 & 7: Process Query and Get Matches
def process_query(query, top_k=3, min_score=0.0):
    """Rank the reviews in ``df`` by TF-IDF cosine similarity to ``query``.

    The query is cleaned with ``spacy_preprocess``, projected with the fitted
    module-level ``vectorizer`` and compared against ``tfidf_matrix``.

    Args:
        query: Free-text search string.
        top_k: Maximum number of reviews to return. ``None`` means no limit;
            values below zero are treated as zero.
        min_score: Only reviews scoring strictly above this value are
            returned. The default of 0.0 keeps reviews that share no term with
            the query from being reported as matches; pass a negative value
            to always get ``top_k`` rows back.

    Returns:
        A DataFrame with columns ``Text`` and ``Similarity Score``, ordered
        from most to least similar. It may hold fewer than ``top_k`` rows, or
        none at all.
    """
    print(f"\n\n🔍 Query: '{query}'")

    # Preprocess the query
    cleaned_query = spacy_preprocess(query)
    print("Preprocessed Query:", cleaned_query)

    # Convert to TF-IDF
    query_vector = vectorizer.transform([cleaned_query])

    # Compute cosine similarity
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Rank descending with a stable sort so reviews with equal scores keep
    # their original order instead of an implementation-dependent one.
    limit = None if top_k is None else max(top_k, 0)
    ranked = (-similarity_scores).argsort(kind="stable")[:limit]

    # Drop candidates that do not clear the threshold (by default: no overlap).
    top_indices = ranked[similarity_scores[ranked] > min_score]

    results = df.iloc[top_indices].copy()
    results['Similarity Score'] = similarity_scores[top_indices]

    print("\nTop Matching Reviews:")
    return results[['Text', 'Similarity Score']]

# example queries
# Each query is scored against the review corpus and its top three hits printed.
dress_query = "pretty and comfortable dress"
result1 = process_query(dress_query, top_k=3)
print(result1)

quality_query = "bad quality product"
result2 = process_query(quality_query, top_k=3)
print(result2)



🔍 Query: 'pretty and comfortable dress'

Original Text: pretty and comfortable dress
a. Lowercased: pretty and comfortable dress
b. Tokens: ['pretty', 'and', 'comfortable', 'dress']
c. Alphabetic Tokens: ['pretty', 'and', 'comfortable', 'dress']
d. Stopword Removed: ['pretty', 'comfortable', 'dress']
e. Lemmatized Tokens: ['pretty', 'comfortable', 'dress']
f. Final Cleaned String: pretty comfortable dress
Preprocessed Query: pretty comfortable dress

Top Matching Reviews:
                                                Text  Similarity Score
1                 Love this dress! it's sooo pretty.          0.577350
0  Absolutely wonderful - silky and sexy and comf...          0.258199
4  Fast shipping and good packaging, but the prod...          0.000000


🔍 Query: 'bad quality product'

Original Text: bad quality product
a. Lowercased: bad quality product
b. Tokens: ['bad', 'quality', 'product']
c. Alphabetic Tokens: ['bad', 'quality', 'product']
d. Stopword Removed: ['bad', 'quality', 