In [49]:
import pandas as pd
import numpy as np


In [55]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset catalogue from the Excel export
df = pd.read_excel("kaggle-prenew.xlsx")

# Normalise dataset names to plain strings. Missing names become "" instead of
# the literal text "nan", so they do not inject a bogus "nan" token into the
# TF-IDF vocabulary (same preprocessing as the model.pkl cells further down).
df['Dataset_name'] = df['Dataset_name'].fillna("").astype(str)

# Vectorize the names using TF-IDF, ignoring English stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Dataset_name'])

# Pairwise cosine similarity between every pair of dataset names
cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create name -> row index mapping.
# Series.drop_duplicates() de-duplicates the *values* (the row indices, which
# are already unique), so it never removed repeated names. De-duplicate on the
# index instead, keeping the first row for each name, so that a lookup always
# yields a single scalar position.
name_to_index = pd.Series(df.index, index=df['Dataset_name'])
name_to_index = name_to_index[~name_to_index.index.duplicated(keep='first')]

# Function to recommend similar datasets
def recommend_similar_datasets(title, num_results=5):
    """Return the datasets whose names are most similar to ``title``.

    Uses the module-level ``df``, ``cosine_sim_matrix`` and ``name_to_index``
    objects built earlier in this cell.

    Args:
        title: Exact ``Dataset_name`` to look up (the match is case-sensitive).
        num_results: Maximum number of recommendations to return.

    Returns:
        A DataFrame with ``Dataset_name`` and ``Dataset_link`` columns, ordered
        from most to least similar, or an error string when ``title`` is not
        a known dataset name.
    """
    if title not in name_to_index:
        return f"❌ Dataset '{title}' not found."

    idx = name_to_index[title]
    # A name that appears on several rows yields a Series of positions rather
    # than a scalar; fall back to the first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Exclude the query row explicitly instead of assuming it sorts first:
    # another row with an identical score (same TF-IDF vector, or an all-zero
    # vector) can otherwise push the query itself into the results.
    candidates = [
        (i, score)
        for i, score in enumerate(cosine_sim_matrix[idx])
        if i != idx
    ]
    # list.sort is stable, so ties keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    dataset_indices = [i for i, _ in candidates[:max(num_results, 0)]]

    return df[['Dataset_name', 'Dataset_link']].iloc[dataset_indices].reset_index(drop=True)

# Demo: look up the five closest matches for a known dataset title
recommendations = recommend_similar_datasets("Hotel Reservations Dataset", num_results=5)
print(recommendations)


                  Dataset_name  \
0  A hotel's customers dataset   
1         hotel recommendation   
2                 Hotel review   
3       Hotel Booking Dataset    
4               Hotel Booking    

                                        Dataset_link  
0  https://www.kaggle.com/datasets/nantonio/a-hot...  
1  https://www.kaggle.com/datasets/keshavramaiah/...  
2  https://www.kaggle.com/datasets/anu0012/hotel-...  
3  https://www.kaggle.com/datasets/mukuldeshantri...  
4  https://www.kaggle.com/datasets/mojtaba142/hot...  


In [58]:
import pickle

# Persist the fitted TF-IDF vectorizer to disk so later cells can reload it
with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)


In [59]:
import pandas as pd
import pickle
from sklearn.metrics.pairwise import cosine_similarity

# Load dataset again
df = pd.read_excel("kaggle-prenew.xlsx")
# Missing names become "" rather than the literal text "nan", so the pickled
# DataFrame never exposes a dataset called "nan" (same preprocessing as the
# model.pkl cells further down).
df['Dataset_name'] = df['Dataset_name'].fillna("").astype(str)

# Load saved TF-IDF vectorizer.
# NOTE: pickle.load can execute arbitrary code; only open pickle files that
# this notebook produced itself.
with open("tfidf_vectorizer.pkl", "rb") as f:
    tfidf = pickle.load(f)

# Transform the names with the already-fitted vocabulary (no refit)
tfidf_matrix = tfidf.transform(df['Dataset_name'])

# Compute pairwise cosine similarity between all dataset names
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Save both the DataFrame and the similarity matrix
with open("df.pkl", "wb") as f:
    pickle.dump(df, f)

with open("cosine_sim.pkl", "wb") as f:
    pickle.dump(cosine_sim, f)

print("✅ df.pkl and cosine_sim.pkl saved successfully!")

✅ df.pkl and cosine_sim.pkl saved successfully!


In [6]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Read the dataset catalogue from Excel
df = pd.read_excel("kaggle-prenew.xlsx")   # Replace with your file name

# Step 2: Replace missing dataset names with empty strings
df['Dataset_name'] = df['Dataset_name'].fillna("")

# Step 3: Fit a TF-IDF vectorizer (English stop words removed) on the names
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(
    df['Dataset_name'].astype(str)
)

# Step 4: Bundle the fitted vectorizer together with the DataFrame
model = dict(tfidf=tfidf_vectorizer, df=df)

# Step 5: Serialize the bundle to model.pkl
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

print("✅ model.pkl created successfully! (TF-IDF + DataFrame only)")


✅ model.pkl created successfully! (TF-IDF + DataFrame only)


In [1]:
import pickle
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Load model.pkl
# NOTE: pickle.load can execute arbitrary code; only open pickle files that
# this notebook produced itself.
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

# Step 2: Extract vectorizer and dataframe
tfidf_vectorizer = model['tfidf']
df = model['df']

# Step 3: User input (free-text keywords describing the wanted dataset)
user_input = "clothing size fashion wear shopping"
# Step 4: Transform input into TF-IDF vector using the fitted vocabulary
user_vector = tfidf_vectorizer.transform([user_input])

# Step 5: Compute cosine similarity with all dataset entries
dataset_vectors = tfidf_vectorizer.transform(df['Dataset_name'].astype(str))
similarities = cosine_similarity(user_vector, dataset_vectors)

# Step 6: Get top 5 similar datasets (argsort is ascending, so take the tail
# and reverse it to get best-first order)
top_n = 5
scores = similarities[0]
top_indices = scores.argsort()[-top_n:][::-1]
# A score of 0 means the query shares no term with that name; reporting such
# rows as "matches" would be misleading, so drop them.
top_indices = top_indices[scores[top_indices] > 0]

# Step 7: Display results
print("Top matches:\n")
if len(top_indices) == 0:
    print("No matching datasets found.")
for i in top_indices:
    print(f"→ {df.iloc[i]['Dataset_name']} (Similarity: {scores[i]:.4f})")


Top matches:

→ Fashion Clothing Products Dataset (Similarity: 0.5050)
→ Ajio Fashion Clothing  (Similarity: 0.4502)
→ Shopping (Similarity: 0.4427)
→ Clothing Fit Dataset for Size Recommendation (Similarity: 0.4218)
→ Fashion Dataset (Similarity: 0.3790)


In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import pickle

# Read the catalogue and blank out missing dataset names
df = pd.read_excel("kaggle-prenew.xlsx")
df["Dataset_name"] = df["Dataset_name"].fillna("")

# Fit TF-IDF (English stop words removed) on the names cast to str
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(
    df["Dataset_name"].astype(str)
)

# Bundle the DataFrame with the fitted vectorizer (the vectorizer itself,
# not the transformed matrix, is what gets saved)
model = dict(df=df, tfidf=tfidf)

with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

print("✅ model.pkl saved successfully.")


✅ model.pkl saved successfully.
