<a href="https://colab.research.google.com/github/Rifthi-tech/recommendation_project/blob/main/Recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Setting Up the Environment

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from google.colab import drive
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides library deprecation/future warnings (e.g. from pandas);
# consider narrowing the filter to specific categories — confirm intent.
warnings.filterwarnings('ignore')

# Confirmation printed once all imports above have resolved
print("✅ Environment setup completed successfully!")


✅ Environment setup completed successfully!


# 2. Import data

In [None]:
# Load the raw product CSV from Google Drive and print a short overview.
# Any failure (mount, missing file, parse error) is reported instead of raised,
# in which case `df` stays undefined for the cells that follow.
try:
    # The path below lives under /content/drive, which only exists once Drive is
    # mounted; mounting here keeps the notebook runnable after a runtime restart.
    drive.mount('/content/drive')

    # Replace with your actual file path
    file_path = '/content/drive/MyDrive/Recommendation/test_data.csv'
    df = pd.read_csv(file_path)

    # Display first few rows
    print("Dataset loaded successfully. Here are the first few rows:")
    display(df.head())

    # Basic dataset info
    print("\nDataset Info:")
    print(f"- Total records: {len(df)}")
    print(f"- Columns: {df.columns.tolist()}")
    print(f"- Missing values:\n{df.isnull().sum()}")

    # Success Message
    print(f"\n✅ Data loading completed successfully! Loaded {len(df)} records.")
except Exception as e:
    print(f"❌ Error loading dataset: {str(e)}")


Dataset loaded successfully. Here are the first few rows:


Unnamed: 0,product_id,title,product_description,rating,initial_price,images,product_details,product_specifications,category
0,8376765,Lino Perros,Women Navy Blue Solid Backpack,3.8,3995,http://assets.myntassets.com/assets/images/837...,"{""description"":""Navy Blue solid backpackNon-Pa...","[{""specification_name"":""Add-Ons"",""specificatio...",backpacks
1,9136281,Tommy Hilfiger,Unisex Navy Blue Striped Backpack,4.5,2899,http://assets.myntassets.com/assets/images/913...,"{""description"":""Navy Blue backpackNon-Padded h...","[{""specification_name"":""Add-Ons"",""specificatio...",backpacks
2,17633752,Lavie,Aries Women Pink Mini Backpack,4.4,2999,http://assets.myntassets.com/assets/images/176...,"{""description"":""Pink solid backpacks Non-padd...","[{""specification_name"":""Add-Ons"",""specificatio...",backpacks
3,1376949,F Gear,Unisex Navy & Grey Printed Burner Backpack,4.4,1675,http://assets.myntassets.com/assets/images/137...,"{""description"":""Navy and grey printed backpack...","[{""specification_name"":""Compartment Closure"",""...",backpacks
4,13939916,MYTRIDENT,Men Blue Solid Bath Robe,4.7,2899,http://assets.myntassets.com/assets/images/pro...,"{""description"":""Blue solid bath robe, has a sh...","[{""specification_name"":""Body or Garment Size"",...",bath-robe



Dataset Info:
- Total records: 1000
- Columns: ['product_id', 'title', 'product_description', 'rating', 'initial_price', 'images', 'product_details', 'product_specifications', 'category']
- Missing values:
product_id                0
title                     0
product_description       0
rating                    0
initial_price             0
images                    0
product_details           0
product_specifications    0
category                  0
dtype: int64

✅ Data loading completed successfully! Loaded 1000 records.


# 3. Data Cleaning

In [None]:
def clean_data(df):
    """Return a cleaned copy of the raw product DataFrame.

    Each step is applied only when the relevant column is present:

    1. Drop exact duplicate rows.
    2. Coerce ``rating`` to numeric, then drop rows whose rating is missing,
       unparseable, or equal to 0.
    3. Drop rows that have no ``images`` value.
    4. Fill missing text columns with a ``'No <column> available'`` placeholder.
    5. Add a numeric ``price`` column derived from ``initial_price``.
    6. Drop rows that still have nulls in the critical columns and reset the
       index to 0..n-1.

    A short cleaning report is printed. The input DataFrame is not modified.

    Args:
        df: Raw product DataFrame.

    Returns:
        pandas.DataFrame: The cleaned data with a fresh positional index.
    """
    # Work on an independent copy so the caller's frame is never touched
    clean_df = df.drop_duplicates().copy()
    duplicates_removed = len(df) - len(clean_df)
    rows_after_dedup = len(clean_df)

    # Coerce ratings BEFORE filtering so string-encoded values such as "0"
    # are recognised as zero and unparseable values become NaN.
    zero_ratings_removed = 0
    if 'rating' in clean_df.columns:
        clean_df['rating'] = pd.to_numeric(clean_df['rating'], errors='coerce')
        zero_mask = clean_df['rating'] == 0
        zero_ratings_removed = int(zero_mask.sum())
        clean_df = clean_df[clean_df['rating'].notna() & ~zero_mask]

    # Remove products with no image if column exists
    if 'images' in clean_df.columns:
        clean_df = clean_df[clean_df['images'].notna()]

    # Own the filtered data before assigning columns (avoids chained-assignment issues)
    clean_df = clean_df.copy()

    # Fill missing free-text fields with a placeholder; assign the result back
    # rather than using an in-place fillna on a column selection.
    text_columns = ['product_description', 'product_details', 'product_specifications']
    for col in text_columns:
        if col in clean_df.columns:
            clean_df[col] = clean_df[col].fillna(f'No {col} available')

    # Numeric price column used by later steps
    if 'initial_price' in clean_df.columns:
        clean_df['price'] = pd.to_numeric(clean_df['initial_price'], errors='coerce')

    # Remove rows with null values in critical columns (only those that exist)
    critical_columns = ['product_id', 'title', 'category', 'rating', 'price']
    existing_critical_cols = [col for col in critical_columns if col in clean_df.columns]
    if existing_critical_cols:
        clean_df = clean_df.dropna(subset=existing_critical_cols)

    clean_df = clean_df.reset_index(drop=True)

    # Rows dropped for any reason other than a zero rating
    other_removed = rows_after_dedup - zero_ratings_removed - len(clean_df)

    # Print cleaning report
    print(f"🔧 Data Cleaning Report:")
    print(f"- Removed {duplicates_removed} duplicate rows")
    if 'rating' in df.columns:
        print(f"- Removed {zero_ratings_removed} rows with rating = 0")
    if other_removed > 0:
        print(f"- Removed {other_removed} rows with missing or invalid values")
    print(f"- Final dataset contains {len(clean_df)} records after cleaning")

    # Show columns with remaining missing values (if any)
    missing_values = clean_df.isnull().sum()
    if missing_values.sum() > 0:
        print("\n⚠️ Remaining missing values after cleaning:")
        print(missing_values[missing_values > 0])

    return clean_df

# Run the cleaning step on the loaded DataFrame and preview the result.
try:
    cleaned_df = clean_data(df)

    # Lift pandas' display limits so previews are not truncated
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)

    # Preview the first rows, or warn when nothing survived cleaning
    print("\nCleaned Data Sample:")
    if cleaned_df.empty:
        print("Warning: Cleaned DataFrame is empty!")
    else:
        display(cleaned_df.head())

    print("✅ Data cleaning completed successfully!")

except NameError:
    # Raised when the loading cell has not produced `df`
    print("❌ Error: 'df' is not defined. Please load your DataFrame first.")
except Exception as err:
    print(f"❌ An unexpected error occurred: {str(err)}")


🔧 Data Cleaning Report:
- Removed 0 duplicate rows
- Removed 114 rows with rating = 0
- Final dataset contains 886 records after cleaning

Cleaned Data Sample:


Unnamed: 0,product_id,title,product_description,rating,initial_price,images,product_details,product_specifications,category,price
0,8376765,Lino Perros,Women Navy Blue Solid Backpack,3.8,3995,http://assets.myntassets.com/assets/images/837...,"{""description"":""Navy Blue solid backpackNon-Pa...","[{""specification_name"":""Add-Ons"",""specificatio...",backpacks,3995
1,9136281,Tommy Hilfiger,Unisex Navy Blue Striped Backpack,4.5,2899,http://assets.myntassets.com/assets/images/913...,"{""description"":""Navy Blue backpackNon-Padded h...","[{""specification_name"":""Add-Ons"",""specificatio...",backpacks,2899
2,17633752,Lavie,Aries Women Pink Mini Backpack,4.4,2999,http://assets.myntassets.com/assets/images/176...,"{""description"":""Pink solid backpacks Non-padd...","[{""specification_name"":""Add-Ons"",""specificatio...",backpacks,2999
3,1376949,F Gear,Unisex Navy & Grey Printed Burner Backpack,4.4,1675,http://assets.myntassets.com/assets/images/137...,"{""description"":""Navy and grey printed backpack...","[{""specification_name"":""Compartment Closure"",""...",backpacks,1675
4,13939916,MYTRIDENT,Men Blue Solid Bath Robe,4.7,2899,http://assets.myntassets.com/assets/images/pro...,"{""description"":""Blue solid bath robe, has a sh...","[{""specification_name"":""Body or Garment Size"",...",bath-robe,2899


✅ Data cleaning completed successfully!


# 4. Feature Extraction

In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd

def extract_features(df):
    """Build the feature matrices used by the recommender.

    The primary feature is the ``rating`` column, min-max scaled into a single
    column. When a ``product_description`` column exists, a TF-IDF matrix
    (English stop words removed, at most 500 terms) is built from it as a
    secondary feature.

    Args:
        df: Cleaned product DataFrame; must be non-empty and contain ``rating``.

    Returns:
        tuple: ``(features, text_features, tfidf)`` where ``features`` is the
        scaled rating column of shape ``(n_rows, 1)``, ``text_features`` is the
        TF-IDF matrix or ``None``, and ``tfidf`` is the fitted vectorizer or
        ``None``. On any error the problem is printed and
        ``(None, None, None)`` is returned instead of raising.
    """
    try:
        # Validate input up front so the error messages are specific
        if df.empty:
            raise ValueError("Input DataFrame is empty")
        if 'rating' not in df.columns:
            raise ValueError("No 'rating' column found for feature extraction")

        # 1. Rating feature (primary): one column, scaled by MinMaxScaler
        rating_column = df['rating'].values.reshape(-1, 1)
        features = MinMaxScaler().fit_transform(rating_column)

        print("\n📊 Feature Extraction Report:")
        print("- Primary feature: rating (normalized)")
        print(f"- Feature matrix shape: {features.shape}")

        # 2. Text features using TF-IDF (secondary feature).
        # Both names are initialised so the return below is valid even when the
        # description column is absent.
        text_features = None
        tfidf = None
        if 'product_description' in df.columns:
            tfidf = TfidfVectorizer(stop_words='english', max_features=500)
            # The vectorizer needs strings; treat missing descriptions as empty text
            descriptions = df['product_description'].fillna('').astype(str)
            text_features = tfidf.fit_transform(descriptions)
            print("- Secondary features: text features extracted from descriptions")
        else:
            print("ℹ️ Info: 'product_description' column not found - skipping text features")

        return features, text_features, tfidf

    except Exception as e:
        print(f"❌ Error in feature extraction: {str(e)}")
        return None, None, None

# Run feature extraction on the cleaned data and report the resulting shapes.
try:
    # Fail with a readable message when the cleaning cell has not been run
    if 'cleaned_df' not in globals():
        raise NameError("'cleaned_df' not found. Please run data cleaning first.")

    features, text_features, tfidf = extract_features(cleaned_df)

    if features is None:
        # extract_features signals failure by returning None values
        print("❌ Feature extraction failed - no features could be extracted")
    else:
        print("\n✅ Feature extraction completed successfully!")

        # Summarise the shape of each feature matrix
        print("\nFeature Shapes:")
        print(f"- Rating features: {features.shape}")
        if text_features is not None:
            print(f"- Text features: {text_features.shape}")

except NameError as name_err:
    print(f"❌ Error: {str(name_err)}")
except Exception as err:
    print(f"❌ An unexpected error occurred: {str(err)}")



📊 Feature Extraction Report:
- Primary feature: rating (normalized)
- Feature matrix shape: (886, 1)
- Secondary features: text features extracted from descriptions

✅ Feature extraction completed successfully!

Feature Shapes:
- Rating features: (886, 1)
- Text features: (886, 500)


# 5. Model Training

In [None]:
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
import numpy as np

def train_models(df, features, text_features=None, tfidf=None,
                 rating_weight=0.7, text_weight=0.3):
    """Fit the nearest-neighbour model used for recommendations.

    The rating features and, when supplied, the text features are each scaled
    by their weight and stacked side by side; a ``NearestNeighbors`` model with
    cosine metric is then fitted on the stacked matrix. The model is fitted on
    every row of ``df`` - no hold-out split is made.

    Args:
        df: Cleaned product DataFrame; only its length is used here.
        features: Rating feature matrix with one row per row of ``df``.
        text_features: Optional text feature matrix (dense, or sparse with a
            ``toarray`` method) with the same number of rows.
        tfidf: Optional fitted vectorizer to store with the model. When
            omitted, a module-level ``tfidf`` variable is used if one exists.
        rating_weight: Multiplier applied to the rating features.
        text_weight: Multiplier applied to the text features.

    Returns:
        dict | None: ``{'knn_model', 'combined_features', 'tfidf'}`` on
        success; ``None`` after printing the error if anything fails.
    """
    try:
        if features is None:
            raise ValueError("No rating features supplied - run feature extraction first")
        # Row i of the feature matrix must describe row i of df, otherwise the
        # neighbour indices returned later would point at the wrong products.
        if features.shape[0] != len(df):
            raise ValueError(
                f"Feature rows ({features.shape[0]}) do not match DataFrame rows ({len(df)})"
            )

        # 1. KNN Model for high-rating recommendations
        print("\n🏗️ Training K-Nearest Neighbors Model for High-Rating Recommendations...")

        # Combine rating features with text features (if available)
        if text_features is not None:
            # Convert text features to dense array if sparse
            if hasattr(text_features, 'toarray'):
                text_features_array = text_features.toarray()
            else:
                text_features_array = text_features

            combined_features = np.hstack([
                features * rating_weight,
                text_features_array * text_weight,
            ])
        else:
            combined_features = features  # Use only rating features

        # Never ask for more neighbours than there are rows
        knn_model = NearestNeighbors(n_neighbors=min(20, len(df)), metric='cosine')
        knn_model.fit(combined_features)

        print("✅ KNN model trained successfully for high-rating recommendations!")

        # Prefer the explicitly passed vectorizer; fall back to the notebook
        # global for callers that do not pass one.
        if tfidf is None:
            tfidf = globals().get('tfidf')

        return {
            'knn_model': knn_model,
            'combined_features': combined_features,
            'tfidf': tfidf
        }
    except Exception as e:
        print(f"❌ Error in model training: {str(e)}")
        return None

# Fit the recommendation model on the cleaned data and report the outcome;
# train_models returns None when training failed.
models = train_models(cleaned_df, features, text_features)
if models is None:
    print("\n❌ Model training failed")
else:
    print("\n🎉 Machine learning model trained successfully for high-rating recommendations!")



🏗️ Training K-Nearest Neighbors Model for High-Rating Recommendations...
✅ KNN model trained successfully for high-rating recommendations!

🎉 Machine learning model trained successfully for high-rating recommendations!


# 6. Recommendation Functions

In [None]:
def get_recommendations(product_id, top_n=10, df=None, model_bundle=None):
    """Print and return the nearest-neighbour recommendations for one product.

    The product's row in the combined feature matrix is used as the query for
    the fitted KNN model. The query product is removed from the neighbours and
    the rest are ordered by rating, highest first.

    Args:
        product_id: Value to look up in the ``product_id`` column.
        top_n: Maximum number of recommendations to return.
        df: Product DataFrame whose row order matches the feature matrix.
            Defaults to the notebook-level ``cleaned_df``.
        model_bundle: Dict with ``'knn_model'`` and ``'combined_features'``.
            Defaults to the notebook-level ``models``.

    Returns:
        pandas.DataFrame | None: Recommended rows sorted by rating (descending),
        or ``None`` after printing the error if anything fails, including an
        unknown ``product_id``.
    """
    try:
        if df is None:
            df = cleaned_df
        if model_bundle is None:
            model_bundle = models

        # Locate the product by POSITION: the feature matrix is positional, so
        # the lookup must not depend on the DataFrame's index labels.
        matches = np.flatnonzero((df['product_id'] == product_id).to_numpy())
        if len(matches) == 0:
            raise ValueError(f"Product with ID '{product_id}' not found in database")
        pos = int(matches[0])
        query_row = df.iloc[pos]

        # Get the combined features for the query product
        feature_matrix = model_bundle['combined_features']
        query_features = feature_matrix[pos:pos + 1]

        # Ask for one extra neighbour (the query itself), but never for more
        # neighbours than there are fitted samples.
        n_neighbors = min(top_n + 1, feature_matrix.shape[0])
        _, indices = model_bundle['knn_model'].kneighbors(
            query_features, n_neighbors=n_neighbors
        )

        # Drop the query by position rather than assuming it is listed first:
        # products with identical features tie at distance 0 in any order.
        neighbor_positions = np.asarray(indices).flatten()
        similar_indices = neighbor_positions[neighbor_positions != pos][:top_n]
        recommendations = df.iloc[similar_indices]

        # Highest ratings first; a stable sort keeps nearer neighbours ahead on ties
        recommendations = recommendations.sort_values(
            'rating', ascending=False, kind='stable'
        )

        # Display recommendations
        print(f"✨ High-Rating Recommendations for product: {query_row['title']}")
        print(f"⭐ Average Rating: {query_row['rating']}")
        print("\nTop Recommendations (Highest Rated Similar Products):")

        display_cols = ['product_id', 'title', 'category', 'rating', 'price']
        display_cols = [col for col in display_cols if col in recommendations.columns]

        if len(display_cols) > 0:
            display(recommendations[display_cols].head(top_n))
        else:
            # None of the preferred columns exist: show the first few columns instead
            display(recommendations.iloc[:, :min(5, recommendations.shape[1])].head(top_n))

        return recommendations

    except Exception as e:
        print(f"❌ Error generating recommendations: {str(e)}")
        return None

# Smoke-test the recommender using the highest-rated product in the cleaned data
try:
    # First row after sorting by rating (descending) supplies the query product ID
    sample_product = (
        cleaned_df
        .sort_values('rating', ascending=False)
        .iloc[0]['product_id']
    )

    print(f"\nTesting recommendation system with high-rated product ID: {sample_product}")
    get_recommendations(sample_product, top_n=10)

    print("\n✅ Recommendation function working successfully for high-rating products!")
except Exception as err:
    print(f"❌ Error testing recommendation function: {str(err)}")



Testing recommendation system with high-rated product ID: 21427828
✨ High-Rating Recommendations for product: London Rag
⭐ Average Rating: 5.0

Top Recommendations (Highest Rated Similar Products):


Unnamed: 0,product_id,title,category,rating,price
331,19008496,Calvin Klein Jeans,jeans,5.0,7999
466,15880308,Sirona,shaving-essentials,4.9,294
40,21850852,Soie,briefs,4.9,390
332,18992754,AMERICAN EAGLE OUTFITTERS,jeans,4.8,4499
579,18947600,KASSUALLY,shorts,4.7,1899
38,16696094,Van Heusen,briefs,4.6,649
343,18903330,Levis,jeans,4.4,3299
807,17628712,Roadster,track-pants,4.3,1799
814,17647372,bebe,trousers,4.2,2899
19,1728397,Mactree,boots,3.9,9060



✅ Recommendation function working successfully for high-rating products!


# 7. Export Data

In [None]:
import os

def export_trained_data(df, models, save_dir='/content/drive/MyDrive/Recommendation/'):
    """Write the cleaned products and per-product recommendations to CSV.

    Two files are written into ``save_dir``:

    * ``cleaned_products.csv`` - ``df`` without its index.
    * ``high_rating_recommendations.csv`` - for every product, its nearest
      neighbours (the product itself excluded) ranked by rating, highest first.
      Written only when ``models`` contains a ``'knn_model'``.

    ``similarity_score`` is ``1 - distance`` for that specific neighbour, using
    the distance returned by the model's ``kneighbors`` call.

    Args:
        df: Cleaned product DataFrame with ``product_id``, ``title`` and
            ``rating`` columns, in the same row order as the feature matrix.
        models: Dict with ``'knn_model'`` and ``'combined_features'``.
        save_dir: Target directory; created if it does not exist.

    Returns:
        None. Errors are printed rather than raised.
    """
    try:
        # Create directory if it doesn't exist
        os.makedirs(save_dir, exist_ok=True)
        exported = []

        # 1. Save cleaned product data
        product_data_path = os.path.join(save_dir, 'cleaned_products.csv')
        df.to_csv(product_data_path, index=False)
        exported.append("- Cleaned products: cleaned_products.csv")

        # 2. Save KNN recommendations for all products
        if models and 'knn_model' in models:
            distances, indices = models['knn_model'].kneighbors(models['combined_features'])
            distances = np.asarray(distances)
            indices = np.asarray(indices)

            # One neighbour slot is reserved for the product itself
            n_recs = indices.shape[1] - 1

            # Pull the needed columns out once instead of building a row Series per neighbour
            product_ids = df['product_id'].to_numpy()
            titles = df['title'].to_numpy()
            ratings = df['rating'].to_numpy(dtype=float)

            recommendations_list = []
            for pos in range(len(df)):
                # Exclude the product by position; with tied distances it is not
                # guaranteed to be the first neighbour returned.
                not_self = indices[pos] != pos
                nbr_idx = indices[pos][not_self][:n_recs]
                nbr_dist = distances[pos][not_self][:n_recs]

                # Rank by rating (descending). Distances are looked up through the
                # same permutation so each score stays with its own neighbour.
                order = np.argsort(-ratings[nbr_idx], kind='stable')
                for rank, k in enumerate(order, start=1):
                    nbr = nbr_idx[k]
                    recommendations_list.append({
                        'product_id': product_ids[pos],
                        'recommended_product_id': product_ids[nbr],
                        'recommended_product_title': titles[nbr],
                        'recommended_product_rating': ratings[nbr],
                        'rank': rank,
                        'similarity_score': 1 - nbr_dist[k]  # Convert distance to similarity
                    })

            # Explicit columns keep the CSV header intact even with zero rows
            recommendations_df = pd.DataFrame(
                recommendations_list,
                columns=[
                    'product_id', 'recommended_product_id', 'recommended_product_title',
                    'recommended_product_rating', 'rank', 'similarity_score',
                ],
            )
            recommendations_path = os.path.join(save_dir, 'high_rating_recommendations.csv')
            recommendations_df.to_csv(recommendations_path, index=False)
            exported.append("- High-rating recommendations: high_rating_recommendations.csv")

        # Report only the files that were actually written
        print("\n📦 Exported Files:")
        for line in exported:
            print(line)

        print("\n✅ All data exported successfully to CSV files!")

    except Exception as e:
        print(f"❌ Error exporting data: {str(e)}")

# Export the cleaned product data and the trained model's recommendations to CSV
export_trained_data(cleaned_df, models)



📦 Exported Files:
- Cleaned products: cleaned_products.csv
- High-rating recommendations: high_rating_recommendations.csv

✅ All data exported successfully to CSV files!
