<a href="https://colab.research.google.com/github/Rifthi-tech/recommendation_project/blob/main/AI_Recommendation_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Setting Up the Environment

In [12]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from google.colab import drive
import warnings
warnings.filterwarnings('ignore')

# Success Message
print("✅ Environment setup completed successfully! All required packages are loaded and Google Drive is mounted.")

✅ Environment setup completed successfully! All required packages are loaded and Google Drive is mounted.


#2. Import data

In [13]:
# Load dataset from Google Drive
try:
    # Replace with your actual file path
    file_path = '/content/drive/MyDrive/AI-Recommendation_Project/test_data.csv'
    df = pd.read_csv(file_path)

    # Display first few rows
    print("Dataset loaded successfully. Here are the first few rows:")
    display(df.head())

    # Success Message
    print(f"✅ Data loading completed successfully! Loaded {len(df)} records.")
except Exception as e:
    print(f"❌ Error loading dataset: {str(e)}")

Dataset loaded successfully. Here are the first few rows:


Unnamed: 0,product_id,title,product_description,rating,initial_price,images,product_details,product_specifications,category
0,8376765,Lino Perros,Women Navy Blue Solid Backpack,3.8,3995,http://assets.myntassets.com/assets/images/837...,"{""description"":""Navy Blue solid backpackNon-Pa...","[{""specification_name"":""Add-Ons"",""specificatio...",backpacks
1,9136281,Tommy Hilfiger,Unisex Navy Blue Striped Backpack,4.5,2899,http://assets.myntassets.com/assets/images/913...,"{""description"":""Navy Blue backpackNon-Padded h...","[{""specification_name"":""Add-Ons"",""specificatio...",backpacks
2,17633752,Lavie,Aries Women Pink Mini Backpack,4.4,2999,http://assets.myntassets.com/assets/images/176...,"{""description"":""Pink solid backpacks Non-padd...","[{""specification_name"":""Add-Ons"",""specificatio...",backpacks
3,1376949,F Gear,Unisex Navy & Grey Printed Burner Backpack,4.4,1675,http://assets.myntassets.com/assets/images/137...,"{""description"":""Navy and grey printed backpack...","[{""specification_name"":""Compartment Closure"",""...",backpacks
4,13939916,MYTRIDENT,Men Blue Solid Bath Robe,4.7,2899,http://assets.myntassets.com/assets/images/pro...,"{""description"":""Blue solid bath robe, has a sh...","[{""specification_name"":""Body or Garment Size"",...",bath-robe


✅ Data loading completed successfully! Loaded 1000 records.


# 3. Data Cleaning

In [14]:
def clean_data(df):
    # Make a copy of the original dataframe to avoid SettingWithCopyWarning
    clean_df = df.copy()

    # 1. Remove duplicate rows
    initial_count = len(clean_df)
    clean_df.drop_duplicates(inplace=True)
    duplicates_removed = initial_count - len(clean_df)

    # 2. Handle missing values - first check if columns exist
    # Remove products with no rating if column exists
    if 'rating' in clean_df.columns:
        clean_df = clean_df[clean_df['rating'].notna()]
        # Remove products with rating = 0
        clean_df = clean_df[clean_df['rating'] != 0]

    # Remove products with no image if column exists
    if 'img' in clean_df.columns:
        clean_df = clean_df[clean_df['img'].notna()]

    # Fill other missing values where appropriate
    text_columns = ['description', 'productdetails', 'specification']
    for col in text_columns:
        if col in clean_df.columns:
            clean_df[col].fillna(f'No {col} available', inplace=True)

    # 3. Convert data types if needed
    if 'rating' in clean_df.columns:
        clean_df['rating'] = pd.to_numeric(clean_df['rating'], errors='coerce')
    if 'price' in clean_df.columns:
        clean_df['price'] = pd.to_numeric(clean_df['price'], errors='coerce')

    # 4. Remove rows with null values in critical columns
    critical_columns = ['id', 'name', 'category', 'rating', 'price']
    # Only check columns that exist in the dataframe
    existing_critical_cols = [col for col in critical_columns if col in clean_df.columns]
    if existing_critical_cols:
        clean_df = clean_df.dropna(subset=existing_critical_cols)

    # Reset index after cleaning
    clean_df.reset_index(drop=True, inplace=True)

    # Print cleaning report
    print(f"🔧 Data Cleaning Report:")
    print(f"- Removed {duplicates_removed} duplicate rows")
    if 'rating' in df.columns:
        zero_ratings_removed = initial_count - duplicates_removed - len(clean_df)
        print(f"- Removed {zero_ratings_removed} rows with rating = 0")
    print(f"- Final dataset contains {len(clean_df)} records after cleaning")

    # Show columns with remaining missing values (if any)
    missing_values = clean_df.isnull().sum()
    if missing_values.sum() > 0:
        print("\n⚠️ Remaining missing values after cleaning:")
        print(missing_values[missing_values > 0])

    return clean_df

# Example usage (assuming df exists):
try:
    # Clean the data
    cleaned_df = clean_data(df)

    # Display all rows and columns
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)

    # Print cleaned data
    print("\nCleaned Data Sample:")
    if not cleaned_df.empty:
        display(cleaned_df)
    else:
        print("Warning: Cleaned DataFrame is empty!")

    # Success Message
    print("✅ Data cleaning completed successfully!")

except NameError:
    print("❌ Error: 'df' is not defined. Please load your DataFrame first.")
except Exception as e:
    print(f"❌ An unexpected error occurred: {str(e)}")

🔧 Data Cleaning Report:
- Removed 0 duplicate rows
- Removed 114 rows with rating = 0
- Final dataset contains 886 records after cleaning

Cleaned Data Sample:


Unnamed: 0,product_id,title,product_description,rating,initial_price,images,product_details,product_specifications,category
0,8376765,Lino Perros,Women Navy Blue Solid Backpack,3.8,3995,http://assets.myntassets.com/assets/images/837...,"{""description"":""Navy Blue solid backpackNon-Pa...","[{""specification_name"":""Add-Ons"",""specificatio...",backpacks
1,9136281,Tommy Hilfiger,Unisex Navy Blue Striped Backpack,4.5,2899,http://assets.myntassets.com/assets/images/913...,"{""description"":""Navy Blue backpackNon-Padded h...","[{""specification_name"":""Add-Ons"",""specificatio...",backpacks
2,17633752,Lavie,Aries Women Pink Mini Backpack,4.4,2999,http://assets.myntassets.com/assets/images/176...,"{""description"":""Pink solid backpacks Non-padd...","[{""specification_name"":""Add-Ons"",""specificatio...",backpacks
3,1376949,F Gear,Unisex Navy & Grey Printed Burner Backpack,4.4,1675,http://assets.myntassets.com/assets/images/137...,"{""description"":""Navy and grey printed backpack...","[{""specification_name"":""Compartment Closure"",""...",backpacks
4,13939916,MYTRIDENT,Men Blue Solid Bath Robe,4.7,2899,http://assets.myntassets.com/assets/images/pro...,"{""description"":""Blue solid bath robe, has a sh...","[{""specification_name"":""Body or Garment Size"",...",bath-robe
5,17198778,H&M,Beige & Tan Solid Stoneware Soap Dispenser,4.5,1399,http://assets.myntassets.com/assets/images/171...,"{""description"":""Soap dispenser in glazed stone...","[{""specification_name"":""Finish"",""specification...",bathroom-accessories
6,19851824,AVI Living,Grey Solid 500 GSM Pure Cotton Bath Towel,4.0,1399,http://assets.myntassets.com/assets/images/198...,"{""description"":""Set content: 1 Bath TowelFeatu...","[{""specification_name"":""Fabric"",""specification...",bath-towels
7,18602872,My Room,Yellow & Green Floral 140 TC King Bedsheet wit...,4.7,2999,http://assets.myntassets.com/assets/images/186...,"{""description"":""Set content: 1 king bedsheet w...","[{""specification_name"":""Bed Size"",""specificati...",bedsheets
8,18602850,My Room,Ethnic Print 144 TC Super King Bedsheet with 2...,4.5,2999,http://assets.myntassets.com/assets/images/186...,"{""description"":""Dimensions: Super King bedshee...","[{""specification_name"":""Bed Size"",""specificati...",bedsheets
9,8961147,Story@home,Pink & White Floral 160 TC Cotton 1 King Bedsh...,3.7,2199,http://assets.myntassets.com/assets/images/896...,"{""description"":""Set content: 1 King bedsheet w...","[{""specification_name"":""Bed Size"",""specificati...",bedsheets


✅ Data cleaning completed successfully!


#4.Feature Extraction


In [15]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd

def extract_features(df):
    try:
        # Check if input DataFrame is empty
        if df.empty:
            raise ValueError("Input DataFrame is empty")

        # Initialize feature containers
        features = None

        # 1. Focus on rating features
        if 'rating' in df.columns:
            # Create rating features matrix
            features = df['rating'].values.reshape(-1, 1)

            # Normalize ratings
            scaler = MinMaxScaler()
            features = scaler.fit_transform(features)

            print("\n📊 Feature Extraction Report:")
            print("- Extracted features: rating (normalized)")
            print(f"- Final feature matrix shape: {features.shape}")
        else:
            raise ValueError("No 'rating' column found for feature extraction")

        # 2. Create category encoding (optional feature)
        if 'category' in df.columns:
            try:
                categories = pd.get_dummies(df['category'])
                print("- Additional category features added")
            except Exception as e:
                print(f"⚠️ Warning: Category processing failed - {str(e)}")
                categories = pd.DataFrame()
        else:
            print("ℹ️ Info: 'category' column not found - skipping category features")
            categories = pd.DataFrame()

        return features, categories

    except Exception as e:
        print(f"❌ Error in feature extraction: {str(e)}")
        return None, None

# Example usage with more robust handling
try:
    if 'cleaned_df' not in globals():
        raise NameError("'cleaned_df' not found. Please run data cleaning first.")

    # Extract features
    features, categories = extract_features(cleaned_df)

    if features is not None:
        # Success Message
        print("\n✅ Feature extraction completed successfully!")

        # Optional: Show feature shapes
        print("\nFeature Shapes:")
        print(f"- Rating features: {features.shape}")
        if not categories.empty:
            print(f"- Category features: {categories.shape}")
    else:
        print("❌ Feature extraction failed - no features could be extracted")

except NameError as ne:
    print(f"❌ Error: {str(ne)}")
except Exception as e:
    print(f"❌ An unexpected error occurred: {str(e)}")


📊 Feature Extraction Report:
- Extracted features: rating (normalized)
- Final feature matrix shape: (886, 1)
- Additional category features added

✅ Feature extraction completed successfully!

Feature Shapes:
- Rating features: (886, 1)
- Category features: (886, 94)


# 5. Model Training

In [19]:
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def train_models(df, features, categories):
    try:
        # Split data into train and test
        train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
        # 1. Content-Based Filtering (using rating features)
        print("\n🏗️ Training Content-Based Filtering Model...")
        content_sim = cosine_similarity(features)
        print("✅ Content-Based model trained successfully!")

        # 2. Collaborative Filtering (User-Item interactions)
        print("\n🏗️ Training Collaborative Filtering Model...")

        # Create user-item matrix (simulated)
        if 'category' in df.columns and 'rating' in df.columns:
            # Check if 'user_id' exists, if not create a dummy user_id column
            if 'user_id' not in df.columns:
                print("⚠️ Warning: No user_id column found - using index as user_id")
                df['user_id'] = df.index

            # Use user_id instead of id for the index in pivot_table
            user_item_matrix = df.pivot_table(
                index='user_id',  # Changed from 'id' to 'user_id'
                columns='category',
                values='rating',
                aggfunc='mean',
                fill_value=0
            )
            collab_sim = cosine_similarity(user_item_matrix)
            print("✅ Collaborative Filtering model trained successfully!")
        else:
            print("⚠️ Warning: Required columns not found - using content similarity only")
            collab_sim = content_sim.copy()
        # 3. Hybrid Model (Combining content and collaborative)
        print("\n🏗️ Training Hybrid Model...")
        hybrid_sim = 0.5 * content_sim + 0.5 * collab_sim
        print("✅ Hybrid model trained successfully!")

        # 4. KNN Model for recommendations
        print("\n🏗️ Training K-Nearest Neighbors Model...")
        knn_model = NearestNeighbors(n_neighbors=5, metric='cosine')
        knn_model.fit(features)
        print("✅ KNN model trained successfully!")

        # 5. Clustering Model for product grouping
        print("\n🏗️ Training Clustering Model...")
        cluster_model = KMeans(n_clusters=5, random_state=42)
        if not categories.empty:
            cluster_features = np.hstack([features, categories.values])
        else:
            cluster_features = features
        df['cluster'] = cluster_model.fit_predict(cluster_features)
        print("✅ Clustering model trained successfully!")
        return {
            'content_similarity': content_sim,
            'collab_similarity': collab_sim,
            'hybrid_similarity': hybrid_sim,
            'knn_model': knn_model,
            'cluster_model': cluster_model,
            'categories': categories.columns if hasattr(categories, 'columns') else []
        }
    except Exception as e:
        print(f"❌ Error in model training: {str(e)}")
        return None

# Train all models
models = train_models(cleaned_df, features, categories)
if models is not None:
    print("\n🎉 All machine learning models trained successfully!")
else:
    print("\n❌ Model training failed")


🏗️ Training Content-Based Filtering Model...
✅ Content-Based model trained successfully!

🏗️ Training Collaborative Filtering Model...
✅ Collaborative Filtering model trained successfully!

🏗️ Training Hybrid Model...
✅ Hybrid model trained successfully!

🏗️ Training K-Nearest Neighbors Model...
✅ KNN model trained successfully!

🏗️ Training Clustering Model...
✅ Clustering model trained successfully!

🎉 All machine learning models trained successfully!


# 6. Recommendation Functions

In [21]:
def get_recommendations(product_id, model_type='hybrid', top_n=5):
    try:
        # Find the index of the product
        # First check if 'id' exists, otherwise try to find a suitable identifier column
        id_col = None
        if 'id' in cleaned_df.columns:
            id_col = 'id'
        elif 'product_id' in cleaned_df.columns:
            id_col = 'product_id'
        elif 'item_id' in cleaned_df.columns:
            id_col = 'item_id'
        else:
            # Fallback to index-based lookup if no ID column is found
            try:
                # Attempt to use product_id as an index position
                idx = int(product_id)
                if 0 <= idx < len(cleaned_df):
                    # Create a dummy DataFrame with a single row for consistency
                    product_match = cleaned_df.iloc[[idx]]
                    return process_recommendations(idx, product_match, model_type, top_n)
                else:
                    raise ValueError(f"Index {product_id} is out of bounds for dataset")
            except ValueError:
                raise ValueError("No suitable ID column found and product_id is not a valid index")

        # If we found an ID column, use it to find the product
        product_match = cleaned_df[cleaned_df[id_col] == product_id]

        if len(product_match) == 0:
            raise ValueError(f"Product with {id_col} '{product_id}' not found in database")

        idx = product_match.index[0]
        return process_recommendations(idx, product_match, model_type, top_n)

    except Exception as e:
        print(f"❌ Error generating recommendations: {str(e)}")
        return None

def process_recommendations(idx, product_match, model_type, top_n):
    # Get recommendations based on model type
    if model_type == 'content':
        sim_scores = list(enumerate(models['content_similarity'][idx]))
    elif model_type == 'collab':
        sim_scores = list(enumerate(models['collab_similarity'][idx]))
    elif model_type == 'hybrid':
        sim_scores = list(enumerate(models['hybrid_similarity'][idx]))
    elif model_type == 'knn':
        distances, indices = models['knn_model'].kneighbors(features[idx:idx+1])
        recommendations = cleaned_df.iloc[indices[0][1:top_n+1]]
        return display_recommendations(product_match.iloc[0], recommendations, model_type)
    elif model_type == 'cluster':
        cluster = cleaned_df.loc[idx, 'cluster']
        cluster_products = cleaned_df[cleaned_df['cluster'] == cluster]
        if len(cluster_products[cluster_products.index != idx]) > 0:
            recommendations = cluster_products[cluster_products.index != idx].sample(
                min(top_n, len(cluster_products[cluster_products.index != idx]))
            )
            return display_recommendations(product_match.iloc[0], recommendations, model_type)
        else:
            # No other products in the cluster, return some random products
            print("⚠️ No other products in the same cluster. Returning random recommendations.")
            recommendations = cleaned_df[cleaned_df.index != idx].sample(min(top_n, len(cleaned_df)-1))
            return display_recommendations(product_match.iloc[0], recommendations, model_type)
    else:
        raise ValueError("Invalid model type. Choose from 'content', 'collab', 'hybrid', 'knn', or 'cluster'")

    # Sort products by similarity score
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top N most similar products
    top_indices = [i[0] for i in sim_scores[1:top_n+1]]
    recommendations = cleaned_df.iloc[top_indices]

    return display_recommendations(product_match.iloc[0], recommendations, model_type)

def display_recommendations(query_product, recommendations, model_type):
    # Get product name or a reasonable fallback
    product_name = 'Unknown Product'
    for name_col in ['name', 'title', 'product_name', 'item_name']:
        if name_col in query_product and not pd.isna(query_product[name_col]):
            product_name = query_product[name_col]
            break

    print(f"✨ Recommendations for product: {product_name}")
    print(f"🔍 Model used: {model_type}")
    print("\nTop Recommendations:")

    display_cols = []
    for col in ['id', 'name', 'title', 'category', 'rating', 'price']:
        if col in recommendations.columns:
            display_cols.append(col)

    if len(display_cols) > 0:
        display(recommendations[display_cols])
    else:
        display(recommendations.iloc[:, :min(5, recommendations.shape[1])])

    return recommendations

# Test the recommendation function
try:
    # Get a sample product ID safely
    sample_row = cleaned_df.sample(1).iloc[0]

    # Determine which ID to use
    if 'id' in sample_row:
        sample_product = sample_row['id']
    elif 'product_id' in sample_row:
        sample_product = sample_row['product_id']
    elif 'item_id' in sample_row:
        sample_product = sample_row['item_id']
    else:
        # Use the index as a fallback
        sample_product = sample_row.name

    print(f"\nTesting recommendation system with product ID: {sample_product}")

    print("\nTesting Content-Based Filtering:")
    get_recommendations(sample_product, model_type='content')

    print("\nTesting Collaborative Filtering:")
    get_recommendations(sample_product, model_type='collab')

    print("\nTesting Hybrid Model:")
    get_recommendations(sample_product, model_type='hybrid')

    print("\nTesting KNN Model:")
    get_recommendations(sample_product, model_type='knn')

    print("\nTesting Clustering Model:")
    get_recommendations(sample_product, model_type='cluster')

    print("\n✅ Recommendation function working successfully!")
except Exception as e:
    print(f"❌ Error testing recommendation function: {str(e)}")


Testing recommendation system with product ID: 21497770

Testing Content-Based Filtering:
✨ Recommendations for product: Puma
🔍 Model used: content

Top Recommendations:


Unnamed: 0,title,category,rating
1,Tommy Hilfiger,backpacks,4.5
2,Lavie,backpacks,4.4
3,F Gear,backpacks,4.4
4,MYTRIDENT,bath-robe,4.7
5,H&M,bathroom-accessories,4.5



Testing Collaborative Filtering:
✨ Recommendations for product: Puma
🔍 Model used: collab

Top Recommendations:


Unnamed: 0,title,category,rating
53,Bata,casual-shoes,4.4
54,Prolific,casual-shoes,3.9
55,U.S. Polo Assn.,casual-shoes,4.1
56,HIGHLANDER,casual-shoes,3.4
57,San Frissco,casual-shoes,4.3



Testing Hybrid Model:
✨ Recommendations for product: Puma
🔍 Model used: hybrid

Top Recommendations:


Unnamed: 0,title,category,rating
53,Bata,casual-shoes,4.4
54,Prolific,casual-shoes,3.9
55,U.S. Polo Assn.,casual-shoes,4.1
56,HIGHLANDER,casual-shoes,3.4
57,San Frissco,casual-shoes,4.3



Testing KNN Model:
✨ Recommendations for product: Puma
🔍 Model used: knn

Top Recommendations:


Unnamed: 0,title,category,rating
583,Purple Feather,skirts,4.2
584,Mayra,skirts,3.8
585,Berrylush,skirts,4.3
586,Lancer,sports-shoes,4.0



Testing Clustering Model:
✨ Recommendations for product: Puma
🔍 Model used: cluster

Top Recommendations:


Unnamed: 0,title,category,rating
580,Allen Solly,shorts,4.6
415,U.S. Polo Assn.,lounge-tshirts,4.2
615,U.S. Polo Assn.,sports-shoes,4.0
596,ADIDAS,sports-shoes,4.1
44,Zivame,briefs,4.6



✅ Recommendation function working successfully!


# 7.Export data

In [22]:
import os

def export_trained_data(df, models):
    try:
        # Create directory if it doesn't exist
        save_dir = '/content/drive/MyDrive/AI-Recommendation_Project'
        os.makedirs(save_dir, exist_ok=True)

        # 1. Save cleaned product data
        product_data_path = os.path.join(save_dir, 'cleaned_products.csv')
        df.to_csv(product_data_path, index=False)

        # 2. Save similarity matrices
        pd.DataFrame(models['content_similarity']).to_csv(
            os.path.join(save_dir, 'content_similarity.csv'), index=False)
        pd.DataFrame(models['collab_similarity']).to_csv(
            os.path.join(save_dir, 'collab_similarity.csv'), index=False)
        pd.DataFrame(models['hybrid_similarity']).to_csv(
            os.path.join(save_dir, 'hybrid_similarity.csv'), index=False)

        # 3. Save cluster assignments
        df['cluster'].to_csv(
            os.path.join(save_dir, 'product_clusters.csv'), index=False)

        # Print export report
        print("\n📦 Exported Files:")
        print(f"- Cleaned products: cleaned_products.csv")
        print("- Similarity matrices:")
        print("  - content_similarity.csv")
        print("  - collab_similarity.csv")
        print("  - hybrid_similarity.csv")
        print("- Cluster assignments: product_clusters.csv")

        print("\n✅ All data exported successfully to CSV files!")

    except Exception as e:
        print(f"❌ Error exporting data: {str(e)}")

# Export all data
export_trained_data(cleaned_df, models)


📦 Exported Files:
- Cleaned products: cleaned_products.csv
- Similarity matrices:
  - content_similarity.csv
  - collab_similarity.csv
  - hybrid_similarity.csv
- Cluster assignments: product_clusters.csv

✅ All data exported successfully to CSV files!
