<a href="https://colab.research.google.com/github/RG2806/Product_Recommendation_Engine/blob/master/Question_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics.pairwise import cosine_similarity

In [0]:
dataset_path = '/content/drive/My Drive/question_2/ecommerce_sample_dataset.csv'
df=pd.read_csv(dataset_path)
df.head()

In [0]:
df.info()

In [0]:
df.duplicated().value_counts()

In [0]:
df['crawl_timestamp']=pd.to_datetime(df['crawl_timestamp'])
df['crawl_year']=df['crawl_timestamp'].apply(lambda x :x.year)
df['crawl_month']=df['crawl_timestamp'].apply(lambda x :x.month)

In [0]:
import matplotlib.pyplot as plt
plt.figure(figsize=(10,10))
print(df.groupby('crawl_month')['crawl_month'].count())
df.groupby('crawl_month')['crawl_month'].count().plot(kind='bar')
plt.title('Sales count by Month')
plt.xlabel('Month')
plt.ylabel('Count')
plt.show()

In [0]:
df['MainCategory'] = df['product_category_tree'].apply(lambda x: x.split('>>')[0][2:].strip())
plt.figure(figsize=(12,8))
df.groupby('MainCategory')['MainCategory'].count().sort_values(ascending=False)[:15]
df['MainCategory'].value_counts()[:15].sort_values(ascending=True).plot(kind='barh')

In [0]:
import re
import nltk
from nltk import pos_tag, word_tokenize, PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
wordnet_lemmatizr=WordNetLemmatizer()

In [0]:
from termcolor import colored

In [0]:
def clean_categories(dataframe):
    document=list(dataframe['product_category_tree'].values)
    categories=[re.findall(r'name=(.*?)}',sentence) for sentence in document]
    categories=[' '.join(word) for word in categories]
    return(categories)

In [0]:
def special_characters_cleaning(document):
    sentences=[]
    for sentence in document:
        sentences.append(re.sub('[^a-zA-Z0-9\n\.]',' ',str(sentence)))
    return(sentences)

In [0]:
def clean_product_type(dataframe):
    document=list(dataframe['product_category_tree'])
    product_types=[re.findall(r'\"(.*?)\"', sentence) for sentence in document]
    product_types=[' '.join(listed_items) for listed_items in product_types]
    return(product_types)

In [0]:
def categories_extraction(dataframe):
    categories=[word for item in dataframe['categories'] for word in item.split()]
    categories=list(set(categories))
    return(categories)

In [0]:
def lemmetize_document(document):
    sentences=[]
    for sentence in document:
        word=[wordnet_lemmatizer.lemmatize(word) for word in word_tokenize(sentence)]
        sentences.append(' '.join(words))
    return(sentences)

In [0]:
def save_categories(dataframe):
    pass

In [0]:
def pre_processing_document(document):
    document=special_characters_cleaning(document)
    document=lemmetize_document(document)
    document=[sentence.title() for sentence in document]
    return(document)

In [0]:
def extract_categories_from_description(document,categories):
    extracted_categories=[]
    for sentence in document:
        extracted_categories.append(' '.join(set(categories).intersection(set(word_tokenize(sentence)))))
        return(extracted_categories)

In [0]:
lemmetize= WordNetLemmatizer()
stemmer=PorterStemmer()

In [0]:
df["products"]=clean_product_type(df)
df["categories"]=clean_categories(df)
categories= list(set(df['product_category_tree'].values))
categories= [item.split() for item in df['product_category_tree']]
categories= [word.lower() for listed_item in categories for word in listed_item]
categories= list(set(categories))

df

In [0]:
df['detailed_description']= df['products'].apply(lambda x: x.replace('>>',''))+" "+ df['brand']+" " +df['product_name']
df['detailed_description']=df['detailed_description'].fillna('')
df2=df[['product_name','detailed_description']].copy()
df2.drop_duplicates('product_name', inplace=True)
df2['new_col'] = range(0, len(df2) )
df2
df2.to_csv (r'/content/drive/My Drive/question_2/export_dataframe.csv', index = False, header=True)


In [0]:

cv = CountVectorizer()

count_matrix = cv.fit_transform(df2['detailed_description'])

cosine_sim = cosine_similarity(count_matrix)
def get_title_from_index(index):
    return df2[df2.new_col == index]["product_name"].values[0]
def get_index_from_title(product_name):
    return df2[df2.product_name == product_name]['new_col'].values[0]

movie_user_likes = "Alisha Solid Women's Cycling Shorts"
movie_index = get_index_from_title(movie_user_likes)
similar_movies = list(enumerate(cosine_sim[movie_index]))

sorted_similar_movies = sorted(similar_movies,key=lambda x:x[1],reverse=True)[1:]

j=0
print("Top 5 similar movies to "+movie_user_likes+" are:\n")
for element in sorted_similar_movies:
    print(get_title_from_index(element[0]))
    j=j+1
    if j>5:
        break

Top 5 similar movies to Alisha Solid Women's Cycling Shorts are:

Mynte Solid Women's Cycling Shorts, Gym Shorts, Swim Shorts
Ashdan Solid Women's Basic Shorts
Nordlich Printed Women's Night Shorts
Lavennder Printed Women's Basic Shorts
Vero Moda Solid Women's Basic Shorts
Lavennder Floral Print Women's Basic Shorts


In [0]:
document= list(df['detailed_description'].values)
document= special_characters_cleaning(document)

In [0]:
tfidf= TfidfVectorizer(stop_words= 'english', vocabulary= categories)
data= tfidf.fit_transform(document)

In [0]:
from sklearn.neighbors import NearestNeighbors
nn= NearestNeighbors(algorithm= 'brute', n_neighbors= 20).fit(data)

In [0]:
text= df[df['brand']== "FabHomeDecor"]['detailed_description'].values
result = nn.kneighbors(tfidf.transform(text))
for col in tfidf.transform(text).nonzero()[1]:
    print(tfidf.get_feature_names()[col], ' - ', tfidf.transform(text)[0, col])

In [0]:
text

In [0]:
result

In [0]:
for item in result[1][0]:
    print(colored(df.iloc[item]['product_category_tree'].upper(), 'blue'), ':', document[item])

In [0]:
df