# Import Libraries

In [99]:
import pandas as pd
import numpy as np
import num2words
import re
import contractions
import string

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, TweetTokenizer
from nltk.corpus import words, stopwords
from nltk.metrics.distance import jaccard_distance, edit_distance
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer

from sklearn.metrics.pairwise import *
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV

from flair.data import *
from flair.nn import *
from flair.embeddings import *
tagger = Classifier.load('sentiment-fast')
glove_embeddings = WordEmbeddings('glove')

import gensim
from gensim.models.fasttext import FastText
from gensim.models import Word2Vec

# Loading Data

In [8]:
fashion_df = pd.read_csv('/Users/nitanshjain/Documents/Projects/Recommeder_System/data/cleaned_data.csv')
fashion_df = fashion_df.drop(columns=['Unnamed: 0', 'p_id', 'price', 'img', 'ratingCount', 'avg_rating', 'p_attributes', 'Body Shape ID', 'Character', 'Number of Pockets', 'Multipack Set', 'Technology', 'Hood', 'Pocket'])
print(fashion_df.columns)
print(fashion_df.isna().sum())
# print(fashion_df.dropna(inplace=True))
print(fashion_df.shape)
fashion_df.head()

Index(['name', 'colour', 'brand', 'description', 'Body or Garment Size',
       'Neck', 'Occasion', 'Print or Pattern Type', 'Sleeve Length',
       'Sustainable', 'Wash Care', 'Weave Type', 'Fabric', 'Hemline', 'Length',
       'Pattern', 'Center Front Open', 'Closure', 'Surface Styling', 'Type',
       'Sport'],
      dtype='object')
name                        0
colour                      0
brand                       0
description                 0
Body or Garment Size        0
Neck                      179
Occasion                    0
Print or Pattern Type       8
Sleeve Length               0
Sustainable                 0
Wash Care                   0
Weave Type                  0
Fabric                      0
Hemline                     0
Length                      0
Pattern                     0
Center Front Open           0
Closure                  2091
Surface Styling          4414
Type                      554
Sport                    1818
dtype: int64
(14309, 21)


Unnamed: 0,name,colour,brand,description,Body or Garment Size,Neck,Occasion,Print or Pattern Type,Sleeve Length,Sustainable,...,Weave Type,Fabric,Hemline,Length,Pattern,Center Front Open,Closure,Surface Styling,Type,Sport
0,Khushal K Women Black Ethnic Motifs Printed Ku...,Black,Khushal K,Black printed Kurta with Palazzos with dupatta...,Garment Measurements in,Mandarin Collar,Festive,Ethnic Motifs,Three-Quarter Sleeves,Regular,...,Machine Weave,0,0,0,0,0,0,0,0,0
1,InWeave Women Orange Solid Kurta with Palazzos...,Orange,InWeave,Orange solid Kurta with Palazzos with dupattaK...,Garment Measurements in,Square Neck,Fusion,Solid,Sleeveless,Regular,...,Machine Weave,0,0,0,0,0,0,0,0,0
2,Anubhutee Women Navy Blue Ethnic Motifs Embroi...,Navy Blue,Anubhutee,Navy blue embroidered Kurta with Trousers with...,Garment Measurements in,Round Neck,Daily,Ethnic Motifs,Three-Quarter Sleeves,Regular,...,Machine Weave,0,0,0,0,0,0,0,0,0
3,Nayo Women Red Floral Printed Kurta With Trous...,Red,Nayo,Red printed kurta with trouser and dupattaKurt...,Garment Measurements in,Round Neck,Daily,Ethnic Motifs,Three-Quarter Sleeves,Regular,...,Machine Weave,0,0,0,0,0,0,0,0,0
4,AHIKA Women Black & Green Printed Straight Kurta,Black,AHIKA,"Black and green printed straight kurta, has a ...",Garment Measurements in,Round Neck,Daily,Ethnic Motifs,Three-Quarter Sleeves,0,...,Machine Weave,Cotton,Straight,Calf Length,Printed,0,0,0,0,0


# Text PreProcessing

In [36]:
def contractions_handling(text):
    text = contractions.fix(text)
    return text

def replacing_abbr(text, dictry):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    x = [dictry[word] if word in dictry.keys() else word for word in tokens]
    text = ' '.join(x)
    return text

def removing_punctuations(text):
    # text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.replace('-', ' ').replace(':', ' ')
    text = re.sub(r'[^\w\s]', ' ', text)
    text = text.lower()
    return text

def removing_stopwords(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    x = [word for word in tokens if word not in stopwords.words('english')]
    text = ' '.join(x)
    return text

def convert_num_words(text):
    re_results = re.findall('(\d+(st|nd|rd|th))', text)
    for entire_result, suffix in re_results:
        num = int(entire_result[:-len(suffix)])
        entire_result = " " + entire_result
        num_word = num2words.num2words(num, ordinal=True)
        text = text.replace(entire_result, " " + num_word)
    
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    tokens = [num2words.num2words(word) if word.isdigit() else word for word in tokens]
    text = ' '.join(tokens)
    
    return text

def removing_char_less_3(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    tokens = [word for word in tokens if len(word) >= 3]
    text = ' '.join(tokens)
    return text

def preprocess_text(text):
    text = convert_num_words(text)
    text = contractions_handling(text)
    # text = replacing_abbr(text, abbr_dict)
    text = removing_punctuations(text)
    text = removing_stopwords(text)
    text = removing_char_less_3(text)
    return text

In [37]:
fashion_df['processed_description'] = fashion_df['description'].apply(lambda x: preprocess_text(x))

# Generating Embeddings

In [38]:
recommend_df = fashion_df[['name', 'processed_description']].copy()
print(recommend_df.shape)
print(recommend_df.isna().sum())
recommend_df.head()

(14309, 2)
name                     0
processed_description    0
dtype: int64


Unnamed: 0,name,processed_description
0,Khushal K Women Black Ethnic Motifs Printed Ku...,black printed kurta palazzos dupatta kurta des...
1,InWeave Women Orange Solid Kurta with Palazzos...,orange solid kurta palazzos dupattakurta desig...
2,Anubhutee Women Navy Blue Ethnic Motifs Embroi...,navy blue embroidered kurta trousers dupatta k...
3,Nayo Women Red Floral Printed Kurta With Trous...,red printed kurta trouser dupattakurta design ...
4,AHIKA Women Black & Green Printed Straight Kurta,black green printed straight kurta nitched rou...


In [84]:
tfidf_obj = TfidfVectorizer()
description_tfidf = tfidf_obj.fit_transform(recommend_df.processed_description)
description_tfidf.shape

(14309, 10762)

In [76]:
cosine_sim = cosine_similarity(description_tfidf, description_tfidf)
print(type(cosine_sim))

<class 'numpy.ndarray'>


In [48]:
indices = pd.Series(recommend_df.name)

In [117]:
import scipy.sparse as sp

def recommend(title, recommend=recommend_df, df_tfidf=description_tfidf, tfidf_obj=tfidf_obj):
    
    title_series = pd.Series([title])
    indices = pd.Series(recommend['name'])
    indices = indices._append(title_series)
    
    
    title_embeddings = tfidf_obj.transform([title])
    title_embeddings = title_embeddings.toarray()
    df_tfidf = df_tfidf.toarray()
    
    new_description_tfidf = np.vstack((df_tfidf, title_embeddings))
    print(new_description_tfidf.shape)
    cosine_sim = cosine_similarity(new_description_tfidf, new_description_tfidf)
    
    recommended_movies = []
    idx = len(indices)-1 
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)   
    top_5_indices = list(score_series.iloc[1:6].index)  
    print(top_5_indices)
    print(score_series.iloc[1:6])
    for i in top_5_indices:   # to append the titles of top 10 similar movies to the recommended_movies list
        recommended_movies.append(list(recommend['name'])[i])
        
        
    return recommended_movies
    
    

print(recommend('yellow kurta for diwali'))


(14310, 10762)
[7324, 627, 9281, 9344, 9623]
7324    0.513636
627     0.456661
9281    0.454681
9344    0.452017
9623    0.450200
dtype: float64
['Soch Women Yellow Georgette Solid Round Neck Suit Set', 'Bhama Couture Women Mustard Yellow Yoke Design Kurta with Palazzos', 'Soch Yellow & Black Embroidered Art Silk Unstitched Dress Material', 'INDIAN HERITAGE Yellow & Red Printed Silk Crepe Unstitched Dress Material', 'Stylee LIFESTYLE Yellow & Pink Pure Silk Unstitched Dress Material']
