In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import csv

In [None]:
transaction = pd.read_csv("transactions.csv", delimiter="|")
items = pd.read_csv("items.csv", delimiter="|")

In [None]:
transaction.head()

In [None]:
print(transaction.shape)
print(items.shape)

In [None]:
data = pd.merge(items, transaction, on=["itemID"], how="outer")

In [None]:
data.shape

In [None]:
data.info()

In [None]:
# choosing Target variable
#print(inner_merged_total['basket'].value_counts())
#print(inner_merged_total['click'].value_counts())
print(data['order'].value_counts())

In [None]:
##data preprocessing 
##generate target variable interest
data["Interest"] = data["click"] + data["basket"]+ data["order"]


## Aggregate (the idea is to compute how often each item ID occurs and multiply that frequency by Interest to get the total interest of each item)

In [None]:
data.head()
#print(inner_merged_total['Interest'].value_counts())

## Group by item ID, and then count the item ID, and then interest * count of itemID = total interest column

In [None]:
##generate frequency
test = data.value_counts(['itemID']).reset_index(name='Frequency')
print(test.head())

In [None]:
#Merging frequency 
book_data = pd.merge(data, test, on=["itemID"])
book_data.head()

In [None]:
# remove duplicate rows
print(book_data.shape)
# book_data.drop_duplicates(subset ="itemID", keep='first' ,inplace=True)
# print(book_data.shape)

In [None]:
book_data["itemID"].value_counts()

In [None]:
book_data.head()

In [None]:
book_rating= book_data.copy()
book_rating.drop(['sessionID', 'click','author','publisher'], axis=1)

In [None]:
## data cleaning
# Replace missing basket/order counts with 0.
values = {'basket': 0, 'order': 0}
# DataFrame.fillna returns a new frame unless inplace=True, so the result has
# to be assigned back; otherwise the NaNs stay in book_rating and the later
# `!= 0` checks on basket/order treat a missing value as a non-zero one.
book_rating = book_rating.fillna(value=values)
book_rating

In [None]:
import numpy as np
book_rating['log_frequency'] = np.log(book_data['Frequency'])

In [None]:
print(book_rating['log_frequency'].value_counts())

In [None]:
book_rating['log_frequency'].describe()

In [None]:
# Bucket log_frequency into five quantile bands; labels run from 'E' for the
# lowest band to 'A' for the highest.
bin_labels_5 = ['E','D','C', 'B', 'A']
# NOTE(review): duplicates='drop' removes repeated quantile edges. If many rows
# share the same log_frequency, fewer than five bins may remain while five
# labels are still passed -- confirm this cannot raise on the real data.
book_rating['Grade'] = pd.qcut(book_rating['log_frequency'],
                              q=[0, .2, .4, .6, .8, 1],
                              labels=bin_labels_5, duplicates='drop')
# Bare expression: displays the frame as the cell output.
book_rating


In [None]:
book_rating = book_rating.drop(['sessionID', 'click','author','publisher','Frequency'], axis=1)

In [None]:
book_rating.drop_duplicates(subset ="title", keep='first' ,inplace=True)

In [None]:
book_rating = book_rating.reset_index()

In [None]:
book_rating

In [None]:
#creating combined target feature
def parseFeature(book_rating):
    """Build the comma-separated feature string for one row.

    Args:
        book_rating: Row-like mapping (for example the Series passed by
            ``DataFrame.apply(..., axis=1)``) providing "main topic",
            "subtopics" and "relevance_score".

    Returns:
        The three values converted with ``str`` and joined by commas, with
        every '[' and ']' character removed.
    """
    fields = ("main topic", "subtopics", "relevance_score")
    joined = ','.join(str(book_rating[field]) for field in fields)
    # Delete both bracket characters in a single pass.
    return joined.translate(str.maketrans('', '', '[]'))
    
    

In [None]:
#calculate relevance score

def relevance(book_rating):
    """Grade one row by the strongest interaction it records.

    Args:
        book_rating: Row-like mapping (for example the Series passed by
            ``DataFrame.apply(..., axis=1)``) providing 'order' and 'basket'.

    Returns:
        "A" when 'order' is non-zero, "B" when there is no order but
        'basket' is non-zero, otherwise "C". Missing values (NaN/None)
        count as zero.
    """
    def _has_activity(value):
        # NaN compares unequal to 0, so a bare `value != 0` would treat a
        # missing count as activity and grade the row "A".
        return pd.notna(value) and value != 0

    if _has_activity(book_rating['order']):
        return "A"
    if _has_activity(book_rating['basket']):
        return "B"
    return "C"
    

book_rating['relevance_score'] = book_rating.apply(relevance, axis = 1)


In [None]:
book_rating['parsedFeatureStr'] = book_rating.apply(parseFeature, axis = 1)

In [None]:
book_rating.head(40)

In [None]:
book_rating.drop_duplicates(subset ="itemID", keep='first' ,inplace=True)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# parsedFeatureStr is a comma-separated list of codes, so a token is any run
# of non-comma characters. The previous pattern r',' matched the commas
# themselves, which turned every document into a bag of "," tokens.
FEATURE_TOKEN_PATTERN = r'[^,]+'

# stop_words is None on purpose: the tokens are codes, not English prose, and
# after lower-casing the relevance grade "A" becomes "a", which the English
# stop list would remove.
c_vectorizer = CountVectorizer(min_df=1,
                            strip_accents='unicode', analyzer='word',
                            token_pattern=FEATURE_TOKEN_PATTERN,
                            ngram_range=(1,3), stop_words=None)

vectorizer = TfidfVectorizer(min_df=3, max_features=None,
                            strip_accents='unicode', analyzer='word',
                            token_pattern=FEATURE_TOKEN_PATTERN,
                            ngram_range=(1,3), stop_words=None)

# NOTE(review): n_features=2**1 hashes every token into only two buckets --
# confirm the intended size before this vectorizer is used.
h_vectorizer = HashingVectorizer(n_features=2**1,
                            strip_accents='unicode', analyzer='word',
                            token_pattern=FEATURE_TOKEN_PATTERN,
                            ngram_range=(1,3), stop_words=None)

In [None]:
parsedFeatureMatrix = vectorizer.fit_transform(book_rating['parsedFeatureStr'])

In [None]:
parsedFeatureMatrix

In [None]:
parsedFeatureMatrix.shape

In [None]:
from sklearn.metrics.pairwise import sigmoid_kernel 
from sklearn.metrics.pairwise import cosine_similarity 

sig = sigmoid_kernel(parsedFeatureMatrix, parsedFeatureMatrix)
# cos = cosine_similarity(parsedFeatureMatrix, parsedFeatureMatrix)

In [None]:
sig[0]

In [None]:
indices = pd.Series(book_rating.index, index = book_rating['title'])

In [None]:
indices_df = pd.Series(book_rating.index, index = book_rating['title']).to_frame()

In [None]:
indices_itemID_mapping = pd.merge(indices_df, book_rating, on=["title"])

In [None]:
indices_df

In [None]:
indices_itemID_mapping = indices_itemID_mapping.drop(['index'], axis=1)

In [None]:
indices_itemID_mapping

In [None]:
def recommender(title, sig=sig, top_n=5):
    """Return the itemIDs of the books most similar to ``title``.

    Args:
        title: Title to look up in the module-level ``indices`` Series.
        sig: Similarity matrix; row ``indices[title]`` holds that book's
            score against every book. Defaults to the ``sig`` computed
            earlier in the notebook.
        top_n: Number of recommendations to return. Defaults to 5, the
            size that was previously fixed.

    Returns:
        The ``book_rating['itemID']`` entries of the recommended rows,
        most similar first.
    """
    index = indices[title]
    scores = np.asarray(sig[index])

    # Rank all rows by descending score. The stable sort keeps tied rows in
    # their original order, as sorted(..., reverse=True) did.
    ranked = np.argsort(-scores, kind="stable")

    # Remove the queried book explicitly. Skipping position 0 is only correct
    # when the book ranks first, which fails when an earlier row ties with it.
    book_indices = [int(pos) for pos in ranked if pos != index][:top_n]

    return book_rating['itemID'].iloc[book_indices]
    
    
    

In [None]:
recommender_df = recommender('Mein großes Schablonen-Buch - Wilde Tiere').to_frame()

In [None]:
recommender_df

In [None]:
t = pd.merge(recommender_df, book_rating, on=["itemID"])

In [None]:
t

## Populate evaluation DF


In [None]:
## Use Evaluation dataset to recommend 5 itemID

evaluation_df = pd.read_csv("evaluation.csv", delimiter="|")

evaluation_df

In [None]:
title_df = indices_itemID_mapping.drop(['basket','order','main topic','subtopics','log_frequency','Grade', 'relevance_score', 0, 'parsedFeatureStr'], axis=1)
title_df

In [None]:
submission_df = pd.merge(evaluation_df, items, on=["itemID"]) 
submission_df = submission_df.drop(['author','publisher','main topic','subtopics'], axis=1)
submission_df

In [None]:
def getSigScoreArray(num, title, sig=sig):
    """Return the ``num`` highest similarity scores for ``title``.

    Args:
        num: How many scores to return.
        title: Title to look up in the module-level ``indices`` Series.
        sig: Similarity matrix; row ``indices[title]`` holds that book's
            score against every book. Defaults to the ``sig`` computed
            earlier in the notebook.

    Returns:
        A list of scores in descending order, excluding the score of the
        book against itself.
    """
    index = indices[title]
    scores = np.asarray(sig[index])

    # Stable descending ranking: tied rows keep their original order, as
    # sorted(..., reverse=True) did.
    ranked = np.argsort(-scores, kind="stable")

    # Exclude the queried book by position instead of assuming it is ranked
    # first; a tie with an earlier row would otherwise keep its own score in
    # the result and drop a real neighbour.
    neighbours = [pos for pos in ranked if pos != index][:num]

    return [scores[pos] for pos in neighbours]

In [None]:
sig_score_array = getSigScoreArray(5, "Princess Poppy: The Big Mix Up")

In [None]:
##Create new column where order != 0 but basket and/or click is 0

def identify(book_data):
    """Flag a row that records an order without a basket add and/or a click.

    Args:
        book_data: Row-like mapping (for example the Series passed by
            ``DataFrame.apply(..., axis=1)``) providing 'order', 'basket'
            and 'click'.

    Returns:
        'Found' when 'order' is non-zero and 'basket' or 'click' is zero,
        otherwise 'Not found'. A missing 'order' (NaN/None) counts as no
        order.
    """
    order = book_data['order']
    # NaN != 0 is True, so a missing order must be rejected explicitly.
    if pd.isna(order) or order == 0:
        return 'Not found'

    # `basket | click == 0` parsed as `(basket | click) == 0`, which is true
    # only when both are zero and raises TypeError on float values. Compare
    # each column with 0 instead.
    if book_data['basket'] == 0 or book_data['click'] == 0:
        return 'Found'
    return 'Not found'

book_data['test'] = book_data.apply(identify, axis = 1)

book_data.head(10)

In [None]:
#Print how many counts for order column

print(book_data['order'].value_counts())


In [None]:
## Print how many counts for test column

print(book_data['test'].value_counts())


In [None]:
## Print column where test == found

test_column = book_data.loc[book_data['test'] == 'Found']

test_column.head(10)


In [None]:
##clone dataframe
sea_born_df = book_data.copy(deep=True)

In [None]:
sea_born_df.shape

In [None]:
## dropping unnessesary columns for pairplot
sea_born_df = sea_born_df.filter(items = ['click', 'basket', 'order'])


In [None]:
## Make pair plot using seaborn
import seaborn as sns


# sns.set(style="ticks", color_codes=True)
sns.pairplot(sea_born_df)


import matplotlib.pyplot as plt
plt.show()



In [None]:
#Create histogram -> edited

# No earlier cell creates 'Total_Interest'. Build it here the way the earlier
# notes describe it ("interest * count of itemID = total interest column") so
# this cell and the later drop of the column work on a fresh kernel.
if 'Total_Interest' not in book_data.columns:
    book_data['Total_Interest'] = book_data['Interest'] * book_data['Frequency']

# x holds how many rows share each distinct Total_Interest value; the
# histogram shows the distribution of those counts.
x = book_data['Total_Interest'].value_counts()

plt.hist(x, bins = 60)
plt.show()

In [None]:
## remove interest, total interest, click, basket, order from dataframe

# New frame without the interaction and interest columns; book_data itself is
# left unchanged.
latest_data = book_data.drop(
    columns=['Interest', 'Total_Interest', 'click', 'basket', 'order'],
)
latest_data


In [None]:
## EDA main topic vs subtopic vs title and use only frequency


In [None]:
#test on printing rows with one main topic

latest_data.loc[latest_data['main topic'] == 'FM'].head(5)


In [None]:
# Show title, main topic & subtopics with frequency value in mean
print(latest_data.groupby('title')['Frequency'].mean().sort_values(ascending=False).head(), '\n')
print(latest_data.groupby('main topic')['Frequency'].mean().sort_values(ascending=False).head(), '\n')
print(latest_data.groupby('subtopics')['Frequency'].mean().sort_values(ascending=False).head())


In [None]:
# Show title, main topic & subtopics with frequency value in counts
print(latest_data.groupby('title')['Frequency'].count().sort_values(ascending=False).head(), '\n')
print(latest_data.groupby('main topic')['Frequency'].count().sort_values(ascending=False).head(), '\n')
print(latest_data.groupby('subtopics')['Frequency'].count().sort_values(ascending=False).head())

In [None]:
# Per-title summary of Frequency: the mean ('Frequency') and the number of
# non-null rows ('Counts'), ordered by descending mean.
title = (
    latest_data.groupby('title')['Frequency']
    .agg(Frequency='mean', Counts='count')
    .sort_values('Frequency', ascending=False)
)
title


In [None]:
print(title['Counts'].value_counts(), '\n')    #Counts
print(title['Frequency'].value_counts(), '\n') #Mean 

In [None]:
import seaborn as sns
sns.set_style('white')
%matplotlib inline

In [None]:
plt.figure(figsize=(10,4))
title['Counts'].hist(bins=30)

plt.figure(figsize=(10,4))
title['Frequency'].hist(bins=80)

In [None]:
sns.jointplot(x='Counts', y='Frequency', data=title,  alpha=0.6)

In [None]:
# Per-main-topic summary of Frequency: the mean ('Frequency') and the number
# of non-null rows ('Counts'), ordered by descending mean.
main = (
    latest_data.groupby('main topic')['Frequency']
    .agg(Frequency='mean', Counts='count')
    .sort_values('Frequency', ascending=False)
)
main

In [None]:
#Show latest data
print(latest_data['main topic'].value_counts(), '\n')    #Counts

#Show the counts for duplicate main topic
#You may ignore this ...... Not sure if this is good or not
print('Number of duplicated rows:', latest_data.duplicated(['main topic'], keep='first').value_counts(), '\n')

#Show where total duplicated main topic more than 5
duplicate = latest_data['main topic'].value_counts()
print(duplicate.loc[duplicate > 5])

#Show where total duplicated main topic less than or equal to 5
duplicate.loc[duplicate <= 5]


In [None]:
print(main['Counts'].value_counts(), '\n')    #Counts
print(main['Frequency'].value_counts(), '\n') #Mean 

In [None]:
plt.figure(figsize=(10,4))
main['Counts'].hist(bins=30)

plt.figure(figsize=(10,4))
main['Frequency'].hist(bins=70)

In [None]:
sns.jointplot(x='Counts', y='Frequency', data=main,  alpha=0.6)

In [None]:
# Per-subtopics summary of Frequency: the mean ('Frequency') and the number
# of non-null rows ('Counts'), ordered by descending mean.
sub = (
    latest_data.groupby('subtopics')['Frequency']
    .agg(Frequency='mean', Counts='count')
    .sort_values('Frequency', ascending=False)
)
sub

In [None]:
print(sub['Counts'].value_counts(), '\n')    #Counts
print(sub['Frequency'].value_counts(), '\n') #Mean 

In [None]:
plt.figure(figsize=(10,4))
sub['Counts'].hist(bins=10)

plt.figure(figsize=(10,4))
sub['Frequency'].hist(bins=70)

In [None]:
sns.jointplot(x='Counts', y='Frequency', data=sub,  alpha=0.6)

## Extra EDA

In [None]:
transaction = pd.read_csv("transactions.csv", delimiter="|")
items = pd.read_csv("items.csv", delimiter="|")

In [None]:
data = pd.merge(transaction, items, on=["itemID"])

In [None]:
data["Interest"] = data["click"] + data["basket"]+ data["order"]

In [None]:
##generate frequency
test = data.value_counts(['itemID']).reset_index(name='Frequency')
print(test.head())

In [None]:
#Merging frequency 
book_data = pd.merge(data, test, on=["itemID"])
book_data.head()

In [None]:
## removing unnnessesarry column
del book_data["click"]
del book_data["basket"]
del book_data["order"]

In [None]:
book_data.head()

In [None]:
import seaborn as sns
sns.set_style('white')
%matplotlib inline

In [None]:
book_data.groupby('title')['Interest'].mean().sort_values(ascending=False).head()

In [None]:
book_data.groupby('title')['Interest'].count().sort_values(ascending=False).head()

In [None]:
# interests = pd.DataFrame(book_data.groupby('title')['Interest'].mean())
# interests = pd.DataFrame(book_data.groupby('title')['click'].mean())
# #interests = pd.DataFrame(book_data.groupby('title')['basket'].mean())
interests = pd.DataFrame(book_data.groupby('title')['Interest'].mean())
interests.head()

In [None]:
# Number of rows per title that carry an Interest value. The column is named
# 'number of interests' because the histogram, joint-plot and sort cells that
# follow read it under that name; it was previously stored as
# 'number of order', which made those cells raise KeyError.
interests['number of interests'] = book_data.groupby('title')['Interest'].count()
interests.head()

In [None]:
plt.figure(figsize=(10,4))
interests['number of interests'].hist(bins=70)

In [None]:
plt.figure(figsize=(10,4))
interests['Interest'].hist(bins=70)

In [None]:
sns.jointplot(x='Interest', y='number of interests', data=interests,  alpha=0.5)

In [None]:
# NOTE(review): `interests` is built from the 'Interest' column only, and
# 'click'/'basket'/'order' were deleted from book_data earlier in this section,
# so the 'order' and 'number of basket' columns requested here are not created
# anywhere in the visible notebook -- confirm the intended columns or remove.
sns.jointplot(x='order', y='number of basket', data=interests,  alpha=0.5)

In [None]:
# NOTE(review): `interests` is built from the 'Interest' column only, and
# 'basket' was deleted from book_data earlier in this section, so the 'basket'
# and 'number of basket' columns requested here are not created anywhere in
# the visible notebook -- confirm the intended columns or remove.
sns.jointplot(x='basket', y='number of basket', data=interests,  alpha=0.5)

In [None]:
# NOTE(review): `interests` is built from the 'Interest' column only, and
# 'click' was deleted from book_data earlier in this section, so the 'click'
# and 'number of click' columns requested here are not created anywhere in
# the visible notebook -- confirm the intended columns or remove.
sns.jointplot(x='click', y='number of click', data=interests,  alpha=0.5)

## Pivot Table

In [None]:
# Work on the first 100,000 rows of book_data. The subset is defined here
# because the cell that creates book_data_test comes after this one, so a
# top-to-bottom run would otherwise stop with a NameError.
book_data_test = book_data.head(100000)

# One row per sessionID and one column per title; each cell is the Interest
# value aggregated with pivot_table's default (mean).
book_pivot = book_data_test.pivot_table(index='sessionID', columns='title', values='Interest')
book_pivot.head()

In [None]:
book_data_test = book_data.head(100000)

In [None]:
interests.sort_values('number of interests', ascending=False).head(10)