In [None]:
from __future__ import print_function, division
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sklearn
import seaborn as sns
import json
from pandas.io.json import json_normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
%matplotlib inline

## Read in preprocessed data

In [None]:
# Parse the tab-separated training set once and split it into features and
# labels (the file used to be read twice, once per piece).
train_df = pd.read_csv('train_dataset.csv', sep = '\t', index_col = 0)

# Feature matrix: every column except the `cuisine` label.
X_train = train_df.drop('cuisine', axis = 1)

# Labels stay a one-column DataFrame sharing X_train's index; the plotting
# cells build their masks with `(cuisines == name)['cuisine']`.
cuisines = train_df[['cuisine']].copy()

## PCA

In [None]:
# Show the distinct cuisine labels found in `cuisines` (sorted by np.unique).
np.unique(cuisines)

In [None]:
# Palette for the scatter plots below: the i-th distinct cuisine is drawn
# with colors[i].
colors = [
    'blue', 'green', 'red',
    'cyan', 'magenta', 'yellow',
    'navy', 'turquoise', 'darkorange',
    'black', 'coral', 'fuchsia',
    'lightblue', 'grey', 'lavender',
    'plum', 'tan', 'teal',
    'violet', 'salmon', 'darkgreen',
]

In [None]:
# Project the recipes onto their first five principal components.
pca = PCA(n_components = 5)
X_pca = pca.fit_transform(X_train)

fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)
ax.set_title("Recipes by origin")

# One scatter layer per cuisine so each label gets its own color.
# NOTE(review): columns 1 and 2 (the 2nd and 3rd components) are plotted,
# not 0 and 1 — confirm this is intentional.
for color_idx, cuisine_name in enumerate(np.unique(cuisines)):
    in_cuisine = (cuisines['cuisine'] == cuisine_name).values
    ax.scatter(X_pca[in_cuisine, 1], X_pca[in_cuisine, 2], c = colors[color_idx])

plt.show()

## t-SNE

In [None]:
# Embed the recipes in 2-D with t-SNE using the default perplexity and
# early exaggeration. t-SNE is stochastic, so the seed is pinned to make the
# embedding reproducible across kernel restarts (42 matches the seed the
# TruncatedSVD cell uses).
tsne = TSNE(n_components = 2, random_state = 42)
X_tsne = tsne.fit_transform(X_train)

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)
ax.set_title("Recipes by origin")

# Draw the default t-SNE embedding, one scatter layer (and color) per cuisine.
for color_idx, cuisine_name in enumerate(np.unique(cuisines)):
    in_cuisine = (cuisines['cuisine'] == cuisine_name).values
    ax.scatter(X_tsne[in_cuisine, 0], X_tsne[in_cuisine, 1], c = colors[color_idx])

plt.show()

In [None]:
# Variant 2: stronger early exaggeration (30). Seed pinned for reproducibility.
tsne2 = TSNE(n_components = 2, early_exaggeration = 30, random_state = 42)
# Fit the estimator configured on the line above. The previous code called
# `tsne.fit_transform`, which re-ran the default model and ignored these
# settings.
X_tsne2 = tsne2.fit_transform(X_train)

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)
ax.set_title("Recipes by origin")

# Draw the X_tsne2 embedding, one scatter layer (and color) per cuisine.
for color_idx, cuisine_name in enumerate(np.unique(cuisines)):
    in_cuisine = (cuisines['cuisine'] == cuisine_name).values
    ax.scatter(X_tsne2[in_cuisine, 0], X_tsne2[in_cuisine, 1], c = colors[color_idx])

plt.show()

In [None]:
# Variant 3: early exaggeration 30 combined with perplexity 50.
# Seed pinned for reproducibility.
tsne3 = TSNE(n_components = 2, early_exaggeration = 30, perplexity = 50, random_state = 42)
# Fit the estimator configured on the line above. The previous code called
# `tsne.fit_transform`, which re-ran the default model and ignored these
# settings.
X_tsne3 = tsne3.fit_transform(X_train)

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)
ax.set_title("Recipes by origin")

# Draw the X_tsne3 embedding, one scatter layer (and color) per cuisine.
for color_idx, cuisine_name in enumerate(np.unique(cuisines)):
    in_cuisine = (cuisines['cuisine'] == cuisine_name).values
    ax.scatter(X_tsne3[in_cuisine, 0], X_tsne3[in_cuisine, 1], c = colors[color_idx])

plt.show()

In [None]:
# Variant 4: perplexity 50 with the default early exaggeration.
# Seed pinned for reproducibility.
tsne4 = TSNE(n_components = 2, perplexity = 50, random_state = 42)
# Fit the estimator configured on the line above. The previous code called
# `tsne.fit_transform`, which re-ran the default model and ignored these
# settings.
X_tsne4 = tsne4.fit_transform(X_train)

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)
ax.set_title("Recipes by origin")

# Draw the X_tsne4 embedding, one scatter layer (and color) per cuisine.
for color_idx, cuisine_name in enumerate(np.unique(cuisines)):
    in_cuisine = (cuisines['cuisine'] == cuisine_name).values
    ax.scatter(X_tsne4[in_cuisine, 0], X_tsne4[in_cuisine, 1], c = colors[color_idx])

plt.show()

In [None]:
# Variant 5: low perplexity (15) with early exaggeration 30.
# The keyword was garbled as `early_exaggeration=ly_exaggeration = 30`, which
# is a syntax error; restored to a single keyword argument.
tsne5 = TSNE(n_components = 2, perplexity = 15, early_exaggeration = 30, random_state = 42)
# Fit the estimator configured on the line above (the previous code called
# `tsne.fit_transform`, i.e. the default model).
X_tsne5 = tsne5.fit_transform(X_train)

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)
ax.set_title("Recipes by origin")

# Draw the X_tsne5 embedding, one scatter layer (and color) per cuisine.
for color_idx, cuisine_name in enumerate(np.unique(cuisines)):
    in_cuisine = (cuisines['cuisine'] == cuisine_name).values
    ax.scatter(X_tsne5[in_cuisine, 0], X_tsne5[in_cuisine, 1], c = colors[color_idx])

plt.show()

## Original code

In [None]:
# Load the training set once (it used to be parsed twice) and rebuild X so
# that `cuisine` ends up as its LAST column.
recipes = pd.read_csv('train_dataset.csv', sep = '\t', index_col = 0)

# Labels as a one-column DataFrame sharing the recipe index.
cuisines = recipes[['cuisine']].copy()

X = recipes.drop('cuisine', axis = 1)

# Assign a 1-D array: np.array(<one-column DataFrame>) is 2-D with shape
# (n, 1), which is not what a single-column assignment should be given.
X['cuisine'] = cuisines['cuisine'].values

In [None]:
# Preview the first rows of X (feature columns followed by `cuisine`).
X.head()

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(9, 7)

# Count the recipes of each cuisine and order them from most to least frequent.
per_cuisine = X.groupby('cuisine')['cuisine'].count()
cuisine_counts = pd.DataFrame({
    'cuisine': per_cuisine.index,
    'count': per_cuisine.values,
}).sort_values(['count'], ascending=False)

# Seaborn theme: enlarged fonts on a white grid.
sns.set(font_scale=2)
sns.set_style("whitegrid")

ax.set_title("Cuisine popularity distribution")

# Horizontal bars: one per cuisine, bar length = number of recipes.
bar_options = dict(x='count', y='cuisine', orient='h', label='big', color='lightseagreen')
_ = sns.barplot(data=cuisine_counts, ax=ax, **bar_options)
fig.tight_layout()
# Export is disabled; enable to write the figure to disk:
#fig.savefig('cuisine_popularity.pdf')

In [None]:
#6867 ingredients in total
# Sum the ingredient columns of every recipe. The columns are selected by
# name rather than with `iloc[:, :-2]`: X ends with only ONE non-ingredient
# column (`cuisine`) the first time this cell runs, so the positional slice
# also dropped the last ingredient. Dropping by name is also safe to re-run
# after `number_of_ingredients` / `origin` have been appended.
ingredient_columns = X.drop(['cuisine', 'number_of_ingredients', 'origin'], axis=1, errors='ignore')
X["number_of_ingredients"] = ingredient_columns.sum(axis=1)

In [None]:
# Display the (rows, columns) dimensions of X.
X.shape # 37340 total examples (including unknown)

In [None]:
# Distribution of the per-recipe ingredient count (histogram with a KDE curve),
# written to recipes_length.pdf.
fig, ax = plt.subplots()
fig.set_size_inches(9, 7)
ax.set_title("Recipe length distribution")

recipe_lengths = X['number_of_ingredients']
_ = sns.distplot(recipe_lengths, kde=True, label="big", ax=ax)

fig.savefig('recipes_length.pdf')

In [None]:
# 2-component PCA of the recipe x ingredient matrix.
# The feature columns are selected by name: at this point X ends with only
# `cuisine` and `number_of_ingredients`, so the old `iloc[:, :-3]` slice also
# threw away the last ingredient column.
pca   = PCA(n_components=2)
X_pca = pca.fit_transform(
    X.drop(['cuisine', 'number_of_ingredients', 'origin'], axis=1, errors='ignore')
)

In [None]:
# Report the fraction of total variance captured by each fitted component.
variance_ratios = str(pca.explained_variance_ratio_)
print('explained variance ratio (first two components): %s' % variance_ratios)

In [None]:
# Palette for the per-cuisine scatter plot: one entry per distinct cuisine,
# consumed positionally as colors[i].
colors = [
    'blue', 'green', 'red',
    'cyan', 'magenta', 'yellow',
    'navy', 'turquoise', 'darkorange',
    'black', 'coral', 'fuchsia',
    'lightblue', 'grey', 'lavender',
    'plum', 'tan', 'teal',
    'violet', 'salmon', 'darkgreen',
]

In [None]:
# Rebind `cuisines` to a plain array of labels, then print two numbers to
# compare by eye: how many distinct cuisines exist and how many colors are
# available.
cuisines = X['cuisine'].values
for size in (len(set(cuisines)), len(colors)):
    print(size)

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)

ax.set_title("First 2 PCA components")
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")

# np.unique yields the labels in sorted order, so a given cuisine gets the
# same color on every run (iterating a set gives an arbitrary order). This
# also matches how the earlier cells enumerate cuisines.
for i, cur_cuisine in enumerate(np.unique(cuisines)):
    in_cuisine = cuisines == cur_cuisine
    ax.scatter(
        X_pca[in_cuisine, 0],
        X_pca[in_cuisine, 1],
        # Wrap around instead of raising if there are more labels than colors.
        c=colors[i % len(colors)],
        label=str(cur_cuisine),
    )

# Without a legend the colors cannot be mapped back to cuisines.
ax.legend(loc='best', fontsize='small', ncol=2)

fig.savefig('pca.pdf')

In [None]:
# Print the distinct cuisine labels, as a reference for grouping them by
# location below.
print(set(cuisines))

In [None]:
# Coarse regional grouping of the cuisine labels.
west = ['french', 'irish', 'british', 'russian', 'cajun_creole']  # Europe and US
east = ['chinese', 'vietnamese', 'japanese', 'filipino', 'korean', 'thai']
south = ['southern_us', 'jamaican', 'mexican', 'brazilian']
mediterranean = ['greek', 'spanish', 'italian', 'moroccan']
indian = ['indian']


def label_origin(row):
    """Return the region label for a recipe row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must support ``row['cuisine']``.

    Returns
    -------
    str or None
        One of 'west', 'east', 'mediterranean', 'indian', 'south'. None when
        the cuisine appears in none of the region lists.
    """
    cuisine = row['cuisine']
    # Same lookup order as the original if-chain.
    regions = (
        ('west', west),
        ('east', east),
        ('mediterranean', mediterranean),
        ('indian', indian),
        ('south', south),
    )
    for region_name, members in regions:
        if cuisine in members:
            return region_name
    return None

# Derive each recipe's region from its cuisine label. Mapping over the single
# `cuisine` column gives the same labels as `X.apply(..., axis=1)` without
# materialising a full-width row Series for every recipe.
X['origin'] = X['cuisine'].map(lambda cuisine: label_origin({'cuisine': cuisine}))

# One color per region plus a spare: label_origin returns None for cuisines
# outside its lists, and that None shows up as an additional group when the
# distinct origins are enumerated for plotting.
colors = ['blue', 'green', 'red', 'yellow', 'pink', 'grey']
origins = X['origin'].values

# smarter method...
# smarter method...

In [None]:
# 2-component PCA of the recipe x ingredient matrix.
# The non-feature columns are dropped by name instead of with `iloc[:, :-3]`,
# so the result does not depend on how many helper columns have been appended
# to X by the time this cell runs.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(
    X.drop(['cuisine', 'number_of_ingredients', 'origin'], axis=1, errors='ignore')
)

In [None]:
# Print the distinct region labels present in `origins`.
print(set(origins))

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)

ax.set_title("Recipes by origin")
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")

# pd.unique keeps first-appearance order, so the region -> color pairing is
# the same on every run (iterating a set gives an arbitrary order) and it
# copes with missing labels mixed in with strings.
for i, cur_origin in enumerate(pd.unique(origins)):
    if pd.isnull(cur_origin):
        # Recipes whose cuisine is in none of the region lists.
        in_origin = pd.isnull(origins)
        legend_label = 'unlabelled'
    else:
        in_origin = origins == cur_origin
        legend_label = str(cur_origin)
    ax.scatter(
        X_pca[in_origin, 0],
        X_pca[in_origin, 1],
        # Wrap around instead of raising IndexError when there are more
        # groups than colors.
        c=colors[i % len(colors)],
        label=legend_label,
    )

# Without a legend the colors cannot be mapped back to regions.
ax.legend(loc='best')

fig.savefig('pca_origins.pdf')

Interpretation: the first two PCA components are not enough to represent the data. Together they explain only about 5.8% of the variance (explained variance ratio of the first two components: [0.0298653, 0.02821523]).

# PCA on the transposed matrix, labels instead of dots (sort the ingredients, take the most common ones)
# tSNE
# tf-idf

In [None]:
# Ingredient x recipe matrix. The helper columns are dropped by name BEFORE
# transposing: transposing X with its string columns still attached turns
# every cell into a Python object, and slicing the last three rows by
# position only works if exactly those three columns have been appended.
transposed = X.drop(['cuisine', 'number_of_ingredients', 'origin'], axis=1, errors='ignore').transpose()

In [None]:
# Row total over the recipe columns (presumably the number of recipes that
# use each ingredient, if the entries are 0/1 indicators — TODO confirm).
# `count` and `name` are excluded so re-running the cell does not add the
# previous total, or a text column, into the sum.
transposed['count'] = transposed.drop(['count', 'name'], axis=1, errors='ignore').sum(axis=1)

In [None]:
# Copy the row labels into a regular `name` column (the bar plot further down
# uses y='name').
transposed['name'] = transposed.index.values

In [None]:
# Preview the first rows of the transposed frame.
transposed.head()

In [None]:
# Rank the rows by `count` (largest first) and keep only the first 100.
# Note that this rebinds `transposed` to the truncated frame.
ranked = transposed.sort_values(['count'], ascending=False)
transposed = ranked.iloc[:100]
transposed.head()

In [None]:
# Column totals of the ingredient columns only. X also carries text columns
# (`cuisine`, `origin`) and the derived `number_of_ingredients`; summing those
# either fails or produces meaningless entries, so they are dropped by name.
ingr_count = X.drop(['cuisine', 'number_of_ingredients', 'origin'], axis=1, errors='ignore').sum(axis=0)

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)

# Keep the 100 rows with the largest `count`, largest first.
ranked = transposed.sort_values(['count'], ascending=False)
transposed = ranked.iloc[:100]

# Seaborn theme: enlarged fonts on a white grid.
sns.set(font_scale=2)
sns.set_style("whitegrid")

ax.set_title("Ingredients popularity distribution")

# Horizontal bars: one per row, labelled by `name`, bar length = `count`.
bar_options = dict(x='count', y='name', orient='h', label='big', color='lightseagreen')
_ = sns.barplot(data=transposed, ax=ax, **bar_options)
fig.tight_layout()
fig.savefig('ingredient_popularity.pdf')

In [None]:
# Row labels of `transposed` as a plain array; the bare name on the last line
# displays it as the cell output.
ingredients = transposed.index.values
ingredients

In [None]:
# First 100 rows of `transposed`, previewed with head().
test = transposed[:100]
test.head()

In [None]:
# NOTE(review): this PCA over the rows of `transposed` is disabled. While it
# stays commented out, `X_pca` keeps whatever projection was fitted earlier.
# Re-enabling it as written would also pass the `count` / `name` columns of
# `transposed` to PCA — confirm that is intended before uncommenting.
#pca = PCA(n_components=2)
#X_pca = pca.fit_transform(transposed[:100])

In [None]:
# 2-D PCA coordinates for the (up to) 100 ingredient rows kept in `transposed`.
# The previous code wrapped `X_pca` here, but that array comes from the PCA
# fitted on the recipe rows, so its length does not match the 100 ingredient
# labels. Fit a PCA on the ingredient rows themselves instead (this is what
# the disabled cell above was aiming for), leaving out the helper columns.
ingredient_features = transposed.drop(['count', 'name'], axis=1, errors='ignore').iloc[:100]
ingredient_pca = PCA(n_components=2).fit_transform(ingredient_features)
df = pd.DataFrame(ingredient_pca, index=ingredient_features.index, columns=['x', 'y'])

In [None]:
# Display the coordinate table built in the previous cell.
df

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)

ax.set_title("PCA of the most common ingredients")
ax.scatter(df['x'], df['y'])

# Write each row's label next to its point.
for word, pos in df.iterrows():
    ax.annotate(word, (pos['x'], pos['y']))

# Saved under its own name: a later cell also writes "PCA.pdf", which
# overwrote this figure on a full run.
fig.savefig("pca_top_ingredients.pdf")

In [None]:
# t-SNE embedding of the ingredient rows. `transposed` carries a text `name`
# column (and the derived `count`), which cannot be fed to the estimator, so
# both are removed first. Seed pinned so the embedding is reproducible.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(transposed.drop(['count', 'name'], axis=1, errors='ignore'))

In [None]:
# Display the raw t-SNE coordinates.
X_tsne

In [None]:
# Display the raw PCA coordinates currently bound to X_pca.
X_pca

In [None]:
# Reload the raw table. NOTE: this rebinds `X_train`, replacing the feature
# matrix loaded at the top of the notebook.
X_train = pd.read_csv('train_dataset.csv', sep='\t')

# Drop the saved index column AND the text `cuisine` label. Leaving `cuisine`
# in would put a row of strings into the transposed matrix that is later
# passed to PCA / TruncatedSVD, and a non-numeric entry into `ingr_count`.
X_train = X_train.drop(['Unnamed: 0', 'cuisine'], axis=1)

# Column totals, one per remaining column.
ingr_count = X_train.sum(axis=0)

In [None]:
# Swap rows and columns so each former column becomes a row.
# NOTE: the result is bound back to the same name, so running this cell a
# second time flips the frame back to its previous orientation.
X_train = X_train.transpose()

In [None]:
# Display the transposed frame.
X_train

In [None]:
# Attach the totals computed before the transpose as a `count` column.
# Assigned positionally (via .values), so this relies on the row order of
# X_train still matching the order of `ingr_count`.
X_train['count'] = ingr_count.values

In [None]:
# Order the rows from the largest `count` to the smallest.
X_train = X_train.sort_values(by='count', ascending=False)

In [None]:
# Display the sorted frame.
X_train

In [None]:
# Display every column except the last one (`count`, when the cells above
# were run once and in order).
X_train.iloc[:, :-1]

In [None]:
# 2-component PCA over the rows of the transposed frame.
# The helper columns are removed by name instead of with `iloc[:, :-1]`: a
# later cell appends a `name` column, after which the positional slice would
# drop `name` and feed `count` to PCA when this cell is re-run.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train.drop(['count', 'name'], axis=1, errors='ignore'))

In [None]:
# Row labels of X_train, used below to label the projected points.
names = X_train.index.values

In [None]:
# Wrap the 2-D PCA coordinates in a frame indexed by row label, with the two
# components exposed as columns `x` and `y`.
df = pd.DataFrame(X_pca, index=names, columns=['x', 'y'])

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(31.7, 28.27)
sns.set(font_scale=2)

ax.scatter(df['x'], df['y'])

# Write each row's label next to its point.
for word, pos in df.iterrows():
    ax.annotate(word, (pos['x'], pos['y']))

# Title spelling fixed ("ocurrences" -> "occurrences"); it ends up in the
# exported figure.
ax.set_title("Dimensionality reduction of recipe occurrences per ingredient using PCA")
fig.tight_layout()
fig.savefig("PCA.pdf")

> "It is highly recommended to use another dimensionality reduction
> method (e.g. PCA for dense data or TruncatedSVD for sparse data)
> to reduce the number of dimensions to a reasonable amount (e.g. 50)
> if the number of features is very high." (from the scikit-learn `TSNE` documentation)

In [None]:
from sklearn.decomposition import TruncatedSVD

In [None]:
# Reduce each row to 50 dimensions with TruncatedSVD before running t-SNE.
# Helper columns are dropped by name rather than with `iloc[:, :-1]`, so the
# input stays purely numeric even after the `name` column has been appended
# to X_train by a later cell.
# NOTE: this rebinds `X` from the recipe DataFrame to the reduced array.
svd = TruncatedSVD(n_components=50, n_iter=7, random_state=42)
X = svd.fit_transform(X_train.drop(['count', 'name'], axis=1, errors='ignore'))

In [None]:
from sklearn.manifold import TSNE

# 2-D t-SNE embedding of the SVD-reduced rows. The seed is pinned (same value
# as the TruncatedSVD step) so the embedding is reproducible between runs.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)

In [None]:
# Display the raw t-SNE coordinates.
X_tsne

In [None]:
# Wrap the t-SNE coordinates in a frame indexed by row label. The previous
# code passed `X_pca` here, so the "t-SNE" table was a copy of the PCA
# projection.
df_tsne = pd.DataFrame(X_tsne, index=names, columns=['x', 'y'])

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(31.7, 28.27)
sns.set(font_scale=2)

ax.scatter(df_tsne['x'], df_tsne['y'])

# Annotate from df_tsne, the same frame that was scattered. The previous loop
# iterated `df` (the PCA coordinates), so labels were placed at PCA positions
# rather than on the points drawn here.
for word, pos in df_tsne.iterrows():
    ax.annotate(word, (pos['x'], pos['y']))

# Title spelling fixed ("ocurrences" -> "occurrences"); it ends up in the
# exported figure.
ax.set_title("Dimensionality reduction of recipe occurrences per ingredient using Truncated SVD and tSNE")
fig.tight_layout()
fig.savefig("tSNE.pdf")

In [None]:
# Copy the row labels into a regular `name` column (the bar plot below uses
# y='name').
X_train['name'] = X_train.index.values

In [None]:
# Display the frame with the added `name` column.
X_train

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)

sns.set_style("whitegrid")

ax.set_title("The most popular ingredients")

# Plot only the rows with the largest `count`. The previous call passed the
# whole frame, i.e. one bar per row, which is neither "the most popular" nor
# readable. 100 matches the cut-off used for the earlier popularity plot.
# Only the two columns seaborn needs are handed over.
top_ingredients = (
    X_train[['name', 'count']]
    .sort_values('count', ascending=False)
    .head(100)
)

_ = sns.barplot(ax=ax, x='count', y='name', data=top_ingredients, orient='h', label='big', color='lightseagreen')
fig.tight_layout()
fig.savefig('ingredients_popularity.pdf')