In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _, names in os.walk('/kaggle/input'):
    # Emit the full path of every file found beneath the input directory.
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Discovering interpretable features

# ***Non-negative matrix factorization (NMF)***
- dimension reduction technique
- NMF models are interpretable (unlike PCA)
- All sample features must be non-negative (>= 0)
- NMF expresses documents as combinations of topics (or 'themes')
- NMF expresses images as combinations of patterns
- works with numpy arrays and csr_matrix

**NMF applied to Wikipedia articles**
****

In [None]:
# apply NMF using the tf-idf word-frequency array of Wikipedia articles, given as a csr matrix 

In [None]:
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# Load the word-frequency table of the Wikipedia articles from CSV.
df = pd.read_csv('../input/wikipedia-articles-vector-file/wikipedia-vectors.csv')
# Discard the 'Unnamed: 0' column; every other column is kept unchanged.
articles = df.drop(columns='Unnamed: 0')
articles.head()

In [None]:
# The article titles are the column labels of the frame as loaded, so they
# must be captured before `articles` is transposed.
titles = articles.columns.tolist()

In [None]:
# Transpose so that each row is an article and each column is a word.
# The guard makes this cell safe to re-run: the frame is flipped only while
# its columns are still the article titles, so a second execution does not
# silently transpose it back.
if articles.columns.tolist() == titles:
    articles = articles.T
articles.head()

**NMF features of the wikipedia articles**
****

In [None]:
# Import NMF
from sklearn.decomposition import NMF

# Create an NMF instance: model
# random_state is pinned so the factorization is reproducible from run to run.
model = NMF(n_components=6, random_state=0)

# Fit the model and obtain the transformed articles in one pass: nmf_features
# (fit() followed by transform() would solve for the features a second time.)
nmf_features = model.fit_transform(articles)

# Print the NMF features
# print(nmf_features.round(2))

In [None]:
# Inspect the NMF features computed for the articles

# Wrap the feature array in a DataFrame keyed by article title: df
df = pd.DataFrame(data=nmf_features, index=titles)

print(df.head())

# Print the feature row of each actor, each preceded by a blank-line separator
for actor in ('Anne Hathaway', 'Denzel Washington'):
    print('\n')
    print(df.loc[actor])

When investigating the features, notice that for both actors, NMF feature 3 has by far the highest value. This means that both articles are reconstructed mainly from the NMF component with index 3 (the fourth component, counting from 0).

**NMF learns topics of documents**
****

In [None]:
#  identify the topic of the corresponding NMF component

# read words list from .txt file and convert to a python list
# na_filter=False keeps every token as literal text; with pandas' default NA
# detection, vocabulary entries such as "null", "nan" or "NA" would be parsed
# as missing values and later show up as NaN column labels.
words = pd.read_csv('../input/wikipediaarticleswords/wikipedia-vocabulary-utf8.txt', sep=" ", header=None, na_filter=False)
words = words[0].values.tolist()

In [None]:
# One row per NMF component, one column per vocabulary word: components_df
components_df = pd.DataFrame(data=model.components_, columns=words)

# Report (number of components, number of words)
print(components_df.shape)
print('\n')

# Pick the component at position 3: component
component = components_df.iloc[3]

# nlargest() with its default count returns the five highest-valued words
top_words = component.nlargest()
print(top_words)

Take a moment to recognise the topics that the articles about Anne Hathaway and Denzel Washington have in common

**Explore the LED digits dataset**
****

In [None]:
# Explore the LED digits dataset
# use NMF to decompose grayscale images into their commonly occurring patterns.
# Firstly, explore the image dataset and see how it is encoded as an array.
# You are given 100 images as a 2D array, where each row represents a single 13x8 image

In [None]:
# load dataset and convert to np array
# NOTE(review): read_csv treats the first line of the file as a header by
# default; if lcd_digits.csv has no header row, the first image is consumed
# as column names -- confirm against the file and pass header=None if so.
df = pd.read_csv('../input/lcd-digits/lcd_digits.csv')

# Each row of `samples` is later reshaped to 13x8, so rows are flat images.
samples = df.to_numpy()


In [None]:
# Take the first sample (row 0) as a flat pixel vector: digit
digit = samples[0]

# Show the raw 1-D encoding
print('digit encoding:')
print(digit)
print('\n')

# Fold the flat vector into 13 rows by 8 columns: bitmap
bitmap = digit.reshape((13, 8))

# Show the 2-D encoding
print('bitmap encoding:')
print(bitmap)
print('\n')

# Render the bitmap as a grayscale image with a colour scale
plt.imshow(bitmap, cmap='gray', interpolation='nearest')
plt.colorbar()
plt.show()

**NMF learns the parts of images**
****

In [None]:
# displays the image encoded by any 1D array:
def show_as_image(sample, shape=(13, 8)):
    """Display a flat 1D array as a grayscale image.

    Parameters
    ----------
    sample : array with a ``reshape`` method
        Flat pixel values; must hold exactly ``shape[0] * shape[1]`` entries.
    shape : tuple of int, optional
        (rows, columns) of the image. Defaults to (13, 8), the size of the
        digit images used in this notebook.

    Returns
    -------
    None
        The image is drawn in a new matplotlib figure and shown immediately.
    """
    bitmap = sample.reshape(shape)
    plt.figure()
    plt.imshow(bitmap, cmap='gray', interpolation='nearest')
    plt.colorbar()
    plt.show()

In [None]:
# Import NMF
from sklearn.decomposition import NMF

# Create an NMF model: model
# NOTE: this rebinds `model`, which earlier held the NMF fitted on the
# Wikipedia articles; re-run that cell before reusing its components.
# random_state is pinned so the learned patterns are reproducible.
model = NMF(n_components=7, random_state=0) # (7 is the number of cells in an LED display).

# Apply fit_transform to samples: features
features = model.fit_transform(samples)

# Call show_as_image on each component
for component in model.components_:
    show_as_image(component)

# Select the 0th row of features: digit_features
digit_features = features[0,:]

# Print digit_features
print(digit_features)

Take a moment to look through the plots and notice how NMF has expressed the digit as a sum of the components!

# ***Building recommender systems using NMF***

**Finding similar articles**
- Engineer at a large online newspaper
- Task: recommend articles similar to the article being read by the customer
- Similar articles should have similar topics

Strategy: 
- Apply NMF to the word-frequency array 
- NMF feature values describe the topics.
- ..so similar documents have similar NMF feature values
- Compare NMF feature values?




In [None]:
# Recommend similar articles by comparing NMF features with the cosine similarity.
# Applied here to the NMF model of popular Wikipedia articles: find the articles
# most similar to the one about the footballer Cristiano Ronaldo.

# Perform the necessary imports
from sklearn.preprocessing import normalize

# Normalize the rows of NMF features: norm_features
norm_features = normalize(nmf_features)

# Label the normalized rows by article title: df
df = pd.DataFrame(data=norm_features, index=titles)

# Pull out the feature vector for 'Cristiano Ronaldo': article
article = df.loc['Cristiano Ronaldo']

# Dot every row with that vector: similarities
similarities = df.dot(article)

# Show the articles with the largest cosine similarity
top_matches = similarities.nlargest()
print(top_matches)

**Recommend musical artists**
- build a pipeline and transform the array into normalized NMF features
- `MaxAbsScaler` transforms the data so that all users have the same influence on the model, regardless of how many different artists they've listened to
- use the resulting normalized NMF features for recommendation

You are given a sparse array artists whose rows correspond to artists and whose columns correspond to users. The entries give the number of times each artist was listened to by each user.

In [None]:
# Load the listening data; the pivot step below reads its 'artist_offset',
# 'user_offset' and 'playcount' columns.
df = pd.read_csv('../input/music-dataset/Musical artists/scrobbler-small-sample.csv')
df.head()

In [None]:
# Summary statistics of the listening data (pandas `describe` defaults).
df.describe()

In [None]:
# create pivot table. index= artists, cols = users, entries = playcount, NaN values = 0
# aggfunc='sum' so that repeated (artist, user) rows add up their play counts;
# the pivot_table default ('mean') would average them instead.
table = pd.pivot_table(df, values=['playcount'], columns=['user_offset'], index=['artist_offset'], aggfunc='sum', fill_value=0)

In [None]:
# Per-column summary statistics of the artist-by-user play-count table.
table.describe()

In [None]:
# Pack the pivot table's values into a compressed sparse row matrix for use in NMF
from scipy.sparse import csr_matrix

play_counts = table.to_numpy()
artist_csr_matrix = csr_matrix(play_counts)

print(artist_csr_matrix)

In [None]:
# Read the header-less artists file and keep its first column as a plain Python list
artist_names = pd.read_csv('../input/music-dataset/Musical artists/artists.csv', header=None)[0].tolist()
artist_names

In [None]:
# compute the normalized NMF features

# Perform the necessary imports
from sklearn.decomposition import NMF
from sklearn.preprocessing import Normalizer, MaxAbsScaler
from sklearn.pipeline import make_pipeline

# Create a MaxAbsScaler: scaler
scaler = MaxAbsScaler()

# Create an NMF model: nmf
# random_state is pinned so the recommendations are reproducible across runs.
nmf = NMF(n_components=20, random_state=0)

# Create a Normalizer: normalizer
normalizer = Normalizer()

# Create a pipeline: pipeline (scale -> factorize -> normalize rows)
pipeline = make_pipeline(scaler, nmf, normalizer)

# Apply fit_transform to artists: norm_features
# NOTE: this rebinds `norm_features`, previously the normalized article features.
norm_features = pipeline.fit_transform(artist_csr_matrix)

**Versions of articles**
- Different versions of the same document have same topic proportions
- ...exact feature values may be different!
- E.g. because one version uses many meaningless words
- But all versions lie on the same line through the origin

**Cosine similarity**
- Uses the angle between the lines
- Higher values mean more similar
- Maximum value is 1, when angle is 0 degrees


In [None]:
# Recommend similar musical artists from the normalized NMF features via the cosine similarity

# Index the feature rows by artist name: df
df = pd.DataFrame(data=norm_features, index=artist_names)

# Feature vector of 'Bruce Springsteen': artist
artist = df.loc['Bruce Springsteen']

# Dot product of every artist's features with that vector: similarities
similarities = df.dot(artist)

# Show the artists with the highest cosine similarity
closest = similarities.nlargest()
print(closest)