In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Correlated data**
- Pearson correlation measures the linear correlation between features
- Its value lies between -1 and 1
- A value of 0 means no linear correlation

In [None]:
# Load the grains dataset; the following cells scatter-plot width vs length
# and measure their Pearson correlation.
grains_csv_path = '../input/seeds-grains-data-set/grains_2.csv'
grains_df = pd.read_csv(grains_csv_path)

# Preview the first rows
grains_df.head()

In [None]:
# Drop the 'Class' column from the grains table.
# NOTE: grains_df is rebound to the reduced frame, so executing this cell a
# second time raises KeyError ('Class' is already gone) -- reload the CSV first.
grains_df = grains_df.drop(columns='Class')

# Two-column view holding only the width and length measurements: grains_df2
grains_df2 = grains_df[['width', 'length']]



In [None]:
# Imports used by this cell
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# Pull out the two measurements being compared
width, length = grains_df['width'], grains_df['length']

# Square figure with width on the x-axis and length on the y-axis;
# 'equal' gives both axes the same scale
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(width, length)
ax.axis('equal')
plt.show()

# Pearson correlation coefficient (and its p-value) of width against length
correlation, pvalue = pearsonr(width, length)

# Report the coefficient
print(f'Pearson correlation: {correlation}')

**Decorrelating the grain measurements with PCA**

In [None]:
# Use PCA to decorrelate the width and length measurements, then plot the
# decorrelated points and measure their Pearson correlation.

# Import PCA
from sklearn.decomposition import PCA

# Create PCA instance: model
model = PCA()

# Fit on the width/length frame only and project it onto its principal axes.
# The columns of the result are principal components, NOT the original
# features: fitting on every column of grains_df and indexing the output with
# 3 and 4 would pick the 4th and 5th components of the whole table rather than
# the decorrelated width/length pair.
pca_features = model.fit_transform(grains_df2)

# Scores along the first principal component: xs
xs = pca_features[:, 0]

# Scores along the second principal component: ys
ys = pca_features[:, 1]

# Scatter plot xs vs ys
plt.scatter(xs, ys)
plt.axis('equal')
plt.show()

# Calculate the Pearson correlation of xs and ys
# (PCA features are uncorrelated, so this should be ~0 up to rounding)
correlation, pvalue = pearsonr(xs, ys)

# Display the correlation
print(correlation)



**Use PCA to find the first principal component of the length and width measurements of the grain samples, and represent it as an arrow on the scatter plot.**

In [None]:
# Scatter the raw, untransformed (width, length) points
plt.scatter(width, length)

# Fit a fresh PCA to the two-column width/length frame: model
model = PCA().fit(grains_df2)

# Per-feature mean of the samples, as learned during fit: mean
mean = model.mean_

# First row of the component matrix, i.e. the first principal component: first_pc
first_pc = model.components_[0]

# Draw first_pc as a red arrow whose tail sits at the mean
origin_x, origin_y = mean
delta_x, delta_y = first_pc
plt.arrow(origin_x, origin_y, delta_x, delta_y, color='red', width=0.01)

# Same scale on both axes so the arrow's direction is not distorted
plt.axis('equal')
plt.show()

**Variance of the PCA features**
- The fish dataset is 6-dimensional. But what is its intrinsic dimension? 
- Make a plot of the variances of the PCA features to find out. 
- You'll need to standardize the features first.

In [None]:
# Load the fish measurements table from the Kaggle input directory
fish_csv_path = '../input/fish-measurements-dataset/fish_measurements.csv'
fish_df = pd.read_csv(fish_csv_path)

In [None]:
# Every column of the fish table except 'species': samples
samples = fish_df.drop(columns='species')

In [None]:
# Imports used by this cell
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardizing step and a PCA that keeps every component: scaler, pca
scaler = StandardScaler()
pca = PCA()

# Chain the two steps and fit them on the fish samples: pipeline
# (fit returns the pipeline itself, so the fitted object is what gets bound)
pipeline = make_pipeline(scaler, pca).fit(samples)

# Bar chart of the variance explained by each PCA feature
features = range(pca.n_components_)
plt.bar(features, pca.explained_variance_)
plt.xticks(features)
plt.xlabel('PCA feature')
plt.ylabel('variance')
plt.show()

It looks like PCA features 0 and 1 have significant variance, so the intrinsic dimension of the dataset appears to be 2

In [None]:
# Keep only the 2 components that the variance plot singled out: pca
pca = PCA(n_components=2)

# Rebuild the scaler -> PCA pipeline around the reduced model and fit it: pipeline
pipeline = make_pipeline(scaler, pca).fit(samples)

# Standardize the samples and project them onto the 2 components: pca_features
pca_features = pipeline.transform(samples)

# Report the dimensions of the reduced array
print(pca_features.shape)

We have now reduced the dimensionality from 6 to 2.

**tf-idf word-frequency array**
- ***TfidfVectorizer*** transforms a list of documents into a word-frequency array, which it outputs as a csr_matrix
- A csr_matrix is a sparse array: it remembers only the non-zero entries, which saves space
- scikit-learn's PCA doesn't support csr_matrix; use scikit-learn's TruncatedSVD instead (it performs the same transformation)


In [None]:
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Three toy documents to vectorize: documents
documents = [
    'cats say meow',
    'dogs say woof',
    'dogs chase cats',
]

# Vectorizer with default settings: tfidf
tfidf = TfidfVectorizer()

# Learn the vocabulary and build the sparse tf-idf matrix in one step: csr_mat
csr_mat = tfidf.fit_transform(documents)

# Show the matrix in dense form (one row per document)
dense_weights = csr_mat.toarray()
print(dense_weights)

# Vocabulary terms, one per matrix column: words
words = tfidf.get_feature_names_out()

# Show the terms
print(words)

**Clustering Wikipedia**
- TruncatedSVD is able to perform PCA on sparse arrays in csr_matrix format, such as word-frequency arrays
- Here we use TruncatedSVD and k-means to cluster some popular pages from Wikipedia



In [None]:
# Load the Wikipedia article vectors from the Kaggle input directory
wikipedia_csv_path = '../input/wikipedia-articles-vector-file/wikipedia-vectors.csv'
df = pd.read_csv(wikipedia_csv_path)

In [None]:
# Discard the 'Unnamed: 0' column and preview what is left: articles
articles = df.drop(columns='Unnamed: 0')
articles.head()

In [None]:
# Column labels of articles as a plain Python list: titles
titles = articles.columns.tolist()

In [None]:
# Perform the necessary imports
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

# Both estimators below are randomized; a fixed seed makes the reduction and
# the resulting cluster labels identical on every Restart & Run All.
SEED = 42

# Create a TruncatedSVD instance that reduces the input to 50 components: svd
svd = TruncatedSVD(n_components=50, random_state=SEED)

# Create a KMeans instance that looks for 6 clusters: kmeans
kmeans = KMeans(n_clusters=6, random_state=SEED)

# Create a pipeline that applies svd and then kmeans: pipeline
pipeline = make_pipeline(svd, kmeans)

In [None]:
# Inspect the (rows, columns) dimensions of articles
articles.shape

In [None]:
# Swap the rows and columns of articles.
# NOTE: articles is rebound to its own transpose, so running this cell a second
# time flips the table back to its original orientation.
articles = articles.transpose()

In [None]:
# Train the SVD + k-means pipeline on articles, then ask the fitted pipeline
# for the cluster of each of those same rows: labels
labels = pipeline.fit(articles).predict(articles)

# Pair every cluster label with its article title: df
# NOTE: this rebinds df, which until now held the raw CSV contents.
df = pd.DataFrame(dict(label=labels, article=titles))

# Print the table ordered by cluster label
print(df.sort_values(by='label'))

Take a look at the cluster labels and see if you can identify any patterns