<a href="https://colab.research.google.com/github/Ricardomanuel1/Maestria_Ciencia_de_Datos/blob/main/2.2%20VISUALIZACI%C3%93N/DS_MultidimensionalDataI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Dealing with Multidimensional Data I**



**Loading Data from Google Drive**

In [None]:
!pip install -U -q PyDrive

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Build an authenticated PyDrive client for this Colab session.
# NOTE(review): authenticate_user() presumably has to run before the
# application-default credentials are requested below - keep this order.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default Google credentials to PyDrive.
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client object the download cell uses to fetch files.
drive = GoogleDrive(gauth)

In [None]:

# Google Drive id of the file to fetch; replace it with the id of the file you want to access.
WINE_FILE_ID = '1P3_2H_B-lR4zdCU5HRrprhoZ4ByxmqUM'

# Reference the Drive file by id and save its contents locally as wine.csv.
downloaded = drive.CreateFile({'id': WINE_FILE_ID})
downloaded.GetContentFile('wine.csv')

In [None]:
import pandas as pd

# Read the local wine.csv file and preview its first rows.
# NOTE: despite its name, `series` holds a DataFrame (read_csv returns one);
# the name is kept so any later reference keeps working.
series = pd.read_csv('wine.csv')

# A bare last expression is rendered by the notebook as a rich table,
# which is easier to read than the plain-text output of print().
series.head()

## ***Processing***

### *Feature Selection*

#### Removing features with low variance

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Toy data: six samples with three binary features.
# The third column is constant (variance 0); the first two columns have
# variances of roughly 0.14 and 0.22.
X = [
    [0, 0, 1],
    [0, 1, 1],
    [1, 0, 1],
    [0, 1, 1],
    [0, 1, 1],
    [0, 1, 1],
]

# Fit a selector with a variance threshold of 0.1, then display the
# matrix that remains after the selection is applied.
sel = VarianceThreshold(threshold=0.1).fit(X)
sel.transform(X)

#### Univariate Selection

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Load the UCI wine data set. The file has no header row, so the column
# names are supplied explicitly.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
cols =  ['Class', 'Alcohol', 'MalicAcid', 'Ash', 'AlcalinityOfAsh', 'Magnesium', 'TotalPhenols',
         'Flavanoids', 'NonflavanoidPhenols', 'Proanthocyanins', 'ColorIntensity',
         'Hue', 'OD280/OD315', 'Proline']
data = pd.read_csv(url, names=cols)

# Column 0 is the class label; every remaining column is a feature.
# The slice is open-ended so that the last two columns ('OD280/OD315' and
# 'Proline') are part of the feature matrix as well.
X = data.iloc[:, 1:]  # independent columns (all 13 features)
y = data.iloc[:, 0]   # target column

# Apply SelectKBest to score every feature against the class label with the
# chi-squared statistic and keep the 5 best. chi2 expects non-negative values.
bestfeatures = SelectKBest(score_func=chi2, k=5)
fit = bestfeatures.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

# Put the feature names next to their scores for easier reading.
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  # naming the dataframe columns
print(featureScores.nlargest(5, 'Score'))  # print 5 best features



#### Feature Importance

In [None]:
import pandas as pd
import numpy as np

from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt

# Load the UCI wine data set. The file has no header row, so the column
# names are supplied explicitly.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
cols =  ['Class', 'Alcohol', 'MalicAcid', 'Ash', 'AlcalinityOfAsh', 'Magnesium', 'TotalPhenols',
         'Flavanoids', 'NonflavanoidPhenols', 'Proanthocyanins', 'ColorIntensity',
         'Hue', 'OD280/OD315', 'Proline']
data = pd.read_csv(url, names=cols)

# Column 0 is the class label; every remaining column is a feature.
# The slice is open-ended so that the last two columns ('OD280/OD315' and
# 'Proline') are part of the feature matrix as well.
X = data.iloc[:, 1:]  # independent columns (all 13 features)
y = data.iloc[:, 0]   # target column

# Extra-trees are randomized; a fixed seed keeps the importances (and the
# plot below) identical across runs.
model = ExtraTreesClassifier(random_state=42)
model.fit(X, y)

# Use the built-in feature_importances_ of the tree-based classifier and
# plot every feature, ordered by importance, as horizontal bars.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
ax = feat_importances.nlargest(len(feat_importances)).plot.barh()
ax.set(xlabel='Importance', title='ExtraTreesClassifier feature importances')
plt.show()

#### Correlation Matrix with Heatmap

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt


# Load the UCI wine data set. The file has no header row, so the column
# names are supplied explicitly.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
cols =  ['Class', 'Alcohol', 'MalicAcid', 'Ash', 'AlcalinityOfAsh', 'Magnesium', 'TotalPhenols',
         'Flavanoids', 'NonflavanoidPhenols', 'Proanthocyanins', 'ColorIntensity',
         'Hue', 'OD280/OD315', 'Proline']
data = pd.read_csv(url, names=cols)

print(data.shape)

# Column 0 is the class label; every remaining column is a feature.
# The slice is open-ended so that the last two columns ('OD280/OD315' and
# 'Proline') are part of the feature matrix as well.
X = data.iloc[:, 1:]  # independent columns (all 13 features)
y = data.iloc[:, 0]   # target column

# Pairwise Pearson correlation between all features (computed once and
# reused for the plot).
corrmat = X.corr(method="pearson")

# Plot the heat map. vmin/vmax pin the diverging colormap to the full
# correlation range so that its neutral midpoint corresponds to 0.
plt.figure(figsize=(15, 15))
g = sns.heatmap(corrmat, annot=True, cmap="seismic", vmin=-1, vmax=1)

### *Feature Extraction*

#### Principal Component Analysis (PCA)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.decomposition import PCA as sklearnPCA

# UCI wine data set: the file has no header row, so names are passed in.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
cols = [
    'Class', 'Alcohol', 'MalicAcid', 'Ash', 'AlcalinityOfAsh', 'Magnesium',
    'TotalPhenols', 'Flavanoids', 'NonflavanoidPhenols', 'Proanthocyanins',
    'ColorIntensity', 'Hue', 'OD280/OD315', 'Proline',
]
data = pd.read_csv(url, names=cols)

# Class labels on one side, the feature columns ('Alcohol' onward) on the other.
y = data['Class']
X = data.loc[:, 'Alcohol':]

# Min-max scaling: subtract each column's minimum and divide by its range.
col_min = X.min()
col_range = X.max() - col_min
X_norm = (X - col_min) / col_range

# Reduce the scaled features to two principal components.
pca = sklearnPCA(n_components=2)
transformed = pd.DataFrame(pca.fit_transform(X_norm))

# Scatter the two components, colouring each sample by its class label.
pc1, pc2 = transformed[0], transformed[1]
plt.scatter(pc1, pc2, c=y.astype(float), s=50)
plt.show()


#### Linear Discriminant Analysis (LDA)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA


# UCI wine data set: the file has no header row, so names are passed in.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
cols = [
    'Class', 'Alcohol', 'MalicAcid', 'Ash', 'AlcalinityOfAsh', 'Magnesium',
    'TotalPhenols', 'Flavanoids', 'NonflavanoidPhenols', 'Proanthocyanins',
    'ColorIntensity', 'Hue', 'OD280/OD315', 'Proline',
]
data = pd.read_csv(url, names=cols)

# Class labels on one side, the feature columns ('Alcohol' onward) on the other.
y = data['Class']
X = data.loc[:, 'Alcohol':]

# Min-max scaling: subtract each column's minimum and divide by its range.
feature_min = X.min()
feature_span = X.max() - feature_min
X_norm = (X - feature_min) / feature_span

# LDA is supervised: the class labels are passed to the fit together with
# the scaled features, and the data is projected onto two discriminants.
lda = LDA(n_components=2)
lda_transformed = pd.DataFrame(lda.fit_transform(X_norm, y))

# Scatter the two discriminant axes, colouring each sample by its class label.
ld1, ld2 = lda_transformed[0], lda_transformed[1]
plt.scatter(ld1, ld2, c=y.astype(float), s=50)
plt.show()

#### t-distributed Stochastic Neighbor Embedding (tSNE)
Reference: [scikit-learn `TSNE` documentation](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.manifold import TSNE


# Load the UCI wine data set. The file has no header row, so the column
# names are supplied explicitly.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
cols =  ['Class', 'Alcohol', 'MalicAcid', 'Ash', 'AlcalinityOfAsh', 'Magnesium', 'TotalPhenols',
         'Flavanoids', 'NonflavanoidPhenols', 'Proanthocyanins', 'ColorIntensity',
         'Hue', 'OD280/OD315', 'Proline']

data = pd.read_csv(url, names=cols)

y = data['Class']          # Split off classifications
X = data.loc[:, 'Alcohol':] # Split off features

# Min-max scaling: subtract each column's minimum and divide by its range.
X_norm = (X - X.min())/(X.max() - X.min())

# perplexity: small values emphasise local structure, bigger values more
# global structure.
# random_state: t-SNE is stochastic, so a fixed seed is needed to get the
# same embedding (and the same plot) on every run.
X_tsne = TSNE(n_components=2, perplexity=50, random_state=42).fit_transform(X_norm)

# Scatter the 2-D embedding, colouring each sample by its class label.
plt.scatter(X_tsne[:,0], X_tsne[:,1], c=y.astype(float), s=50)

plt.show()

In [None]:
!pip install umap-learn

#### Uniform Manifold Approximation and Projection (UMAP)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

import umap

# Load the UCI wine data set. The file has no header row, so the column
# names are supplied explicitly.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
cols =  ['Class', 'Alcohol', 'MalicAcid', 'Ash', 'AlcalinityOfAsh', 'Magnesium', 'TotalPhenols',
         'Flavanoids', 'NonflavanoidPhenols', 'Proanthocyanins', 'ColorIntensity',
         'Hue', 'OD280/OD315', 'Proline']

data = pd.read_csv(url, names=cols)

y = data['Class']          # Split off classifications
X = data.loc[:, 'Alcohol':] # Split off features

# Min-max scaling: subtract each column's minimum and divide by its range.
X_norm = (X - X.min())/(X.max() - X.min())

# UMAP is stochastic; a fixed random_state makes the embedding (and the
# plot) reproducible across runs.
u = umap.UMAP(n_neighbors=70, min_dist=0.0, n_components=2, metric='euclidean',
              random_state=42)
X_umap = u.fit_transform(X_norm)

# Scatter the 2-D embedding, colouring each sample by its class label.
plt.scatter(X_umap[:,0], X_umap[:,1], c=y.astype(float), s=50)

plt.show()