<a href="https://colab.research.google.com/github/RebecaJmz/DIPLOMADO_CIENCIA_DE_DATOS/blob/main/002_TAREAS/006_Modelos_de_Clasificaci%C3%B3n_y_PCA/020_T_Algoritmos_de_clasificaci%C3%B3n_y_PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

CLASE 20 - MARTES 13 DE JUNIO DEL 2023

# **ANÁLISIS DE COMPONENTES PRINCIPALES (PCA) APLICADO A LA BASE DE DATOS IRIS**

# **IMPORTAR LIBRERÍAS**

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn import datasets as db
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA

# **CARGAR DATOS DENTRO DEL NOTEBOOK**

In [None]:
# Load the Iris dataset shipped with scikit-learn and build a DataFrame whose
# last column holds the species name as a categorical.
iris = db.load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    Species=pd.Categorical.from_codes(iris.target, categories=iris.target_names)
)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


**DISTRIBUCIÓN DE LAS PLANTAS DE ACUERDO A SU ESPECIE**

In [None]:
# Number of samples per species.
class_counts = df['Species'].value_counts()
class_names = iris.target_names  # retained so any later cell reading it still works

print("Class Distribution:")
# Walk label/count pairs together. value_counts() orders rows by frequency, so
# looking names up positionally in iris.target_names could pair a count with the
# wrong species. sort_index() lists the classes in category order.
for class_name, count in class_counts.sort_index().items():
    percentage = (count / len(df)) * 100
    print(f"{class_name}: {count} ({percentage:.2f}%)")

Class Distribution:
setosa: 50 (33.33%)
versicolor: 50 (33.33%)
virginica: 50 (33.33%)


In [None]:
# Bar chart of samples per species, annotated with the raw counts.
fig = px.bar(
    x=class_counts.index,
    y=class_counts.values,
    text=class_counts.values,
    labels={'x': 'Class', 'y': 'Count'},
    title='Target Class Distribution',
    template='plotly_white',
)
fig.show()

**HISTOGRAMAS DE LA DISTRIBUCIÓN DE LAS PLANTAS DE ACUERDO A SUS CARACTERÍSTICAS**

In [None]:
features = iris.feature_names

print("\nFeature Distributions by Class:")
# One overlaid, semi-transparent histogram per feature, coloured by species.
for feature in features:
    fig = px.histogram(
        df,
        x=feature,
        color='Species',
        barmode='overlay',
        title=f"{feature} Distribution by Class",
    )
    fig.update_layout(barmode='overlay').update_traces(opacity=0.75)
    fig.show()


Feature Distributions by Class:


**MATRIZ DE CORRELACIÓN**

In [None]:
# Correlate only the numeric feature columns. 'Species' is categorical, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 (earlier
# releases dropped them, with a deprecation warning in 1.5).
corr_matrix = df.select_dtypes(include='number').corr()

# Annotated heatmap on a fixed [-1, 1] diverging scale; the colour bar is hidden
# because each cell already shows its value.
fig = px.imshow(
    corr_matrix,
    color_continuous_scale='RdBu',
    range_color=[-1, 1],
    text_auto=True,
    aspect="auto",
    labels=dict(color="Correlation"),
    title='Feature Correlation',
    height=800,
)
fig.update_layout(coloraxis_showscale=False)
fig.show()





**DEFINIR FUNCIÓN PARA GRAFICAR LA MATRIZ DE CONFUSIÓN**

In [None]:
def plot_confusion_matrix(cm, labels, title="Confusion Matrix"):
    """Display a confusion matrix as an annotated Plotly heatmap.

    Parameters
    ----------
    cm : array-like
        Matrix of counts handed to ``px.imshow``. The y axis is labelled
        "Actual" and the x axis "Predicted".
    labels : sequence
        Tick labels applied to both axes; they must be in the same order as
        the rows/columns of ``cm``.
    title : str, optional
        Figure title. Defaults to "Confusion Matrix", so existing two-argument
        calls render exactly as before.

    Returns
    -------
    None
        The figure is shown with ``fig.show()`` and not returned.
    """
    fig_cm = px.imshow(
        cm,
        x=labels,
        y=labels,
        labels=dict(x="Predicted", y="Actual", color="Count"),
        color_continuous_scale='Viridis',
        text_auto=True,  # print the count inside every cell
        title=title,
    )
    # Counts are already written in the cells, so the colour bar is redundant.
    fig_cm.update_layout(coloraxis_showscale=False)
    fig_cm.show()

# **ANÁLISIS DE COMPONENTES PRINCIPALES PCA**

In [None]:
# Feature matrix and class labels straight from the scikit-learn bunch.
X, y = iris.data, iris.target

In [None]:
# Standardize the data: learn the scaling parameters from X, then apply them to X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# PCA CON 2 COMPONENTES

In [None]:
# Project the standardized features onto the first two principal components.
pca_2d = PCA(n_components=2)
X_pca_2d = pca_2d.fit_transform(X_scaled)
pca_df_2d = pd.DataFrame(data=X_pca_2d, columns=['PC1', 'PC2'])
# Store species names as a categorical (as df['Species'] does) instead of the
# integer codes in y. A numeric colour column makes Plotly Express draw a
# continuous colour bar rather than one legend entry per class.
pca_df_2d['Species'] = pd.Categorical.from_codes(y, categories=iris.target_names)
pca_df_2d.head()

Unnamed: 0,PC1,PC2,Species
0,-2.264703,0.480027,0
1,-2.080961,-0.674134,0
2,-2.364229,-0.341908,0
3,-2.299384,-0.597395,0
4,-2.389842,0.646835,0


In [None]:
# Scatter of the two principal components, coloured by the 'Species' column.
fig_2d = px.scatter(
    pca_df_2d,
    x='PC1',
    y='PC2',
    color='Species',
    title='PCA - 2 Components',
    template='plotly_white',
)
fig_2d.show()

In [None]:
# Fraction of the total variance captured by each retained component.
explained_variance_2d = pca_2d.explained_variance_ratio_
print(
    "Explained Variance Ratio (2D):",
    explained_variance_2d,
)

Explained Variance Ratio (2D): [0.72962445 0.22850762]


# *MODELOS DE CLASIFICACIÓN PCA = 2*

In [None]:
# Hold out 20% of the rows for evaluation; every column except the last
# ('Species') is a feature. The seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1],
    df['Species'],
    test_size=0.2,
    random_state=7,
)

In [None]:
# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit a 2-component PCA on the scaled training data and project both splits
# with the components learned from the training data.
pca = PCA(n_components=2)
X_train_pca_2d = pca.fit_transform(X_train_scaled)
X_test_pca_2d = pca.transform(X_test_scaled)

**REGRESIÓN LOGÍSTICA**

In [None]:
# Logistic regression (default hyper-parameters) on the 2-component projection.
lr_pca_2d = LogisticRegression().fit(X_train_pca_2d, y_train)
lr_pca_2d_pred = lr_pca_2d.predict(X_test_pca_2d)

In [None]:
# Score the logistic-regression predictions on the held-out split.
# NOTE(review): for single-label multiclass targets, micro-averaged precision,
# recall and F1 all equal accuracy; "macro" or "weighted" would add information.
lr_pca_2d_accuracy = accuracy_score(y_test, lr_pca_2d_pred)
lr_pca_2d_precision = precision_score(y_test, lr_pca_2d_pred, average="micro")
lr_pca_2d_recall = recall_score(y_test, lr_pca_2d_pred, average="micro")
lr_pca_2d_f1 = f1_score(y_test, lr_pca_2d_pred, average="micro")

lr_pca_2d_report = classification_report(y_test, lr_pca_2d_pred)
print("Logistic Regression PCA 2D Classification Report:", lr_pca_2d_report, sep="\n")

Logistic Regression PCA 2D Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         7
  versicolor       0.67      0.83      0.74        12
   virginica       0.75      0.55      0.63        11

    accuracy                           0.77        30
   macro avg       0.81      0.79      0.79        30
weighted avg       0.78      0.77      0.76        30



In [None]:
# Actual-vs-predicted counts for logistic regression, drawn as a heatmap.
# NOTE(review): the tick labels are hard-coded and presumably match the class
# order used by confusion_matrix — confirm.
lr_pca_2d_cm = confusion_matrix(y_true=y_test, y_pred=lr_pca_2d_pred)
plot_confusion_matrix(cm=lr_pca_2d_cm, labels=['SETOSA', 'VERSICOLOR', 'VIRGINICA'])

**KNN**

In [None]:
# k-nearest neighbours (default hyper-parameters) on the 2-component projection.
knn_pca_2d = KNeighborsClassifier().fit(X_train_pca_2d, y_train)
knn_pca_2d_pred = knn_pca_2d.predict(X_test_pca_2d)

In [None]:
# Score the KNN predictions on the held-out split.
# NOTE(review): for single-label multiclass targets, micro-averaged precision,
# recall and F1 all equal accuracy.
knn_pca_2d_accuracy = accuracy_score(y_test, knn_pca_2d_pred)
knn_pca_2d_precision = precision_score(y_test, knn_pca_2d_pred, average="micro")
knn_pca_2d_recall = recall_score(y_test, knn_pca_2d_pred, average="micro")
knn_pca_2d_f1 = f1_score(y_test, knn_pca_2d_pred, average="micro")

knn_pca_2d_report = classification_report(y_test, knn_pca_2d_pred)
print("KNN PCA 2D Classification Report:", knn_pca_2d_report, sep="\n")

KNN PCA 2D Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         7
  versicolor       0.73      0.92      0.81        12
   virginica       0.88      0.64      0.74        11

    accuracy                           0.83        30
   macro avg       0.87      0.85      0.85        30
weighted avg       0.85      0.83      0.83        30



In [None]:
# Actual-vs-predicted counts for KNN, drawn as a heatmap.
# NOTE(review): the tick labels are hard-coded and presumably match the class
# order used by confusion_matrix — confirm.
knn_pca_2d_cm = confusion_matrix(y_true=y_test, y_pred=knn_pca_2d_pred)
plot_confusion_matrix(cm=knn_pca_2d_cm, labels=['SETOSA', 'VERSICOLOR', 'VIRGINICA'])

**SVM**

In [None]:
# Support-vector classifier (default hyper-parameters) on the 2-component projection.
svm_pca_2d = SVC().fit(X_train_pca_2d, y_train)
svm_pca_2d_pred = svm_pca_2d.predict(X_test_pca_2d)

In [None]:
# Score the SVM predictions on the held-out split.
# NOTE(review): for single-label multiclass targets, micro-averaged precision,
# recall and F1 all equal accuracy.
svm_pca_2d_accuracy = accuracy_score(y_test, svm_pca_2d_pred)
svm_pca_2d_precision = precision_score(y_test, svm_pca_2d_pred, average="micro")
svm_pca_2d_recall = recall_score(y_test, svm_pca_2d_pred, average="micro")
svm_pca_2d_f1 = f1_score(y_test, svm_pca_2d_pred, average="micro")

svm_pca_2d_report = classification_report(y_test, svm_pca_2d_pred)
print("SVM PCA 2D Classification Report:", svm_pca_2d_report, sep="\n")

SVM PCA 2D Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         7
  versicolor       0.71      0.83      0.77        12
   virginica       0.78      0.64      0.70        11

    accuracy                           0.80        30
   macro avg       0.83      0.82      0.82        30
weighted avg       0.80      0.80      0.80        30



In [None]:
# Actual-vs-predicted counts for the SVM, drawn as a heatmap.
# NOTE(review): the tick labels are hard-coded and presumably match the class
# order used by confusion_matrix — confirm.
svm_pca_2d_cm = confusion_matrix(y_true=y_test, y_pred=svm_pca_2d_pred)
plot_confusion_matrix(cm=svm_pca_2d_cm, labels=['SETOSA', 'VERSICOLOR', 'VIRGINICA'])

**NAIVE BAYES**

In [None]:
# Gaussian naive Bayes on the 2-component projection.
nb_pca_2d = GaussianNB().fit(X_train_pca_2d, y_train)
nb_pca_2d_pred = nb_pca_2d.predict(X_test_pca_2d)

In [None]:
# Score the naive Bayes predictions on the held-out split.
# NOTE(review): for single-label multiclass targets, micro-averaged precision,
# recall and F1 all equal accuracy.
nb_pca_2d_accuracy = accuracy_score(y_test, nb_pca_2d_pred)
nb_pca_2d_precision = precision_score(y_test, nb_pca_2d_pred, average="micro")
nb_pca_2d_recall = recall_score(y_test, nb_pca_2d_pred, average="micro")
nb_pca_2d_f1 = f1_score(y_test, nb_pca_2d_pred, average="micro")

nb_pca_2d_report = classification_report(y_test, nb_pca_2d_pred)
print("Naive Bayes PCA 2D Classification Report:", nb_pca_2d_report, sep="\n")

Naive Bayes PCA 2D Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         7
  versicolor       0.67      0.83      0.74        12
   virginica       0.75      0.55      0.63        11

    accuracy                           0.77        30
   macro avg       0.81      0.79      0.79        30
weighted avg       0.78      0.77      0.76        30



In [None]:
# Actual-vs-predicted counts for naive Bayes, drawn as a heatmap.
# NOTE(review): the tick labels are hard-coded and presumably match the class
# order used by confusion_matrix — confirm.
nb_pca_2d_cm = confusion_matrix(y_true=y_test, y_pred=nb_pca_2d_pred)
plot_confusion_matrix(cm=nb_pca_2d_cm, labels=['SETOSA', 'VERSICOLOR', 'VIRGINICA'])

# PCA CON 3 COMPONENTES

In [None]:
# Project the standardized features onto the first three principal components.
pca_3d = PCA(n_components=3)
X_pca_3d = pca_3d.fit_transform(X_scaled)
pca_df_3d = pd.DataFrame(data=X_pca_3d, columns=['PC1', 'PC2', 'PC3'])
# Store species names as a categorical (as df['Species'] does) instead of the
# integer codes in y. A numeric colour column makes Plotly Express draw a
# continuous colour bar rather than one legend entry per class.
pca_df_3d['Species'] = pd.Categorical.from_codes(y, categories=iris.target_names)
pca_df_3d.head()

Unnamed: 0,PC1,PC2,PC3,Species
0,-2.264703,0.480027,-0.127706,0
1,-2.080961,-0.674134,-0.234609,0
2,-2.364229,-0.341908,0.044201,0
3,-2.299384,-0.597395,0.09129,0
4,-2.389842,0.646835,0.015738,0


In [None]:
# 3-D scatter of the three principal components, coloured by the 'Species' column.
fig_3d = px.scatter_3d(
    pca_df_3d,
    x='PC1',
    y='PC2',
    z='PC3',
    color='Species',
    title='PCA - 3 Components',
    template='plotly_white',
)
fig_3d.show()

In [None]:
# Fraction of the total variance captured by each retained component.
explained_variance_3d = pca_3d.explained_variance_ratio_
print(
    "Explained Variance Ratio (3D):",
    explained_variance_3d,
)

Explained Variance Ratio (3D): [0.72962445 0.22850762 0.03668922]


# *MODELOS DE CLASIFICACIÓN PCA = 3*

In [None]:
# Hold out 20% of the rows for evaluation; every column except the last
# ('Species') is a feature. The seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1],
    df['Species'],
    test_size=0.2,
    random_state=7,
)

In [None]:
# Standardize the data: learn the scaling parameters from the training split
# only, then apply the same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit a 3-component PCA on the scaled training data and project both splits
# with the components learned from the training data.
pca = PCA(n_components=3)
X_train_pca_3d = pca.fit_transform(X_train_scaled)
X_test_pca_3d = pca.transform(X_test_scaled)

**REGRESIÓN LOGÍSTICA**

In [None]:
# Logistic regression (default hyper-parameters) on the 3-component projection.
lr_pca_3d = LogisticRegression().fit(X_train_pca_3d, y_train)
lr_pca_3d_pred = lr_pca_3d.predict(X_test_pca_3d)

In [None]:
# Score the logistic-regression predictions on the held-out split.
# NOTE(review): for single-label multiclass targets, micro-averaged precision,
# recall and F1 all equal accuracy.
lr_pca_3d_accuracy = accuracy_score(y_test, lr_pca_3d_pred)
lr_pca_3d_precision = precision_score(y_test, lr_pca_3d_pred, average="micro")
lr_pca_3d_recall = recall_score(y_test, lr_pca_3d_pred, average="micro")
lr_pca_3d_f1 = f1_score(y_test, lr_pca_3d_pred, average="micro")

lr_pca_3d_report = classification_report(y_test, lr_pca_3d_pred)
print("Logistic Regression PCA 3D Classification Report:", lr_pca_3d_report, sep="\n")

Logistic Regression PCA 3D Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         7
  versicolor       0.83      0.83      0.83        12
   virginica       0.82      0.82      0.82        11

    accuracy                           0.87        30
   macro avg       0.88      0.88      0.88        30
weighted avg       0.87      0.87      0.87        30



In [None]:
# Actual-vs-predicted counts for logistic regression, drawn as a heatmap.
# NOTE(review): the tick labels are hard-coded and presumably match the class
# order used by confusion_matrix — confirm.
lr_pca_3d_cm = confusion_matrix(y_true=y_test, y_pred=lr_pca_3d_pred)
plot_confusion_matrix(cm=lr_pca_3d_cm, labels=['SETOSA', 'VERSICOLOR', 'VIRGINICA'])

**KNN**

In [None]:
# k-nearest neighbours (default hyper-parameters) on the 3-component projection.
knn_pca_3d = KNeighborsClassifier().fit(X_train_pca_3d, y_train)
knn_pca_3d_pred = knn_pca_3d.predict(X_test_pca_3d)

In [None]:
# Score the KNN predictions on the held-out split.
# NOTE(review): for single-label multiclass targets, micro-averaged precision,
# recall and F1 all equal accuracy.
knn_pca_3d_accuracy = accuracy_score(y_test, knn_pca_3d_pred)
knn_pca_3d_precision = precision_score(y_test, knn_pca_3d_pred, average="micro")
knn_pca_3d_recall = recall_score(y_test, knn_pca_3d_pred, average="micro")
knn_pca_3d_f1 = f1_score(y_test, knn_pca_3d_pred, average="micro")

knn_pca_3d_report = classification_report(y_test, knn_pca_3d_pred)
print("KNN PCA 3D Classification Report:", knn_pca_3d_report, sep="\n")

KNN PCA 3D Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         7
  versicolor       0.79      0.92      0.85        12
   virginica       0.89      0.73      0.80        11

    accuracy                           0.87        30
   macro avg       0.89      0.88      0.88        30
weighted avg       0.87      0.87      0.87        30



In [None]:
# Actual-vs-predicted counts for KNN, drawn as a heatmap.
# NOTE(review): the tick labels are hard-coded and presumably match the class
# order used by confusion_matrix — confirm.
knn_pca_3d_cm = confusion_matrix(y_true=y_test, y_pred=knn_pca_3d_pred)
plot_confusion_matrix(cm=knn_pca_3d_cm, labels=['SETOSA', 'VERSICOLOR', 'VIRGINICA'])

**SVM**

In [None]:
# Support-vector classifier (default hyper-parameters) on the 3-component projection.
svm_pca_3d = SVC().fit(X_train_pca_3d, y_train)
svm_pca_3d_pred = svm_pca_3d.predict(X_test_pca_3d)

In [None]:
# Score the SVM predictions on the held-out split.
# NOTE(review): for single-label multiclass targets, micro-averaged precision,
# recall and F1 all equal accuracy.
svm_pca_3d_accuracy = accuracy_score(y_test, svm_pca_3d_pred)
svm_pca_3d_precision = precision_score(y_test, svm_pca_3d_pred, average="micro")
svm_pca_3d_recall = recall_score(y_test, svm_pca_3d_pred, average="micro")
svm_pca_3d_f1 = f1_score(y_test, svm_pca_3d_pred, average="micro")

svm_pca_3d_report = classification_report(y_test, svm_pca_3d_pred)
print("SVM PCA 3D Classification Report:", svm_pca_3d_report, sep="\n")

SVM PCA 3D Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         7
  versicolor       0.83      0.83      0.83        12
   virginica       0.82      0.82      0.82        11

    accuracy                           0.87        30
   macro avg       0.88      0.88      0.88        30
weighted avg       0.87      0.87      0.87        30



In [None]:
# Actual-vs-predicted counts for the SVM, drawn as a heatmap.
# NOTE(review): the tick labels are hard-coded and presumably match the class
# order used by confusion_matrix — confirm.
svm_pca_3d_cm = confusion_matrix(y_true=y_test, y_pred=svm_pca_3d_pred)
plot_confusion_matrix(cm=svm_pca_3d_cm, labels=['SETOSA', 'VERSICOLOR', 'VIRGINICA'])

**NAIVE BAYES**

In [None]:
# Gaussian naive Bayes on the 3-component projection.
nb_pca_3d = GaussianNB().fit(X_train_pca_3d, y_train)
nb_pca_3d_pred = nb_pca_3d.predict(X_test_pca_3d)

In [None]:
# Score the naive Bayes predictions on the held-out split.
# NOTE(review): for single-label multiclass targets, micro-averaged precision,
# recall and F1 all equal accuracy.
nb_pca_3d_accuracy = accuracy_score(y_test, nb_pca_3d_pred)
nb_pca_3d_precision = precision_score(y_test, nb_pca_3d_pred, average="micro")
nb_pca_3d_recall = recall_score(y_test, nb_pca_3d_pred, average="micro")
nb_pca_3d_f1 = f1_score(y_test, nb_pca_3d_pred, average="micro")

nb_pca_3d_report = classification_report(y_test, nb_pca_3d_pred)
print("Naive Bayes PCA 3D Classification Report:", nb_pca_3d_report, sep="\n")

Naive Bayes PCA 3D Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         7
  versicolor       0.82      0.75      0.78        12
   virginica       0.75      0.82      0.78        11

    accuracy                           0.83        30
   macro avg       0.86      0.86      0.86        30
weighted avg       0.84      0.83      0.83        30



In [None]:
# Actual-vs-predicted counts for naive Bayes, drawn as a heatmap.
# NOTE(review): the tick labels are hard-coded and presumably match the class
# order used by confusion_matrix — confirm.
nb_pca_3d_cm = confusion_matrix(y_true=y_test, y_pred=nb_pca_3d_pred)
plot_confusion_matrix(cm=nb_pca_3d_cm, labels=['SETOSA', 'VERSICOLOR', 'VIRGINICA'])