
Demo to Raghu

---
MNIST
---

In [None]:
from tensorflow.keras.datasets import mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()

print(f"x_train shape: {x_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"x_test shape: {x_test.shape}")
print(f"y_test shape: {y_test.shape}")

x_train shape: (60000, 28, 28)
y_train shape: (60000,)
x_test shape: (10000, 28, 28)
y_test shape: (10000,)


In [None]:
from sklearn.preprocessing import StandardScaler

x_train_reshaped = x_train.reshape(x_train.shape[0], -1)
x_test_reshaped = x_test.reshape(x_test.shape[0], -1)

scaler = StandardScaler()

x_train_scaled = scaler.fit_transform(x_train_reshaped)
x_test_scaled = scaler.transform(x_test_reshaped)

print(f"x_train_scaled shape: {x_train_scaled.shape}")
print(f"x_test_scaled shape: {x_test_scaled.shape}")

x_train_scaled shape: (60000, 784)
x_test_scaled shape: (10000, 784)


PCA 5

In [3]:
from sklearn.decomposition import PCA

pca = PCA(n_components=5)
x_train_pca = pca.fit_transform(x_train_scaled)
x_test_pca = pca.transform(x_test_scaled)

print("x_train_pca shape:", x_train_pca.shape)
print("x_test_pca shape:", x_test_pca.shape)

x_train_pca shape: (60000, 5)
x_test_pca shape: (10000, 5)


In [None]:
from sklearn.linear_model import LogisticRegression

model_lr = LogisticRegression(
    penalty='l2',
    C=1.0,         
    solver='lbfgs',
    max_iter=1000, 
    random_state=42
)

In [5]:
model_lr.fit(x_train_pca, y_train)

from sklearn.metrics import accuracy_score, classification_report
y_pred = model_lr.predict(x_test_pca)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.6800
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.81      0.79       980
           1       0.88      0.95      0.91      1135
           2       0.66      0.60      0.63      1032
           3       0.62      0.74      0.67      1010
           4       0.65      0.63      0.64       982
           5       0.47      0.29      0.36       892
           6       0.80      0.83      0.81       958
           7       0.73      0.77      0.75      1028
           8       0.48      0.47      0.48       974
           9       0.59      0.63      0.61      1009

    accuracy                           0.68     10000
   macro avg       0.67      0.67      0.67     10000
weighted avg       0.67      0.68      0.67     10000



PCA 20

In [6]:
pca = PCA(n_components=20)
x_train_pca = pca.fit_transform(x_train_scaled)
x_test_pca = pca.transform(x_test_scaled)

print("x_train_pca shape:", x_train_pca.shape)
print("x_test_pca shape:", x_test_pca.shape)

x_train_pca shape: (60000, 20)
x_test_pca shape: (10000, 20)


In [7]:
model_lr.fit(x_train_pca, y_train)

from sklearn.metrics import accuracy_score, classification_report
y_pred = model_lr.predict(x_test_pca)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8721
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.94      0.92       980
           1       0.95      0.97      0.96      1135
           2       0.89      0.84      0.86      1032
           3       0.84      0.86      0.85      1010
           4       0.87      0.89      0.88       982
           5       0.81      0.79      0.80       892
           6       0.90      0.91      0.91       958
           7       0.89      0.87      0.88      1028
           8       0.82      0.80      0.81       974
           9       0.82      0.84      0.83      1009

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


model = DecisionTreeClassifier(random_state=42, max_depth=10)  

model.fit(x_train_pca, y_train)

y_pred = model.predict(x_test_pca)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8027
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.86      0.87       980
           1       0.96      0.94      0.95      1135
           2       0.80      0.78      0.79      1032
           3       0.78      0.78      0.78      1010
           4       0.73      0.80      0.77       982
           5       0.79      0.68      0.73       892
           6       0.90      0.87      0.89       958
           7       0.87      0.79      0.83      1028
           8       0.60      0.75      0.67       974
           9       0.74      0.74      0.74      1009

    accuracy                           0.80     10000
   macro avg       0.81      0.80      0.80     10000
weighted avg       0.81      0.80      0.80     10000



In [None]:
model = DecisionTreeClassifier(random_state=42, max_depth=30)  

model.fit(x_train_pca, y_train)

y_pred = model.predict(x_test_pca)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8474
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.92      0.91       980
           1       0.96      0.97      0.96      1135
           2       0.85      0.82      0.83      1032
           3       0.80      0.83      0.81      1010
           4       0.85      0.84      0.84       982
           5       0.80      0.79      0.80       892
           6       0.91      0.90      0.90       958
           7       0.84      0.83      0.84      1028
           8       0.77      0.77      0.77       974
           9       0.79      0.79      0.79      1009

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [None]:
model = DecisionTreeClassifier(random_state=42, max_depth= 40)   
model.fit(x_train_pca, y_train)

y_pred = model.predict(x_test_pca)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8477
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.92      0.91       980
           1       0.96      0.96      0.96      1135
           2       0.87      0.82      0.84      1032
           3       0.79      0.82      0.81      1010
           4       0.84      0.84      0.84       982
           5       0.81      0.80      0.81       892
           6       0.91      0.89      0.90       958
           7       0.84      0.83      0.84      1028
           8       0.76      0.77      0.77       974
           9       0.79      0.79      0.79      1009

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



---
Spam Data
---

In [None]:
from ucimlrepo import fetch_ucirepo 
  
spambase = fetch_ucirepo(id=94) 
  
X = spambase.data.features 
y = spambase.data.targets 
  
print(spambase.metadata) 
  
print(spambase.variables) 

{'uci_id': 94, 'name': 'Spambase', 'repository_url': 'https://archive.ics.uci.edu/dataset/94/spambase', 'data_url': 'https://archive.ics.uci.edu/static/public/94/data.csv', 'abstract': 'Classifying Email as Spam or Non-Spam', 'area': 'Computer Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 4601, 'num_features': 57, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['Class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1999, 'last_updated': 'Mon Aug 28 2023', 'dataset_doi': '10.24432/C53G6X', 'creators': ['Mark Hopkins', 'Erik Reeber', 'George Forman', 'Jaap Suermondt'], 'intro_paper': None, 'additional_info': {'summary': 'The "spam" concept is diverse: advertisements for products/web sites, make money fast schemes, chain letters, pornography...\n\nThe classification task for this dataset is to determine whether a given email is spam or not.\n\t\nOur collecti

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (3680, 57)
X_test shape: (921, 57)
y_train shape: (3680, 1)
y_test shape: (921, 1)


In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

x_train_scaled_spam = scaler.fit_transform(X_train)
x_test_scaled_spam = scaler.transform(X_test)

print(f"x_train_scaled shape: {x_train_scaled_spam.shape}")
print(f"x_test_scaled shape: {x_test_scaled_spam.shape}")

x_train_scaled shape: (3680, 57)
x_test_scaled shape: (921, 57)


In [17]:
from sklearn.decomposition import PCA

pca = PCA(n_components=5)
x_train_pca_spam = pca.fit_transform(x_train_scaled_spam)
x_test_pca_spam = pca.transform(x_test_scaled_spam)

print("x_train_pca shape:", x_train_pca_spam.shape)
print("x_test_pca shape:", x_test_pca_spam.shape)

x_train_pca shape: (3680, 5)
x_test_pca shape: (921, 5)


Logistic Regression(with PCA =5)

In [None]:
from sklearn.linear_model import LogisticRegression

model_lr = LogisticRegression(
    penalty='l2',
    C=1.0,          
    solver='lbfgs', 
    max_iter=1000,  
    random_state=42
)

In [19]:
model_lr.fit(x_train_pca_spam, y_train)

  y = column_or_1d(y, warn=True)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model_lr.predict(x_test_pca_spam)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8784
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.93      0.90       531
           1       0.90      0.80      0.85       390

    accuracy                           0.88       921
   macro avg       0.88      0.87      0.87       921
weighted avg       0.88      0.88      0.88       921



Decision Tree(with PCA = 5)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

model_dt_spam = DecisionTreeClassifier(random_state=42, max_depth=10)  
model_dt_spam.fit(x_train_pca_spam, y_train)

y_pred = model_dt_spam.predict(x_test_pca_spam)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8990228013029316
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.93      0.91       531
           1       0.90      0.86      0.88       390

    accuracy                           0.90       921
   macro avg       0.90      0.89      0.90       921
weighted avg       0.90      0.90      0.90       921



With 10 dimension

In [25]:
from sklearn.decomposition import PCA

pca = PCA(n_components=10)
x_train_pca_spam = pca.fit_transform(x_train_scaled_spam)
x_test_pca_spam = pca.transform(x_test_scaled_spam)

print("x_train_pca shape:", x_train_pca_spam.shape)
print("x_test_pca shape:", x_test_pca_spam.shape)

x_train_pca shape: (3680, 10)
x_test_pca shape: (921, 10)


In [26]:
model_lr.fit(x_train_pca_spam, y_train)

  y = column_or_1d(y, warn=True)


In [27]:
from sklearn.metrics import accuracy_score, classification_report
y_pred = model_lr.predict(x_test_pca_spam)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8979
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.95      0.91       531
           1       0.92      0.83      0.87       390

    accuracy                           0.90       921
   macro avg       0.90      0.89      0.89       921
weighted avg       0.90      0.90      0.90       921



In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score


model_dt_spam = DecisionTreeClassifier(random_state=42, max_depth=10)  

model_dt_spam.fit(x_train_pca_spam, y_train)

y_pred = model_dt_spam.predict(x_test_pca_spam)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9011943539630836
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.93      0.92       531
           1       0.90      0.86      0.88       390

    accuracy                           0.90       921
   macro avg       0.90      0.90      0.90       921
weighted avg       0.90      0.90      0.90       921



With PCA 8 dimension

In [65]:
from sklearn.decomposition import PCA

pca = PCA(n_components=8)
x_train_pca_spam = pca.fit_transform(x_train_scaled_spam)
x_test_pca_spam = pca.transform(x_test_scaled_spam)

print("x_train_pca shape:", x_train_pca_spam.shape)
print("x_test_pca shape:", x_test_pca_spam.shape)

x_train_pca shape: (3680, 8)
x_test_pca shape: (921, 8)


In [None]:
model_lr.fit(x_train_pca_spam, y_train)

from sklearn.metrics import accuracy_score, classification_report, f1_score

y_pred = model_lr.predict(x_test_pca_spam)
accuracy = accuracy_score(y_test, y_pred)

f1_score = f1_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(f"f1 score: {f1_score:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8871
f1 score: 0.8571
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91       531
           1       0.92      0.80      0.86       390

    accuracy                           0.89       921
   macro avg       0.89      0.88      0.88       921
weighted avg       0.89      0.89      0.89       921



  y = column_or_1d(y, warn=True)


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score

model_dt_spam = DecisionTreeClassifier(random_state=42, max_depth=10)  
model_dt_spam.fit(x_train_pca_spam, y_train)

y_pred = model_dt_spam.predict(x_test_pca_spam)
print("f1_score:", f1_score(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

f1_score: 0.8748353096179183
Accuracy: 0.8968512486427795
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.93      0.91       531
           1       0.90      0.85      0.87       390

    accuracy                           0.90       921
   macro avg       0.90      0.89      0.89       921
weighted avg       0.90      0.90      0.90       921



Dimension reduction to 8 seems most reasonable.