In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load the dataset into a pandas DataFrame
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
data = pd.read_csv(url, header=None)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:,:-1], data.iloc[:,-1], test_size=0.3)

# Instantiate a DecisionTreeClassifier object with the desired hyperparameters
out = DecisionTreeClassifier(max_depth=5)

# Train the decision tree classifier using the training set
out.fit(X_train, y_train)

# Use the trained model to make predictions on the testing set
y_pred = out.predict(X_test)

# Evaluate the performance of the model using appropriate metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1-score:', f1_score(y_test, y_pred))


Accuracy: 0.9022447501810282
Precision: 0.9065040650406504
Recall: 0.8336448598130841
F1-score: 0.8685491723466406


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA

# Load the dataset into a pandas DataFrame
#data = pd.read_csv('spambase.csv', header=None)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
data = pd.read_csv(url, header=None)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:,:-1], data.iloc[:,-1], test_size=0.3)

# Instantiate a DecisionTreeClassifier object with the desired hyperparameters
out = DecisionTreeClassifier(max_depth=5)

#Apply PCA to reduce ...
pca=PCA(n_components=10)
X_train_pca=pca.fit_transform(X_train)
X_test_pca=pca.transform(X_test)

# Train the decision tree classifier using the training set
out.fit(X_train_pca, y_train)

# Use the trained model to make predictions on the testing set
y_pred = out.predict(X_test_pca)

# Evaluate the performance of the model using appropriate metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1-score:', f1_score(y_test, y_pred))


Accuracy: 0.833454018826937
Precision: 0.7671009771986971
Recall: 0.8440860215053764
F1-score: 0.8037542662116042


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA

# Load the dataset into a pandas DataFrame
#data = pd.read_csv('spambase.csv', header=None)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
data = pd.read_csv(url, header=None)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:,:-1], data.iloc[:,-1], test_size=0.3)

# Instantiate a DecisionTreeClassifier object with the desired hyperparameters
out = DecisionTreeClassifier(max_depth=5)

#Apply PCA to reduce ...
pca=PCA(n_components=57)
X_train_pca=pca.fit_transform(X_train)
X_test_pca=pca.transform(X_test)

# Train the decision tree classifier using the training set
out.fit(X_train_pca, y_train)

# Use the trained model to make predictions on the testing set
y_pred = out.predict(X_test_pca)

# Evaluate the performance of the model using appropriate metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1-score:', f1_score(y_test, y_pred))


Accuracy: 0.8703837798696596
Precision: 0.8299065420560747
Recall: 0.8345864661654135
F1-score: 0.8322399250234302


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA

# Load the dataset into a pandas DataFrame
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
data = pd.read_csv(url, header=None)

# Check for missing values
print(data.isnull().sum())

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:,:-1], data.iloc[:,-1], test_size=0.3)

# Check for categorical variables
print(X_train.dtypes)

# Instantiate a DecisionTreeClassifier object with the desired hyperparameters
out = DecisionTreeClassifier(max_depth=5)

# Apply PCA to reduce dimensionality
pca=PCA(n_components=57)
X_train_pca=pca.fit_transform(X_train)
X_test_pca=pca.transform(X_test)

# Train the decision tree classifier using the training set
out.fit(X_train_pca, y_train)

# Use the trained model to make predictions on the testing set
y_pred = out.predict(X_test_pca)

# Evaluate the performance of the model using appropriate metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1-score:', f1_score(y_test, y_pred))


0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
36    0
37    0
38    0
39    0
40    0
41    0
42    0
43    0
44    0
45    0
46    0
47    0
48    0
49    0
50    0
51    0
52    0
53    0
54    0
55    0
56    0
57    0
dtype: int64
0     float64
1     float64
2     float64
3     float64
4     float64
5     float64
6     float64
7     float64
8     float64
9     float64
10    float64
11    float64
12    float64
13    float64
14    float64
15    float64
16    float64
17    float64
18    float64
19    float64
20    float64
21    float64
22    float64
23    float64
24    float64
25    float64
26    float64
27    float64
28    float64
29    float64
30    float64
31    float64
32    float64
33    float64
34    float64
35    float64
36    float64
37   

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

# Load the dataset into a pandas DataFrame
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
data = pd.read_csv(url, header=None)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:,:-1], data.iloc[:,-1], test_size=0.3, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Instantiate a DecisionTreeClassifier object with the desired hyperparameters
out = DecisionTreeClassifier(max_depth=5, random_state=42)

# Train the decision tree classifier using the training set
out.fit(X_train, y_train)

# Use the trained model to make predictions on the testing set
y_pred = out.predict(X_test)

# Evaluate the performance of the model using appropriate metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1-score:', f1_score(y_test, y_pred))


Accuracy: 0.9000724112961622
Precision: 0.9180952380952381
Recall: 0.8353552859618717
F1-score: 0.8747731397459165
