# Environment

In [1]:
# HappyML install
import os

if not os.path.isdir("HappyML"):
  os.system("git clone https://github.com/cnchi/HappyML.git")

In [2]:
# Load data
datasetName="housing.csv"
datasetName = datasetName.replace(" ", "%20")
if not os.path.isfile(datasetName):
  os.system("wget https://raw.githubusercontent.com/StanOWO/1131_NTUAI_ML_Resource/main/dataset/" + datasetName)

# Preprocessing

In [3]:
import HappyML.preprocessor as pp

In [4]:
# Load Data
dataset = pp.dataset(file="housing.csv")

# 7 pieces of data is null in total_bedrooms
print(dataset.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
None


In [5]:
# I choose this strategy because not many pieces of data contain null
# Drop the rows of data which contains null
dataset = dataset.dropna(axis=0, how='any')
print(dataset.info())

<class 'pandas.core.frame.DataFrame'>
Index: 20433 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20433 non-null  float64
 1   latitude            20433 non-null  float64
 2   housing_median_age  20433 non-null  float64
 3   total_rooms         20433 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20433 non-null  float64
 6   households          20433 non-null  float64
 7   median_income       20433 non-null  float64
 8   median_house_value  20433 non-null  float64
 9   ocean_proximity     20433 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.7+ MB
None


In [6]:
# X, Y decomposition
X, Y = pp.decomposition(dataset, x_columns=[i for i in range(9)], y_columns=[9])

In [7]:
#X = pp.missing_data(X, strategy="mean")
#Y = pp.missing_data(Y, strategy="mean")

In [9]:
# Label Encoding
Y, Y_mapping = pp.label_encoder(Y, mapping=True)

print(Y)
print(Y_mapping)

       ocean_proximity
0                    3
1                    3
2                    3
3                    3
4                    3
...                ...
20635                1
20636                1
20637                1
20638                1
20639                1

[20433 rows x 1 columns]
{0: '<1H OCEAN', 1: 'INLAND', 2: 'ISLAND', 3: 'NEAR BAY', 4: 'NEAR OCEAN'}


In [10]:
# Note:Here are some dimensionality reduction algorithms(inessential)

# Feature Selection

# from sklearn.decomposition import PCA
# pca = PCA(n_components=None)
# pca.fit(X)
# info_covered = pca.explained_variance_ratio_
# cumulated_sum = np.cumsum(info_covered)
# plt.plot(cumulated_sum, color="blue")

# pca = PCA(n_components=2)
# X_columns = ["PCA_{}".format(i+1) for i in range(2)]
# X = pd.DataFrame(pca.fit_transform(X), index=X.index, columns=X_columns)
#from HappyML.preprocessor import PCASelector

#selector = PCASelector(best_k="auto")
#X = selector.fit(x_ary=X, verbose=True, plot=True).transform(X)

In [11]:
# Split Training / TEsting Set
X_train, X_test, Y_train, Y_test = pp.split_train_test(X, Y, train_size=0.8, random_state=0)

In [12]:
# Feature Scaling
X_train, X_test = pp.feature_scaling(fit_ary=X_train, transform_arys=(X_train, X_test))

# SVM

In [21]:
# # from sklearn.svm import SVC
# # import time

# # classifier = SVC(C=1.0, kernel="rbf", gamma="scale", random_state=int(time.time()))
# # classifier.fit(X_train, Y_train.values.ravel())
# # Y_pred = classifier.predict(X_test)

from HappyML.classification import SVM

classifier = SVM()
Y_pred = classifier.fit(X_train, Y_train).predict(X_test)

In [22]:
# from sklearn.model_selection import cross_val_score

# k_fold = 10
# accuracies = cross_val_score(estimator=classifier.classifier, X=X, y=Y.values.ravel(), scoring="accuracy", cv=k_fold, n_jobs=-1)
# print("{} Folds Mean Accuracy: {}".format(k_fold, accuracies.mean()))

# recalls = cross_val_score(estimator=classifier.classifier, X=X, y=Y.values.ravel(), scoring="recall", cv=k_fold, n_jobs=-1)
# print("{} Folds Mean Recall: {}".format(k_fold, recalls.mean()))

# precisions = cross_val_score(estimator=classifier.classifier, X=X, y=Y.values.ravel(), scoring="precision", cv=k_fold, n_jobs=-1)
# print("{} Folds Mean Precision: {}".format(k_fold, precisions.mean()))

# f_scores = cross_val_score(estimator=classifier.classifier, X=X, y=Y.values.ravel(), scoring="f1", cv=k_fold, n_jobs=-1)
# print("{} Folds Mean F1-Score: {}".format(k_fold, f_scores.mean()))
from HappyML.performance import KFoldClassificationPerformance

K = 10
kfp = KFoldClassificationPerformance(x_ary=X, y_ary=Y, classifier=classifier.classifier, k_fold=K, verbose=False)

print("----- SVM Classification -----")
print("{} Folds Mean Accuracy: {}".format(K, kfp.accuracy()))
print("{} Folds Mean Recall: {}".format(K, kfp.recall()))
print("{} Folds Mean Precision: {}".format(K, kfp.precision()))
print("{} Folds Mean F1-Score: {}".format(K, kfp.f_score()))

----- SVM Classification -----
10 Folds Mean Accuracy: 0.607398419307779
10 Folds Mean Recall: 0.34966466190592266
10 Folds Mean Precision: 0.2895209714285347
10 Folds Mean F1-Score: 0.3082967689583006


# Decision Tree

In [23]:
from HappyML.classification import DecisionTree

classifier = DecisionTree()
Y_pred = classifier.fit(X_train, Y_train).predict(X_test)

In [24]:
from HappyML.performance import KFoldClassificationPerformance

K = 10
kfp = KFoldClassificationPerformance(x_ary=X, y_ary=Y, classifier=classifier.classifier, k_fold=K)

print("----- Decision Tree Classification -----")
print("{} Folds Mean Accuracy: {}".format(K, kfp.accuracy()))
print("{} Folds Mean Recall: {}".format(K, kfp.recall()))
print("{} Folds Mean Precision: {}".format(K, kfp.precision()))
print("{} Folds Mean F1-Score: {}".format(K, kfp.f_score()))

----- Decision Tree Classification -----
10 Folds Mean Accuracy: 0.801289161692879
10 Folds Mean Recall: 0.7990757561816447
10 Folds Mean Precision: 0.8299658126117009
10 Folds Mean F1-Score: 0.7887126081218266


# Random Forest

In [25]:
# from sklearn.ensemble import RandomForestClassifier
# import time

# classifier = RandomForestClassifier(n_estimators=10, criterion="entropy", random_state=int(time.time()))
# classifier.fit(X_train, Y_train.values.ravel())
# Y_pred = classifier.predict(X_test)

# With HappyML's Class
from HappyML.classification import RandomForest

classifier = RandomForest(n_estimators=10, criterion="entropy")
Y_pred = classifier.fit(X_train, Y_train).predict(X_test)

In [26]:
from HappyML.performance import KFoldClassificationPerformance

K = 10
kfp = KFoldClassificationPerformance(x_ary=X, y_ary=Y, classifier=classifier.classifier, k_fold=K)

print("----- Random Forest Classification -----")
print("{} Folds Mean Accuracy: {}".format(K, kfp.accuracy()))
print("{} Folds Mean Recall: {}".format(K, kfp.recall()))
print("{} Folds Mean Precision: {}".format(K, kfp.precision()))
print("{} Folds Mean F1-Score: {}".format(K, kfp.f_score()))

----- Random Forest Classification -----
10 Folds Mean Accuracy: 0.7942433856048001
10 Folds Mean Recall: 0.727000823976647
10 Folds Mean Precision: 0.7584025389066549
10 Folds Mean F1-Score: 0.7208829300239856
