In [42]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [43]:
# Load the ISBN-level book table from CSV and preview the first rows.
csv_path = '../CSV/sml_isbn_info.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,ISBN,Median_Reader_Age,Country_Count,Read_Count,Median_Book_Rating,Year_Of_Publication,Popular (>= 8),Unpopular (<= 3)
0,000104799X,59.0,1,2,7.5,1994.0,False,False
1,000160418X,61.0,1,1,7.0,1984.0,False,False
2,000215871X,42.0,1,1,7.0,1992.0,False,False
3,000221766X,60.0,1,1,8.0,1987.0,True,False
4,000222674X,51.0,1,1,9.0,1982.0,True,False


In [44]:
# Inspect the dtype of every column (numeric, boolean or text) before selecting features.
df.dtypes

ISBN                    object
Median_Reader_Age      float64
Country_Count            int64
Read_Count               int64
Median_Book_Rating     float64
Year_Of_Publication    float64
Popular (>= 8)            bool
Unpopular (<= 3)          bool
dtype: object

In [45]:
# Features: every column except the two label columns, the ISBN identifier
# and Median_Book_Rating.
# NOTE(review): 'Popular (>= 8)' looks derived from Median_Book_Rating (the
# preview rows with rating >= 8 are the ones flagged True), so keeping the
# rating as a feature would presumably leak the target - confirm.
non_feature_cols = ['Popular (>= 8)', 'Unpopular (<= 3)', 'ISBN', 'Median_Book_Rating']
X = df.drop(columns=non_feature_cols)

# Target: the boolean "popular" flag.
y = df['Popular (>= 8)']

In [46]:
# Check the class balance of the target before modelling.
y.value_counts()

True     67175
False    48305
Name: Popular (>= 8), dtype: int64

In [47]:
# Encode the boolean target as integer class labels; `le` keeps the fitted
# mapping so the original labels can be recovered later.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [48]:
# Preview the encoded target (now an integer array rather than a boolean Series).
y

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [49]:
from sklearn.model_selection import train_test_split

# Stratify on y so the train and test partitions keep the same class ratio;
# no test_size is given, so the library default split proportion is used.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=10, stratify=y
)

# Print the shape of each partition, one per line.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(86610, 4)
(28870, 4)
(86610,)
(28870,)


In [50]:
# Fit the scaler on the training split only, so no statistics from the test
# split influence the transform, then apply that same transform to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# X_train / X_test are overwritten with their scaled versions.
X_train, X_test = X_scaler.transform(X_train), X_scaler.transform(X_test)

In [None]:
# RANDOM FOREST

In [51]:
# Fit a plain RandomForestClassifier on the scaled training data (no
# resampling is applied in this cell) and predict the test split.
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=1).fit(X_train, y_train)
predictions = rf_model.predict(X_test)

In [52]:
# Calculate the balanced accuracy score of the random forest predictions
# on the test split.
balanced_accuracy_score(y_test, predictions)

0.536643574064666

In [53]:
# Display the confusion matrix for the random forest predictions
# (rows: true class, columns: predicted class).
confusion_matrix(y_test, predictions)

array([[ 3762,  8314],
       [ 4001, 12793]], dtype=int64)

In [54]:
# Print the imbalanced classification report for the random forest predictions.
# The report is returned as a formatted string, so print() is needed to render it.
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.48      0.31      0.76      0.38      0.49      0.23     12076
          1       0.61      0.76      0.31      0.68      0.49      0.25     16794

avg / total       0.56      0.57      0.50      0.55      0.49      0.24     28870



In [55]:
# List the features sorted in descending order by feature importance.
# Feature names are taken from X, the DataFrame the training split was drawn
# from, so they line up with the model's inputs by column order.
importances = rf_model.feature_importances_
sorted(zip(importances, X.columns), reverse=True)

[(0.4835229805723057, 'Median_Reader_Age'),
 (0.35821315060937914, 'Year_Of_Publication'),
 (0.10952504668308528, 'Read_Count'),
 (0.04873882213522986, 'Country_Count')]

In [None]:
# EASY ENSEMBLE (AdaBoost learners trained on balanced bootstrap samples)

In [56]:
# Fit the EasyEnsembleClassifier on the scaled training data, then predict the
# test split. `predictions` is reused, replacing the random forest predictions.
from imblearn.ensemble import EasyEnsembleClassifier

eec = EasyEnsembleClassifier(
    n_estimators=100,
    random_state=1,
)
eec.fit(X_train, y_train)
predictions = eec.predict(X_test)

In [57]:
# Calculate the balanced accuracy score of the EasyEnsembleClassifier
# predictions on the test split.
balanced_accuracy_score(y_test, predictions)

0.5596245364448407

In [58]:
# Display the confusion matrix for the EasyEnsembleClassifier predictions
# (rows: true class, columns: predicted class).
confusion_matrix(y_test, predictions)

array([[ 8781,  3295],
       [10209,  6585]], dtype=int64)

In [59]:
# Print the imbalanced classification report for the EasyEnsembleClassifier predictions.
# The report is returned as a formatted string, so print() is needed to render it.
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.46      0.73      0.39      0.57      0.53      0.29     12076
          1       0.67      0.39      0.73      0.49      0.53      0.28     16794

avg / total       0.58      0.53      0.59      0.52      0.53      0.28     28870



In [60]:
# LOGISTIC REGRESSION

In [61]:
from sklearn.linear_model import LogisticRegression
# Logistic regression trained on the same scaled features as the models above.
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# fit() is the cell's last expression, so the fitted estimator is displayed as the output.
classifier.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [62]:
from sklearn.metrics import accuracy_score

# Predict the test split with the logistic regression model.
# `predictions` is reused, replacing the earlier models' predictions.
predictions = classifier.predict(X_test)

# The random forest and easy-ensemble sections report balanced accuracy, so
# print it here as well to make the three models directly comparable.
print('Balanced accuracy:', balanced_accuracy_score(y_test, predictions))

# Plain accuracy stays the cell's displayed value. The test split is about 58%
# class 1 (16794 of 28870 samples), so an accuracy near 0.58 is roughly what
# always predicting class 1 would score.
accuracy_score(y_test, predictions)

0.5812954624177347

In [None]:
# LINEAR REGRESSION