In [694]:
# define where the manually downloaded primes archive lives and how to extract it
import os
import zipfile

PRIME_LIMITS = 1000000
PRIMES_ZIP_FILE_NAME = "primes1.zip"
PRIMES_EXTRACT_FILE_NAME = "primes1.txt"
PRIMES_PATH = "datasets/primes"

def extract_primes_data(primes_path=PRIMES_PATH, primes_zip_file_name=PRIMES_ZIP_FILE_NAME, primes_extract_file_name=PRIMES_EXTRACT_FILE_NAME):
    """Extract the primes list from the zip archive into a one-number-per-line text file.

    The archive member ``primes_extract_file_name`` is read from
    ``primes_path/primes_zip_file_name``, its first four lines (header text) are
    dropped, every remaining whitespace-separated token is kept, and the tokens
    are written newline-separated to ``primes_path/primes_extract_file_name``
    (overwriting any existing file of that name).

    Args:
        primes_path: Directory that holds the zip archive and receives the text file.
        primes_zip_file_name: File name of the zip archive inside ``primes_path``.
        primes_extract_file_name: Name of the member inside the archive; also used
            as the name of the text file that is written.

    Raises:
        Exception: If ``primes_path`` is not a directory or the archive is missing.
        KeyError: If the archive does not contain ``primes_extract_file_name``.
    """
    if not os.path.isdir(primes_path):
        raise Exception('Create the path: ' + primes_path + ' and copy the ' + primes_zip_file_name + ' within the path')
    zip_path = os.path.join(primes_path, primes_zip_file_name)
    if not os.path.exists(zip_path):
        raise Exception('Download primes zip from URL=https://primes.utm.edu/lists/small/millions/')

    # Read the member first (and close the archive) so that a missing member
    # cannot leave behind an empty or truncated output file.
    with zipfile.ZipFile(zip_path) as primes_zip:
        raw = primes_zip.read(primes_extract_file_name)

    # remove first lines due to header text type
    header_line_count = 4
    body_lines = raw.splitlines()[header_line_count:]

    # Blank lines yield no tokens, so they drop out of the flattened list.
    numbers = [num for line in body_lines for num in line.decode("utf-8").split()]

    txt_path = os.path.join(primes_path, primes_extract_file_name)
    with open(txt_path, 'w') as f:
        f.write("\n".join(numbers))

In [695]:
# extract the primes text file from the zip archive using the default paths
# (nothing is downloaded here: the archive must already be in place, otherwise this raises)
extract_primes_data()

In [696]:
# define method for loading extracted data
import pandas as pd

def load_primes_data(primes_path=PRIMES_PATH, primes_extract_file_name=PRIMES_EXTRACT_FILE_NAME, limit=None):
    """Load the extracted primes text file into a pandas DataFrame.

    Args:
        primes_path: Directory containing the extracted text file.
        primes_extract_file_name: Name of the text file inside ``primes_path``.
        limit: Maximum number of rows to read (passed as ``nrows``); ``None`` reads all rows.

    Returns:
        A DataFrame without a header row, split on runs of whitespace.
    """
    csv_path = os.path.join(primes_path, primes_extract_file_name)
    # Raw string so that `\s` reaches the regex engine instead of being an invalid
    # Python escape sequence. The alternation is kept as-is: collapsing it to
    # `\s+` would change how mixed whitespace runs are split.
    return pd.read_csv(csv_path, sep=r'\t+|\r+|\n+|\s+', nrows=limit, header=None, engine='python')

In [None]:
# load data from prepared file
# PRIME_LIMITS caps the number of rows read; the frame is flattened into a 1-D array
primes_data = load_primes_data(limit=PRIME_LIMITS).values.flatten()
# preview the first ten values
primes_data[:10]

array([ 2,  3,  5,  7, 11, 13, 17, 19, 23, 29])

In [None]:
import numpy as np

# calculate non primes data
# Candidates are the integers 0 .. len(primes_data) - 1; every candidate that does
# not occur in primes_data is kept, in ascending order.
# NOTE(review): the candidate range is bounded by the *count* of primes, not by the
# largest prime - confirm this is the intended value range for the negative class.
candidates = np.arange(len(primes_data))

# Vectorised membership test; `x not in primes_data` scans the whole array for
# every candidate and is quadratic overall.
non_primes_data = candidates[~np.isin(candidates, primes_data)]
non_primes = non_primes_data.tolist()

In [None]:
# show the first ten and the last ten non-primes
print(non_primes_data[:10])
print(non_primes_data[-10:])

In [None]:
# define labels for primes: one True per prime
y_prime = np.ones(primes_data.shape, dtype=bool)
y_prime[:10]

In [None]:
# define labels for non-primes: one False per non-prime
y_non_prime = np.zeros(non_primes_data.shape, dtype=bool)
y_non_prime[:10]

In [None]:
# Primes come first, then non-primes; the labels are joined in the same order
# so that position i of all_labels describes position i of all_numbers.
all_numbers = np.concatenate([primes_data, non_primes_data])
all_labels = np.concatenate([y_prime, y_non_prime])

In [None]:
# compose dataset: one row per number, with its prime / non-prime label
data = dict(
    values=pd.Series(all_numbers),
    labels=pd.Series(all_labels),
)
data_frame = pd.DataFrame(data)

In [None]:
# preview the first rows of the composed dataset
data_frame.head()

In [None]:
# plot a histogram
%matplotlib inline
import matplotlib.pyplot as plt

# draw pandas' default histograms for the frame and render the figure
data_frame.hist()
plt.show()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 80/20 split on the labels, seeded for reproducibility.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(data_frame, data_frame["labels"]))

# split() yields positional indices, so select rows by position (.iloc), not by
# index label (.loc); the two only coincide while the index is the default RangeIndex.
train_set = data_frame.iloc[train_index]
test_set = data_frame.iloc[test_index]

In [None]:
# check the stratified split: the label proportions of the test set should
# match those of the full dataset
print("original data proportions\n", data_frame["labels"].value_counts() / len(data_frame))
print("stratified test proportions\n", test_set["labels"].value_counts() / len(test_set))

In [None]:
# absolute number of prime / non-prime rows in the training set
train_set["labels"].value_counts()

In [None]:
# absolute number of prime / non-prime rows in the test set
test_set["labels"].value_counts()

In [None]:
# The single feature column is reshaped to 2-D (n_samples, 1); the labels stay 1-D.
X_train = train_set["values"].to_numpy().reshape(-1, 1)
y_train = train_set["labels"]

# Sanity output: shapes, the feature matrix itself, and the row counts.
print(X_train.shape)
print(y_train.shape)
print(X_train)
print("X_train count:", X_train.shape[0])
print("y_train count:", y_train.shape[0])

# Train Binary Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

# linear classifier trained with SGD; fixed random_state for repeatable runs
sgd_clf = SGDClassifier(random_state=42)
# fit on the single-feature training matrix and its boolean labels
sgd_clf.fit(X_train, y_train)

In [None]:
# evaluate the SGD classifier defined above with cross_val_score
from sklearn.model_selection import cross_val_score
# 3-fold cross-validation; one accuracy score per fold is returned
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

# Measuring Classifier Accuracy

In [None]:
# a good alternative to looking at the scores: retrieve the cross-validation
# predictions and build a confusion matrix from them
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

# 3-fold cross-validated predictions for the training rows
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3)

# compare true labels against the cross-validated predictions
confusion_matrix(y_train, y_train_pred)

In [None]:
# to visualize the precision/recall trade-off, request decision_function output
# from cross_val_predict so that scores are returned instead of class predictions
y_scores = cross_val_predict(sgd_clf, X_train, y_train, cv=3, method="decision_function")

# calculate values for ROC curve
from sklearn.metrics import roc_curve

# false-positive rates, true-positive rates and the thresholds they were computed at
fpr, tpr, thresholds = roc_curve(y_train, y_scores)

# Retrain with RandomForest Classifier

In [None]:
# we are now trying to improve the Precision/Recall using a RandomForest classifier
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(random_state=42)
# some classifiers do not provide a decision_function method but a predict_proba method instead
# -> ask cross_val_predict for that output via method="predict_proba"
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train, cv=3, method="predict_proba")

# to plot a ROC curve we need to turn the probabilities into scores
y_scores_forest = y_probas_forest[:, 1] # score = proba of positive class
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train, y_scores_forest)


In [None]:
# 3-fold cross-validated class predictions of the random forest
# NOTE: this reassigns y_train_pred, replacing the SGD predictions from the earlier cell
y_train_pred = cross_val_predict(forest_clf, X_train, y_train, cv=3)

# confusion matrix of the random forest on the training set
confusion_matrix(y_train, y_train_pred)

In [None]:
# plot the ROC curve of both classifiers
plt.plot(fpr, tpr, "b:", label="SGD")
plt.plot(fpr_forest, tpr_forest, label="Random Forest")
# dashed diagonal from (0, 0) to (1, 1) as a visual reference line
plt.plot([0, 1], [0, 1], 'k--')
plt.axis([0, 1, 0, 1])
# label the axes so the figure can be read on its own
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.show()

In [None]:
# calculate the precisions, recalls and thresholds to plot the tradeoff
from sklearn.metrics import precision_recall_curve

# NOTE(review): `thresholds` is assigned by both calls below and was already set by
# the earlier roc_curve cell; only the random-forest values survive. Nothing in this
# cell reads it afterwards - consider distinct names if the thresholds are needed.
precisions, recalls, thresholds = precision_recall_curve(y_train, y_scores)
forest_precisions, forest_recalls, thresholds = precision_recall_curve(y_train, y_scores_forest)

# plot precision vs recall
plt.plot(recalls, precisions, "b:", label="SGD")
plt.plot(forest_recalls, forest_precisions, "g-", label="Random Forest")
plt.ylabel("Precision")
plt.xlabel("Recall")
# both axes are restricted to [0, 1]
plt.ylim([0, 1])
plt.xlim([0, 1])
plt.legend(loc="upper right")
plt.show()

# Analyse Data

In [None]:
# show the dataset ordered by number, smallest first (data_frame itself is not modified)
data_frame.sort_values(by="values", ascending=True)

In [None]:
# scatter plot of the dataset with marker size labels / values
# NOTE(review): x and y are both "values", so every point lies on the diagonal.
# NOTE(review): the size is 0 for every non-prime row (label False), at most 0.5 for
# primes, and 0/0 for the row with value 0 - confirm what this plot is meant to show.
data_frame.plot(kind="scatter", x="values", y="values", alpha=0.8, s=data_frame["labels"]/data_frame["values"], label="prime")

In [None]:
from sklearn.neural_network import MLPClassifier

# multi-layer perceptron with four hidden layers (100, 100, 50, 20 units),
# the lbfgs solver and a fixed random_state
nn_clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100, 100, 50, 20), random_state=42)
# fit on the same single-feature training data as the other classifiers
nn_clf.fit(X_train, y_train)

In [None]:
# 3-fold cross-validated class predictions of the neural network
# NOTE: this reassigns y_train_pred, replacing the random-forest predictions
y_train_pred = cross_val_predict(nn_clf, X_train, y_train, cv=3)

# confusion matrix of the neural network on the training set
confusion_matrix(y_train, y_train_pred)