In [None]:
import os
import sys

import pandas as pd
from sklearn.metrics import roc_auc_score

# Name of the project's root folder; everything is resolved relative to it
PROJECT_ROOT_NAME = "Fake_Users_Movies_Classifier"

# Split the path of the current folder into its components.
# normpath() rewrites every separator to the platform's own (os.sep), so the split works
# on Windows and POSIX alike. sys.path[0] can be an empty string, in which case the
# current working directory is used instead.
main_path = os.path.normpath(sys.path[0] or os.getcwd()).split(os.sep)

# Assign path to parent folder
# path_to_parent allows access to any folder from within parent folder, no matter the location of this file within the parent folder
# i.e.: Don't need to specify "../" x amount of times
if PROJECT_ROOT_NAME in main_path:
    # Keep every component up to and including the first occurrence of the project root
    parent_parts = main_path[: main_path.index(PROJECT_ROOT_NAME) + 1]
else:
    # Project root not found in the path: keep the whole current path
    parent_parts = main_path

path_to_parent = os.sep.join(parent_parts)

# Add path to feature generation folder
sys.path.append(os.path.join(path_to_parent, "feature_generation"))

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier, Lars
from sklearn.linear_model import LinearRegression, Ridge, OrthogonalMatchingPursuit, LassoLarsCV, LassoLarsIC
from sklearn.linear_model import ARDRegression, BayesianRidge, PoissonRegressor, TweedieRegressor, GammaRegressor
from sklearn.linear_model import HuberRegressor, TheilSenRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Each estimator is declared next to its display label so the two can never drift apart.
# Passing the (label, estimator) pairs to zip() transposes them into two parallel
# sequences, which are then materialised as the lists `names` and `classifiers`.
names, classifiers = map(list, zip(
    ("Linear SVM", SVC(kernel="linear", C=0.025, random_state=42)),
    ("RBF SVM", SVC(gamma=2, C=1, random_state=42)),
    ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42)),
    ("Neural Net", MLPClassifier(alpha=1, max_iter=1000, random_state=42)),
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
    ("Naive Bayes", GaussianNB()),
    ("Logistic Regression", LogisticRegression(max_iter=10000, random_state=42)),
    ("Ridge Classifier", RidgeClassifier(max_iter=10000, random_state=42)),
    ("SGD Classifier", SGDClassifier(max_iter=10000, random_state=42)),
    ("Lars", Lars(random_state=42)),
    ("Linear Regression", LinearRegression()),
    ("Ridge", Ridge(random_state=42)),
    ("Orthogonal Matching Pursuit", OrthogonalMatchingPursuit()),
    ("LassoLars CV", LassoLarsCV()),
    ("LassoLars IC", LassoLarsIC()),
    ("ARD Regression", ARDRegression()),
    ("Bayesian Ridge", BayesianRidge()),
    ("Huber Regressor", HuberRegressor()),
    ("Theil Sen Regressor", TheilSenRegressor()),
    ("Poisson Regressor", PoissonRegressor()),
    ("Tweedie Regressor", TweedieRegressor()),
))

In [None]:
# Import feature generator for week 1
from feature_gen_wk1 import feature_gen

# Create string path to labelled data
# os.path.join uses the platform's separator instead of appending a "/"-separated
# literal to a prefix that was assembled with a different separator
path_to_file = os.path.join(
    path_to_parent, "data", "labelled_data", "first_batch_with_labels_likes.npz"
)
# Generate features from file
df_final = feature_gen().retrieveAndGenerate(path_to_file)

In [None]:
from assessors import assessFeatures

# One entry per tested feature count; each entry holds every model's AUC and the best AUC
hist = []
# Estimator handed to the RFE feature selector
classifier = LogisticRegression()

# Retrieve labels and assign to y
y = df_final['label']
# Remove labels and users from dataset and assign to X
X = df_final.drop(['user', 'label'], axis=1)

# Splitting the data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize feature selection class
assessor = assessFeatures(X_train, y_train)

# Loop through feature counts. range() excludes its stop value, so the counts
# tested run from 1 up to (column count - 3) inclusive.
for i in range(1, len(X_train.columns) - 2):
    # Create a dictionary with classifier scores and best score achieved
    hist_inner = {i: [], "best_score": 0}

    # Apply RFE feature selection algorithm to train set and return modified train set
    # Feature amount output is denoted by the value i
    # Additional Information:
    #       This algorithm can be changed to test other ones such as:
    #               -boruta
    #               -chi2
    #               -ANOVA
    best_features = assessor.select_by_RFE(classifier, i)

    # Columns of the modified train set that also exist in the original train set.
    # This must be an ordered list, not a set: pandas rejects a set as an indexer,
    # and the fitted pipeline expects the test columns in the same order as at fit time.
    columns = [col for col in best_features.columns if col in X_train.columns]

    # Assign train and test sets with the same features, in the same order
    X_train_inner = best_features[columns]
    X_test_inner = X_test[columns]

    # Iterate over classifiers and calculate scores
    for name, clf in zip(names, classifiers):

        # Create a pipeline to scale input data before training on classifier
        clf = make_pipeline(StandardScaler(), clf)
        # Train classifier on train data
        clf.fit(X_train_inner, y_train)

        # Predict test labels and calculate AUC score
        # NOTE(review): AUC is computed from predict() output, i.e. hard labels for the
        # classifiers and continuous values for the regressors - confirm this is intended
        y_pred = clf.predict(X_test_inner)
        score = roc_auc_score(y_test, y_pred)

        # Append AUC score to list inside hist_inner dictionary
        hist_inner[i].append({'clf': name, 'auc': score})

        # Check if score is highest in current set of classifiers
        if score > hist_inner['best_score']:
            # If yes, set it as the highest
            hist_inner['best_score'] = score

    # Append performance of classifiers for this iteration to list
    hist.append(hist_inner)


In [None]:
# Turn the per-iteration records into a table: one row per tested feature count
results_table = pd.DataFrame(hist)
# Rank the rows so the feature count with the highest best_score comes first
results_table = results_table.sort_values("best_score", ascending=False)
# Display the ranked table as the cell output
results_table