In [None]:
import os
import sys

# Name of the project's top-level folder; every other project path is built from it.
PROJECT_ROOT_NAME = "Fake_Users_Movies_Classifier"


def find_project_root(start_path, root_name=PROJECT_ROOT_NAME, sep=os.sep):
    """Return the leading part of ``start_path`` that ends at folder ``root_name``.

    Parameters
    ----------
    start_path : str
        Path of a folder located somewhere inside the project.
    root_name : str
        Name of the project's top-level folder to look for.
    sep : str
        Path separator used to split and re-join ``start_path``
        (defaults to the separator of the running operating system).

    Returns
    -------
    str
        ``start_path`` truncated right after the first component equal to
        ``root_name``. If no component matches, ``start_path`` is returned
        unchanged.
    """
    parts = start_path.split(sep)
    if root_name not in parts:
        return start_path
    return sep.join(parts[: parts.index(root_name) + 1])


# sys.path[0] can be an empty string (meaning "the current directory"),
# so resolve it to an absolute path before looking for the project folder.
current_dir = os.path.abspath(sys.path[0])

# Components of the current folder's path, split on the OS path separator
main_path = current_dir.split(os.sep)

# Assign path to parent folder
# path_to_parent allows access to any folder from within the parent folder, no matter
# where this file is located inside it, i.e. no need to chain "../" several times
path_to_parent = find_project_root(current_dir)

# Add path to feature generation folder (only once, so re-running this cell
# does not keep growing sys.path)
feature_generation_dir = os.path.join(path_to_parent, "feature_generation")
if feature_generation_dir not in sys.path:
    sys.path.append(feature_generation_dir)

In [None]:
# Week 1 feature generator
from feature_gen_wk1 import feature_gen

# Location of the labelled data file, appended to the parent folder path
labelled_data_suffix = "/data/labelled_data/first_batch_with_labels_likes.npz"
path_to_file = path_to_parent + labelled_data_suffix

# Instantiate the generator, then read the file and generate its features
generator = feature_gen()
df_final = generator.retrieveAndGenerate(path_to_file)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score


# Retrieve labels and assign to y
y = df_final['label']
# Remove labels and users from dataset and assign to X
X = df_final.drop(['user', 'label'], axis=1)

# Split the data into training and validation sets (fixed seed for reproducibility)
X_train, X_val, Y_train, Y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features: the scaler is fitted on the training split only
# and then applied unchanged to the validation split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Maximum degree of the generated polynomial terms
pol_ftrs = 4

# Initialize Logistic Regression and Polynomial Features
# random_state is set because the "saga" solver shuffles the data, so without it
# the fitted coefficients (and the scores below) can differ between runs
lr = LogisticRegression(solver="saga", max_iter=1000000, penalty='l2', tol=1e-5, random_state=42)
# interaction_only=True keeps only products of distinct features (no powers of a single feature)
pol_feat = PolynomialFeatures(pol_ftrs, interaction_only=True)

# Create polynomial features: fit on the training data only, then reuse the
# fitted transformer on the validation data
X2_train = pol_feat.fit_transform(X_train)
X2_val = pol_feat.transform(X_val)

# Train logistic regression using train set
lr.fit(X2_train, Y_train)
# Predict validation set labels (hard class decisions)
polreg_preds = lr.predict(X2_val)
# Predict validation set probabilities of the positive class
# (column 1 corresponds to lr.classes_[1], the greater of the two labels)
polreg_probs = lr.predict_proba(X2_val)[:, 1]

# ROC AUC is a ranking metric, so it is computed from the probabilities
# rather than from the thresholded predictions
polreg_auc = roc_auc_score(Y_val, polreg_probs)

# Calculate precision, recall and f1 scores from the hard predictions
precision = precision_score(Y_val, polreg_preds)
recall = recall_score(Y_val, polreg_preds)
f1 = f1_score(Y_val, polreg_preds)

# Printing the results
print("AUC:", polreg_auc)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)