In [None]:
import os
import sys
import warnings
from pathlib import Path

# Name of the project's top-level folder; everything below is located relative to it.
PROJECT_DIR_NAME = "Fake_Users_Movies_Classifier"

# sys.path[0] is treated as the folder this notebook runs from.
# abspath() makes it absolute (an empty entry becomes the current working
# directory) without resolving symlinks, so folder names are matched as written.
notebook_dir = Path(os.path.abspath(sys.path[0]))
dir_parts = notebook_dir.parts

# Assign path to parent folder.
# path_to_parent allows access to any folder inside the project, no matter how
# deep this file sits within it (i.e. no need to chain "../" x times).
# Path.parts splits on the platform's own separator, so this works on
# Windows and POSIX alike (the previous hard-coded "\\" split was Windows-only).
if PROJECT_DIR_NAME in dir_parts:
    # Keep every component up to and including the FIRST occurrence of the
    # project folder, counting from the filesystem root.
    root_depth = dir_parts.index(PROJECT_DIR_NAME) + 1
    path_to_parent = str(Path(*dir_parts[:root_depth]))
else:
    # Same fallback value as before (the current folder), but no longer silent:
    # the imports and data paths below will most likely not resolve from here.
    warnings.warn(
        f"Folder '{PROJECT_DIR_NAME}' not found in '{notebook_dir}'; "
        "using the current folder as the project root."
    )
    path_to_parent = str(notebook_dir)

# Add path to feature generation folder so its modules can be imported.
# The membership check keeps sys.path free of duplicates when the cell is re-run.
feature_generation_dir = os.path.join(path_to_parent, "feature_generation")
if feature_generation_dir not in sys.path:
    sys.path.append(feature_generation_dir)

In [None]:
# Week 1 feature generator (presumably resolved through the feature_generation
# folder that was appended to sys.path earlier in the notebook)
from feature_gen_wk1 import feature_gen

# Full path to the labelled data file, built from the project's parent folder
path_to_file = f"{path_to_parent}/labelled_data/first_batch_with_labels_likes.npz"

# Instantiate the generator, then retrieve the file and generate features from it
generator_wk1 = feature_gen()
df_final = generator_wk1.retrieveAndGenerate(path_to_file)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Target vector: the 'label' column of the generated features
y = df_final['label']
# Feature matrix: every column except the 'user' identifier and the 'label' itself
X = df_final.drop(columns=['user', 'label'])

# Hold out 20% of the rows for validation. stratify=y keeps the class
# proportions equal in both splits; random_state makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the features. The scaler is fitted on the training split only and then
# re-used on the validation split, so no validation statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Initialize and train an L2-regularised logistic regression model.
# class_weight gives class 0 a slightly higher weight (1.1) than class 1 (1).
logreg = LogisticRegression(
    max_iter=1000,
    random_state=42,
    solver='newton-cg',
    penalty='l2',
    C=10,
    class_weight={1: 1, 0: 1.1},
)
logreg.fit(X_train_scaled, y_train)

# Probability of the second class in logreg.classes_ (class 1 for 0/1 labels),
# used for the threshold-independent AUC score
logreg_probs = logreg.predict_proba(X_val_scaled)[:, 1]
logreg_auc = roc_auc_score(y_val, logreg_probs)

# Hard class predictions, computed once and shared by all threshold-based metrics
# (previously the model was re-run on the validation set for every metric)
logreg_preds = logreg.predict(X_val_scaled)

# Calculate precision, recall and f1 scores
precision = precision_score(y_val, logreg_preds)
recall = recall_score(y_val, logreg_preds)
f1 = f1_score(y_val, logreg_preds)

# Printing the results
print("AUC:", logreg_auc)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)