# In [None]:
import pandas as pd
from sklearn.utils import resample
import numpy as np
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA

# Load the behavioural dataset; the path is relative to the working directory.
df = pd.read_csv('Behavioral_data_final.csv')


#### Downsampling
# Balance the two 'Privacy Exists' classes by randomly dropping rows from the
# larger class until both classes have the same number of rows.
is_class0 = df['Privacy Exists'] == 0
is_class1 = df['Privacy Exists'] == 1

# Choose the majority class by size rather than assuming it is class 0:
# sampling without replacement fails when n_samples exceeds the population.
if is_class0.sum() >= is_class1.sum():
    df_majority, df_minority = df[is_class0], df[is_class1]
else:
    df_majority, df_minority = df[is_class1], df[is_class0]

df_majority_downsampled = resample(
    df_majority,
    replace=False,                  # sample without replacement
    n_samples=len(df_minority),     # to match minority class
    random_state=123,               # reproducible results
)
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Per-class row counts after balancing.
class0_qty = len(df_downsampled[df_downsampled['Privacy Exists'] == 0])
class1_qty = len(df_downsampled[df_downsampled['Privacy Exists'] == 1])

# Features are every column except the first and the last; the target is the
# last column, kept as a column vector of shape (n_samples, 1).
# NOTE(review): this assumes the last column is 'Privacy Exists' and that the
# first column is not a feature (e.g. an identifier) -- confirm against the CSV.
X = df_downsampled.iloc[:, 1:-1].values
Y = df_downsampled.iloc[:, -1:].values

# Names of the feature columns, in the same order as the columns of X.
column_names = df.columns[1:-1].tolist()

print(X.shape)
print(Y.shape)


from sklearn.preprocessing import StandardScaler

# Standardise every feature column (StandardScaler defaults: zero mean, unit variance).
# NOTE(review): the scaler is fitted on all rows, before the cross-validation
# splits made later in the script, so held-out folds influence the scaling
# statistics -- consider fitting the scaler inside each fold.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Shuffle the rows of the input and output arrays with one shared permutation.
# The generator is seeded so that repeated runs see the same row order, in line
# with the fixed random_state values used for resampling and for the CV splitter.
shuffle_idx = np.random.default_rng(123).permutation(X.shape[0])
X_new = X[shuffle_idx]
Y_new = Y[shuffle_idx]





# Candidate classifiers, keyed by the display name written to the result tables.
# The two tree ensembles draw random numbers while fitting, so they are seeded;
# otherwise the reported accuracies change from run to run even though the
# cross-validation splitter below is seeded.
models = {
    'Logistic Regression': LogisticRegression(C=10, max_iter=100),
    'Random Forest': RandomForestClassifier(
        max_depth=5, n_estimators=200, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(
        learning_rate=0.01, max_depth=3, n_estimators=100, random_state=42),
    'SVM': SVC(C=0.1, kernel='rbf'),
}

# Define the k-fold cross-validation splitter (5 stratified, shuffled folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# PCA

# Result table for the PCA experiment; one row is added per
# (classifier, number of components) combination.
df_results = pd.DataFrame(columns=[
    'id',
    'Class 0 quantity',
    'Class 1 quantity',
    'classifier',
    'cross validation',
    'method',
    'number of features',
    'variance captured',
    'train accuracy',
    'test accuracy'
])

# Running counter used to build the zero-padded row id
j = 0


# PCA projections keyed by the number of components, computed once here so
# that every classifier can reuse them.
pca_results = {}

# NOTE(review): each PCA is fitted on all rows, i.e. before the data is split
# into cross-validation folds, so held-out rows influence the projection.
for number_of_best_feature in range(5, X_new.shape[1] + 1, 5):
    # Every value produced by range() is distinct, so no "already cached"
    # check is needed. random_state only matters when scikit-learn selects a
    # randomized SVD solver; it keeps the projection reproducible in that case.
    pca = PCA(n_components=number_of_best_feature, random_state=42)
    X_pca = pca.fit_transform(X_new)
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    # Store the projected data and the cumulative explained-variance curve
    pca_results[number_of_best_feature] = {
        'X_pca': X_pca,
        'cumulative_variance': cumulative_variance
    }

# Loop over the models and feature numbers, cross-validating each classifier
# on each cached PCA projection.

# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0, and growing a frame row by
# row copies it on every iteration.
pca_result_rows = []

# Estimators expect a 1-D label vector; Y_new is a column vector.
y_flat = np.ravel(Y_new)

for name, model in models.items():
    for number_of_best_feature in range(5, X_new.shape[1] + 1, 5):
        # Retrieve X_pca and cumulative_variance from the dictionary
        pca_data = pca_results[number_of_best_feature]
        X_pca = pca_data['X_pca']
        cumulative_variance = pca_data['cumulative_variance']

        train_accuracies = []
        test_accuracies = []

        for train_idx, test_idx in skf.split(X_pca, y_flat):
            # Split the data into training and testing sets
            X_train, X_test = X_pca[train_idx], X_pca[test_idx]
            y_train, y_test = y_flat[train_idx], y_flat[test_idx]

            # Train the model (fit() refits from scratch on every fold)
            model.fit(X_train, y_train)

            # Predict once on the training and testing sets
            y_train_pred = model.predict(X_train)
            y_test_pred = model.predict(X_test)

            # accuracy_score takes (y_true, y_pred)
            train_accuracies.append(accuracy_score(y_train, y_train_pred))
            test_accuracies.append(accuracy_score(y_test, y_test_pred))

        # Average the per-fold accuracies
        train_accuracy_mean = np.mean(train_accuracies)
        test_accuracy_mean = np.mean(test_accuracies)

        # Record the results for this (classifier, n_components) pair
        pca_result_rows.append({
            'id': f'{j + 1:02d}',
            'Class 0 quantity': class0_qty,
            'Class 1 quantity': class1_qty,
            'classifier': name,
            'cross validation': '5-Fold',
            'method': 'PCA',
            'number of features': number_of_best_feature,
            # Last entry of the cumulative curve = total variance retained
            'variance captured': cumulative_variance[-1],
            'train accuracy': train_accuracy_mean,
            'test accuracy': test_accuracy_mean
        })

        j += 1

        print('For model:', name)
        print('Number of feature:', number_of_best_feature)
        print('Train accuracy:', train_accuracy_mean)
        print('Test accuracy:', test_accuracy_mean)

# Build the result table, keeping the column order declared for df_results
df_results = pd.DataFrame(pca_result_rows, columns=df_results.columns)

# Specify the file path for saving the HTML file
# NOTE(review): this is an absolute, machine-specific path; the write below
# fails if the directory does not exist on the machine running the script.
output_file_path = r'C:\Users\ASAF\PycharmProjects\pythonProject1\results_PCA_sklearn.html'

# Convert DataFrame to HTML table
html_table = df_results.to_html(index=False)

# Write HTML table to a file. The encoding is given explicitly so the output
# does not depend on the platform's locale default.
with open(output_file_path, 'w', encoding='utf-8') as file:
    file.write(html_table)

# K-best

# Result table for the SelectKBest experiment; one row is added per
# (classifier, number of selected features) combination.
df_results = pd.DataFrame(columns=[
    'id',
    'Class 0 quantity',
    'Class 1 quantity',
    'classifier',
    'cross validation',
    'method',
    'number of features',
    'features_name',
    'feature score',
    'train accuracy',
    'test accuracy'
])

# Running counter used to build the zero-padded row id
j = 0


def _seeded_mutual_info(X, y):
    """Score features with mutual_info_classif using a fixed random_state.

    The estimator is randomised, so without a seed the scores -- and therefore
    the selected features -- can differ between runs and between values of k.
    """
    return mutual_info_classif(X, y, random_state=42)


# Selected feature indices, names and scores, keyed by the number of features
feature_selection_results = {}

# NOTE(review): the selector is fitted on all rows and their labels, i.e.
# before the data is split into cross-validation folds, so held-out labels
# influence which features are chosen.
for number_of_best_feature in range(5, X_new.shape[1] + 1, 5):
    # Every value produced by range() is distinct, so no "already cached"
    # check is needed.
    selector = SelectKBest(_seeded_mutual_info, k=number_of_best_feature)

    # Fit the selector on the full (scaled, shuffled) data set; the labels
    # are flattened because a 1-D target is expected.
    selector.fit(X_new, np.ravel(Y_new))
    feature_indices = selector.get_support(indices=True)
    feature_names = [column_names[i] for i in feature_indices]
    feature_scores = selector.scores_
    # Keep only the scores of the selected features
    feature_score = feature_scores[feature_indices]

    # Store the feature selection results in the dictionary
    feature_selection_results[number_of_best_feature] = {
        'feature_indices': feature_indices,
        'feature_names': feature_names,
        'feature_scores': feature_score
    }

# Loop over the models and feature numbers, cross-validating each classifier
# on each cached feature subset.

# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0, and growing a frame row by
# row copies it on every iteration.
kbest_result_rows = []

# Estimators expect a 1-D label vector; Y_new is a column vector.
y_flat = np.ravel(Y_new)

for name, model in models.items():
    for number_of_best_feature in range(5, X_new.shape[1] + 1, 5):
        # Retrieve feature selection results from the dictionary
        feature_selection_data = feature_selection_results[number_of_best_feature]
        feature_indices = feature_selection_data['feature_indices']
        feature_names = feature_selection_data['feature_names']
        feature_scores = feature_selection_data['feature_scores']

        # Keep only the selected feature columns
        X_selected = X_new[:, feature_indices]

        train_accuracies = []
        test_accuracies = []

        for train_idx, test_idx in skf.split(X_selected, y_flat):
            # Split the data into training and testing sets
            X_train, X_test = X_selected[train_idx], X_selected[test_idx]
            y_train, y_test = y_flat[train_idx], y_flat[test_idx]

            # Train the model (fit() refits from scratch on every fold)
            model.fit(X_train, y_train)

            # Predict on the training and testing sets
            y_train_pred = model.predict(X_train)
            y_test_pred = model.predict(X_test)

            # accuracy_score takes (y_true, y_pred)
            train_accuracies.append(accuracy_score(y_train, y_train_pred))
            test_accuracies.append(accuracy_score(y_test, y_test_pred))

        # Average the per-fold accuracies
        train_accuracy_mean = np.mean(train_accuracies)
        test_accuracy_mean = np.mean(test_accuracies)

        # Record the results for this (classifier, k) pair
        kbest_result_rows.append({
            'id': f'{j + 1:02d}',
            'Class 0 quantity': class0_qty,
            'Class 1 quantity': class1_qty,
            'classifier': name,
            'cross validation': '5-Fold',
            'method': 'SelectKBest',
            'number of features': number_of_best_feature,
            'features_name': ', '.join(map(str, feature_names)),
            'feature score': feature_scores,
            'train accuracy': train_accuracy_mean,
            'test accuracy': test_accuracy_mean
        })

        j += 1

        print('For model:', name)
        print('Number of feature:', number_of_best_feature)
        print('Train accuracy:', train_accuracy_mean)
        print('Test accuracy:', test_accuracy_mean)

# Build the result table, keeping the column order declared for df_results
df_results = pd.DataFrame(kbest_result_rows, columns=df_results.columns)

# Specify the file path for saving the HTML file
# NOTE(review): this is an absolute, machine-specific path; the write below
# fails if the directory does not exist on the machine running the script.
output_file_path = r'C:\Users\ASAF\PycharmProjects\pythonProject1\results_SelectKBest_sklearn_new_kfold.html'

# Convert DataFrame to HTML table
html_table = df_results.to_html(index=False)

# Write HTML table to a file. The table contains feature names taken from the
# CSV header, so the encoding is given explicitly: with the platform's locale
# default, characters outside that code page would raise UnicodeEncodeError.
with open(output_file_path, 'w', encoding='utf-8') as file:
    file.write(html_table)
