In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy import stats
import matplotlib.pyplot as plt
import csv
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

 current_app - understand what kind of application is used in what way (I am not sure this will be used)<br>
 keystroke_counter - how many symbols user is inputing to understand patterns is typing<br>
 erase_keys_counter - if user types exceptionaly well, he will still use erase somehow, trust me<br>
 changes_between_apps - how ofternhe is trying to switch between apps<br>
 press_press_average_interval - how fast somebody physically press buttons, computer don't spend time on this<br>
 press_release_average_interval - computer will have them identical all over the place, if it's not advanced enough<br>
 received_bytes - did it try to download something?<br>
 sent_bytes - did it try to send something too large like the entire file system?<br>
 mouse_average_movement_speed - people on average move the mouse the same way across all the applications<br>

In [2]:
def meanNoOutliners(values):
  # Define threshold for outliers (e.g., values greater than 3 standard deviations from mean)
  threshold = 3
  mean_key = np.mean(values)
  std_key = np.std(values)
  outliers_key = values[np.abs(values - mean_key) > threshold * std_key]

  # Remove outliers from the array
  numbers_no_outliers = values[np.abs(values - mean_key) <= threshold * std_key]

  # Calculate mean after removing outliers
  mean_no_outliners = np.mean(numbers_no_outliers)
  return mean_no_outliners



In [3]:
def calculate_word_weights(word_list):
    # Initialize an empty dictionary to store counts
    word_counts = {}

    # Update the counts in the dictionary
    for word in word_list:
        if word in word_counts:
            word_counts[word] += 1
        else:
            word_counts[word] = 1

    # Create a two-dimensional array with word occurrences
    word_occurrences = [[word, word_counts[word]] for word in word_counts]

    return word_occurrences


In [4]:
def calculate_quartiles(numbers):
    
    q1 = np.percentile(numbers, 25)
    q2 = np.percentile(numbers, 50)
    q3 = np.percentile(numbers, 75)
    outline_border = np.percentile(numbers, 99) #probably needs adjustment
    
    return q1, q2, q3, outline_border


In [5]:
def vulnerability_ranking(number,values):
    # Call the function and store the quartile values in a tuple
    quarts = calculate_quartiles(values)

    # Access the quartile values from the tuple
    q1 = quarts[0]
    q2 = quarts[1]
    q3 = quarts[2]
    outline_border = quarts[3]

    ranking = 0

    if (number < q1):
        ranking = 0
    
    if (number >= q1 and number <= q2):
        ranking = 0.25
    
    if (number >= q2 and number <= q3):
        ranking = 0.5
    
    if (number >= q3 and number <= outline_border):
        ranking = 0.75
    
    if (number >= outline_border):
        ranking = 1

    return ranking; 


In [6]:
def criticality(rank):
    max_rank = 28

    classification = []

    labels = { 'legit':0,
                'low':1,
                'medium':2,
                'high':3,
                'critical':4
             }
    
    for i in range(len(rank)):
        if rank[i] <= (max_rank * 0.25):
            classification.append('legit')
        elif (max_rank * 0.25) < rank[i] <= (max_rank * 0.50):
            classification.append('low')
        elif (max_rank * 0.50) < rank[i] <= (max_rank * 0.75):
            classification.append('medium')
        elif (max_rank * 0.75) < rank[i] <= (max_rank * 0.95):
            classification.append('high')
        else:
            classification.append('critical')

    return classification
        

 ## I have decided to proceed with this list of arguments by their importance in decending order:<br>
     1. sent_bytes (factor = 5)
     2. keystroke_counter (factor = 5)
     3. erase_keys_counter (factor = 5)
     4. recieved_bytes (factor = 4)
     5. currrent_apps (factor = 3)
     6. mouse_move_speed (factor = 2)
     7. changes_between_apps (factor = 2)
     8. press_release_average_interval (factor = 1)
     9. press_press_average_interval (factor = 1)

In [7]:
def analysis(dataframe):

    # word_weights = calculate_word_weights(dataframe['current_app'])
    mean_keystroke = meanNoOutliners(dataframe['keystroke_counter'])
    mean_erase = meanNoOutliners(dataframe['erase_keys_counter'])
    mean_apps_swtich = meanNoOutliners(dataframe['changes_between_apps'])
    mean_press_interval = meanNoOutliners(dataframe['press_press_average_interval'])
    mean_press_release_interval = meanNoOutliners(dataframe['press_release_average_interval'])
    mean_mouse_speed = meanNoOutliners(dataframe['mouse_average_movement_speed'])
    mean_sent = (meanNoOutliners(dataframe['sent_bytes'])/1024/1024) # in MB
    mean_received = (meanNoOutliners(dataframe['received_bytes'])/1024/1024) # in MB

    # Define a dictionary to map letters to assigned values
    factor = {
        'keystroke_counter':5,
        'erase_keys_counter':5,
        'changes_between_apps':2,
        'press_press_average_interval':1,
        'press_release_average_interval':1,
        'received_bytes' :4,
        'sent_bytes' :5,
        'mouse_average_movement_speed':2
    }

    app_rank = 0
    keystoke_rank = 0
    erase_rank = 0
    app_switch_rank = 0
    press_press_rank = 0
    press_release_rank = 0
    recieved_btes_rank = 0
    sent_bytes_rank = 0
    mouse_speed_rank = 0

    rank_list = []

    for i in range(len(dataframe)):

        total_rank = 0

        keystoke_rank = vulnerability_ranking(dataframe['keystroke_counter'][i],dataframe['keystroke_counter'])
        assigned_value = factor.get('keystroke_counter', 0)
        total_rank += (keystoke_rank * assigned_value)

        erase_rank = vulnerability_ranking(dataframe['erase_keys_counter'][i],dataframe['erase_keys_counter'])
        assigned_value = factor.get('erase_keys_counter', 0)
        total_rank += (erase_rank * assigned_value)

        app_switch_rank = vulnerability_ranking(dataframe['changes_between_apps'][i],dataframe['changes_between_apps'])
        assigned_value = factor.get('changes_between_apps', 0)
        total_rank += (app_switch_rank * assigned_value)

        press_press_rank = vulnerability_ranking(dataframe['press_press_average_interval'][i],dataframe['press_press_average_interval'])
        assigned_value = factor.get('press_press_average_interval', 0)
        total_rank += (press_press_rank* assigned_value)

        press_release_rank = vulnerability_ranking(dataframe['press_release_average_interval'][i],dataframe['press_release_average_interval'])
        assigned_value = factor.get('press_release_average_interval', 0)
        total_rank += (press_release_rank * assigned_value)

        recieved_btes_rank = vulnerability_ranking(dataframe['received_bytes'][i],dataframe['received_bytes'])
        assigned_value = factor.get('received_bytes', 0)
        total_rank += (recieved_btes_rank * assigned_value)

        sent_bytes_rank = vulnerability_ranking(dataframe['sent_bytes'][i],dataframe['sent_bytes'])
        assigned_value = factor.get('sent_bytes', 0)
        total_rank += (sent_bytes_rank * assigned_value)

        mouse_speed_rank = vulnerability_ranking(dataframe['mouse_average_movement_speed'][i],dataframe['mouse_average_movement_speed'])
        assigned_value = factor.get('mouse_average_movement_speed', 0)
        total_rank += (mouse_speed_rank * assigned_value)

        rank_list.append(total_rank)

    return rank_list



# I want to have 4 quaters to divide each entry:
    legit
    low
    medium
    high
    Critical

In [8]:
# we want to generate fake data for the project
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

behacom_dir = "./Behacom"

# Loop through the user folders
for i in range(0, 11):  # User folders are named user0, user2, ..., user11
    df = pd.DataFrame()
    user_folder = f"user{i}"
    user_folder_path = os.path.join(behacom_dir, user_folder)

    # Loop through the files in the user folder
    for file in os.listdir(user_folder_path):
        if file.endswith(".csv"):
            file_path = os.path.join(user_folder_path, file)

            # Read the CSV file and append it to the combined_data DataFrame
            df = pd.read_csv("./Behacom/User0/User0_BEHACOM.csv", encoding='latin-1')
            real_df = df[['timestamp','keystroke_counter', 'erase_keys_counter', 'changes_between_apps',
                    'press_press_average_interval', 'press_release_average_interval',
                    'received_bytes', 'sent_bytes', 'mouse_average_movement_speed','USER']]

            df2 = df.sample(n=300)
            fake_df = df2[['timestamp','keystroke_counter', 'erase_keys_counter', 'changes_between_apps',
                    'press_press_average_interval', 'press_release_average_interval',
                    'received_bytes', 'sent_bytes', 'mouse_average_movement_speed','USER']]

            fake_df.reset_index(drop=True, inplace=True)

            #this part is for my manual part 
            classes_fake = analysis(fake_df)

            classes_real = analysis(real_df)

            # Create a linear regression model
            regressor = LinearRegression()

            # Fit the model to the training data
            regressor.fit(real_df, classes_real)

            # Predict the labels for the testing data
            y_pred = regressor.predict(fake_df)

            crit = criticality(y_pred)

            fake_df.insert(1,"Classification",crit,True)

            # for i in range(len(crit)):
            #     if crit[i] == 'medium':
            #         print(fake_df.loc[i])

            # Calculate mean squared error
            mse = mean_squared_error(classes_fake, y_pred)
            # print("Mean Squared Error:", mse)

            for cnt in range(len(crit)):
                            if crit[cnt] == 'high':
                                row = fake_df.iloc[cnt]
                                with open("high_potential_events.txt", "w") as high:
                                    # Append content to the file
                                    high.write("Action required immediate attention: \n")
                                    high.write("\tUser, application, received_bytes, sent_bytes")
                                    high.write(row['timestamp'] + row['USER'] + str((row['received_bytes']/1024/1024)) + str((row['sent_bytes']/1024/1024)) )

                            if crit[cnt] == 'critical':  
                                row = fake_df.iloc[cnt]
                                with open("critical_events.txt", "w") as criti:
                                    # Append content to the file
                                    criti.write("Action had been blocked: \n")
                                    criti.write("\tUser, application, received_bytes, sent_bytes")
                                    criti.write(row['timestamp'] + row['USER'] + str((row['received_bytes']/1024/1024)) + str((row['sent_bytes']/1024/1024)) )

            # Define column order
            column_order = ['Classification','sent_bytes', 'keystroke_counter', 'erase_keys_counter', 'received_bytes',
                            'mouse_average_movement_speed', 'changes_between_apps', 'press_release_average_interval', 'press_press_average_interval','USER']

            # Write selected columns to CSV
            fake_df[column_order].to_csv(f"user{i}"+'.csv', index=False)



UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('float64'), dtype('<U18')) -> None