In [26]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential #type: ignore
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Attention #type: ignore
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau #type: ignore
from tensorflow.keras.regularizers import l2 #type: ignore
from scipy.stats import entropy
import webbrowser

In [7]:
df=pd.read_csv(r"C:\Users\Padmajaa\OneDrive - SSN Trust\INTEL CBE\extended_keystroke_data.csv")

In [8]:
print("Dataset shape:", df.shape)
print("\nFeature summary:")
print(df.describe())

print("\nCorrelation with target:")
df['target'] = (df['target'] == 'bot').astype(int)


Dataset shape: (30000, 17)

Feature summary:
       avg_keystroke_time  std_keystroke_time  avg_pause_time  std_pause_time  \
count        30000.000000        30000.000000    30000.000000    30000.000000   
mean             0.125015            0.029564        0.299886        0.123109   
std              0.075180            0.020050        0.200939        0.075384   
min              0.044156            0.005124        0.068358        0.023456   
25%              0.050007            0.009859        0.100043        0.049234   
50%              0.108581            0.020703        0.246610        0.091680   
75%              0.199969            0.049281        0.499591        0.196799   
max              0.232398            0.076009        0.651534        0.300335   

       avg_key_hold_time  std_key_hold_time  typing_speed  rhythm_consistency  \
count       30000.000000       30000.000000  30000.000000        3.000000e+04   
mean            0.074991           0.019689      6.026807      

  sqr = _ensure_numeric((avg - values) ** 2)


In [9]:
columns = [
    'avg_keystroke_time', 'std_keystroke_time',
    'avg_pause_time', 'std_pause_time',
    'avg_key_hold_time', 'std_key_hold_time',
    'typing_speed', 'rhythm_consistency',
    'avg_key_distance', 'std_key_distance',
    'error_rate', 'correction_rate',
    'copy_paste_frequency',
    'mouse_speed', 'mouse_acceleration', 'mouse_jerk'
]

In [10]:
def clean_data(df):
    df = df.replace([np.inf, -np.inf], np.nan)

    problematic_columns = df.columns[df.isin([np.inf, -np.inf, np.nan]).any()].tolist()

    print("Columns with NaN or infinite values:")
    for col in problematic_columns:
        nan_count = df[col].isna().sum()
        inf_count = np.isinf(df[col]).sum()
        print(f"{col}: NaN count = {nan_count}, Inf count = {inf_count}")

    for col in problematic_columns:
        median_value = df[col].median()
        df[col] = df[col].replace([np.inf, -np.inf, np.nan], median_value)

    for column in df.columns:
        if df[column].dtype in ['float64', 'int64']:
            lower_bound = df[column].quantile(0.001)
            upper_bound = df[column].quantile(0.999)
            df[column] = df[column].clip(lower_bound, upper_bound)

    return df

In [11]:
df = clean_data(df)
df['target'] = (df['target'] == 'bot').astype(int)

X = df.drop('target', axis=1).values
y = df['target'].values

Columns with NaN or infinite values:
rhythm_consistency: NaN count = 20, Inf count = 0


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
X_train_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_test_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

In [28]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Attention, BatchNormalization, Dropout
from tensorflow.keras.models import Model

def create_rag_model(vocab_size, max_input_length, embedding_dim, context_size):
    # Input for the query
    query_input = Input(shape=(max_input_length,), name='query_input')

    # Input for the context (retrieved documents)
    context_input = Input(shape=(context_size, max_input_length), name='context_input')

    # Embedding layer for the query
    query_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(query_input)

    # LSTM for processing the query
    query_lstm = LSTM(128, return_sequences=True)(query_embedding)
    query_lstm = BatchNormalization()(query_lstm)
    query_lstm = Dropout(0.3)(query_lstm)

    # Attention mechanism between the query and context
    attention_out = Attention()([query_lstm, context_input])

    # LSTM for processing the attention output
    context_lstm = LSTM(64)(attention_out)
    context_lstm = BatchNormalization()(context_lstm)
    context_lstm = Dropout(0.3)(context_lstm)

    # Combine context LSTM output with the query
    combined = tf.concat([context_lstm, query_lstm[:, -1, :]], axis=-1)

    # Dense layer for generating the output
    dense_out = Dense(32, activation='relu')(combined)
    dense_out = BatchNormalization()(dense_out)
    dense_out = Dropout(0.3)(dense_out)

    # Final output layer
    outputs = Dense(1, activation='sigmoid')(dense_out)

    # Create the model
    model = Model(inputs=[query_input, context_input], outputs=outputs)

    return model

# Example usage
vocab_size = 10000  # Example vocabulary size
max_input_length = 50  # Maximum length of input queries
embedding_dim = 128  # Dimension of embeddings
context_size = 10  # Number of retrieved documents (contexts)

rag_model = create_rag_model(vocab_size, max_input_length, embedding_dim, context_size)
rag_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
rag_model.summary()


ValueError: A KerasTensor cannot be used as input to a TensorFlow function. A KerasTensor is a symbolic placeholder for a shape and dtype, used when constructing Keras Functional models or Keras Functions. You can only use it as input to a Keras layer or a Keras operation (from the namespaces `keras.layers` and `keras.operations`). You are likely doing something like:

```
x = Input(...)
...
tf_fn(x)  # Invalid.
```

What you should do instead is wrap `tf_fn` in a layer:

```
class MyLayer(Layer):
    def call(self, x):
        return tf_fn(x)

x = MyLayer()(x)
```


In [15]:
model = create_rag_model((1, X_train_reshaped.shape[2]))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [16]:
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

In [18]:
history = model.fit(X_train_reshaped, y_train,
                    validation_split=0.2,
                    epochs=100,
                    batch_size=64,
                    callbacks=[early_stopping, reduce_lr])

Epoch 1/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.9999 - loss: 0.0106 - val_accuracy: 1.0000 - val_loss: 0.0035 - learning_rate: 0.0010
Epoch 2/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 1.0000 - loss: 0.0029 - val_accuracy: 1.0000 - val_loss: 0.0014 - learning_rate: 0.0010
Epoch 3/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 1.0000 - loss: 0.0016 - val_accuracy: 1.0000 - val_loss: 8.8125e-04 - learning_rate: 0.0010
Epoch 4/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 1.0000 - loss: 0.0011 - val_accuracy: 1.0000 - val_loss: 6.1287e-04 - learning_rate: 0.0010
Epoch 5/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 1.0000 - loss: 8.3896e-04 - val_accuracy: 1.0000 - val_loss: 4.4643e-04 - learning_rate: 0.0010
Epoch 6/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━

In [29]:
model.save('rag_model.h5')



In [19]:
loss, accuracy = model.evaluate(X_test_reshaped, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")



[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 1.0000 - loss: 2.5999e-10
Test Loss: 2.5999116748387507e-10, Test Accuracy: 1.0


In [20]:
def predict_bot(new_data):
    if new_data.ndim == 1:
        new_data = new_data.reshape(1, -1)

    new_data_scaled = scaler.transform(new_data)

    new_data_reshaped = new_data_scaled.reshape((new_data_scaled.shape[0], 1, new_data_scaled.shape[1]))

    prediction = model.predict(new_data_reshaped)

    is_bot = prediction > 0.5
    confidence = prediction if is_bot else 1 - prediction

    return is_bot[0][0], confidence[0][0]

real_input = np.array([0.2, 0.05, 0.5, 0.1, 0.1, 0.02, 5.0, 0.8, 2.0, 0.5, 0.01, 0.005, 0.001, 300, 100, 50])
is_bot, confidence = predict_bot(real_input)
print(f"Is bot: {is_bot}, Confidence: {confidence}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 612ms/step
Is bot: False, Confidence: 0.9999710321426392


In [27]:
import numpy as np
import pandas as pd

# Assuming predict_bot function is defined elsewhere
def predict_bot(features):
    # Replace this function with your actual model prediction logic
    # Example mock implementation
    is_bot = np.random.choice([True, False])  # Randomly deciding for the example
    confidence = np.random.rand()  # Random confidence score for the example
    return is_bot, confidence

# Function to read features from a CSV file
def read_features_from_csv(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path)

    # Assuming the CSV has a single row with the features in order
    features = df.iloc[0].values  # Get the first row as a numpy array

    return features

# Input CSV file path
csv_file_path = (r'C:\Users\Padmajaa\OneDrive - SSN Trust\INTEL CBE\features.csv' ) # Update this to your actual file path

# Read features from CSV
real_input = read_features_from_csv(csv_file_path)

# Ensure the features are in the correct format
real_input = np.array(real_input)

# Predict if it's a bot and get confidence
is_bot, confidence = predict_bot(real_input)

# Print the results
print(f"Is bot: {is_bot}, Confidence: {confidence:.2f}")


Is bot: False, Confidence: 0.31
