In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from scipy.stats import entropy
import webbrowser


In [3]:
df=pd.read_csv(r"C:\Users\Padmajaa\OneDrive - SSN Trust\INTEL CBE\extended_keystroke_data.csv")

In [4]:
print("Dataset shape:", df.shape)
print("\nFeature summary:")
print(df.describe())

print("\nCorrelation with target:")
df['target'] = (df['target'] == 'bot').astype(int)


Dataset shape: (30000, 17)

Feature summary:
       avg_keystroke_time  std_keystroke_time  avg_pause_time  std_pause_time  \
count        30000.000000        30000.000000    30000.000000    30000.000000   
mean             0.125015            0.029564        0.299886        0.123109   
std              0.075180            0.020050        0.200939        0.075384   
min              0.044156            0.005124        0.068358        0.023456   
25%              0.050007            0.009859        0.100043        0.049234   
50%              0.108581            0.020703        0.246610        0.091680   
75%              0.199969            0.049281        0.499591        0.196799   
max              0.232398            0.076009        0.651534        0.300335   

       avg_key_hold_time  std_key_hold_time  typing_speed  rhythm_consistency  \
count       30000.000000       30000.000000  30000.000000        3.000000e+04   
mean            0.074991           0.019689      6.026807      

  sqr = _ensure_numeric((avg - values) ** 2)


In [5]:
columns = [
    'avg_keystroke_time', 'std_keystroke_time',
    'avg_pause_time', 'std_pause_time',
    'avg_key_hold_time', 'std_key_hold_time',
    'typing_speed', 'rhythm_consistency',
    'avg_key_distance', 'std_key_distance',
    'error_rate', 'correction_rate',
    'copy_paste_frequency',
    'mouse_speed', 'mouse_acceleration', 'mouse_jerk'
]

In [6]:
df.head()

Unnamed: 0,avg_keystroke_time,std_keystroke_time,avg_pause_time,std_pause_time,avg_key_hold_time,std_key_hold_time,typing_speed,rhythm_consistency,avg_key_distance,std_key_distance,error_rate,correction_rate,copy_paste_frequency,mouse_speed,mouse_acceleration,mouse_jerk,target
0,0.049956,0.009726,0.101877,0.04444,0.050494,0.00822,9.81574,4.49174,1.430957,0.532054,0.0,0.0,0.10989,855.816058,84.965308,43.612724,1
1,0.051875,0.01126,0.108548,0.048142,0.048437,0.009646,9.212542,4.223511,1.481098,0.454601,0.0,0.0,0.1,902.853033,127.843407,49.040961,1
2,0.188867,0.052601,0.478537,0.198715,0.099359,0.030551,2.089704,4.523323,2.102486,1.043184,0.052083,1.0,0.010417,612.503804,178.327462,123.181083,0
3,0.048219,0.010992,0.110243,0.059738,0.047159,0.009636,9.070844,3.406915,1.461653,0.42488,0.0,0.0,0.0,754.754488,146.790163,22.57772,1
4,0.197381,0.043869,0.493432,0.240048,0.094911,0.029158,2.026622,3.233654,2.099242,0.751963,0.038462,1.0,0.038462,628.47245,272.454255,137.207429,0


In [7]:
def clean_data(df):
    # Replace infinity with NaN
    df = df.replace([np.inf, -np.inf], np.nan)

    # Identify columns with NaN or infinite values
    problematic_columns = df.columns[df.isin([np.inf, -np.inf, np.nan]).any()].tolist()

    print("Columns with NaN or infinite values:")
    for col in problematic_columns:
        nan_count = df[col].isna().sum()
        inf_count = np.isinf(df[col]).sum()
        print(f"{col}: NaN count = {nan_count}, Inf count = {inf_count}")

    # For problematic columns, replace NaN and Inf with median
    for col in problematic_columns:
        median_value = df[col].median()
        df[col] = df[col].replace([np.inf, -np.inf, np.nan], median_value)

    # Clip extremely large values
    for column in df.columns:
        if df[column].dtype in ['float64', 'int64']:
            lower_bound = df[column].quantile(0.001)
            upper_bound = df[column].quantile(0.999)
            df[column] = df[column].clip(lower_bound, upper_bound)

    return df

In [8]:

df = clean_data(df)

# Convert target to numeric
df['target'] = (df['target'] == 'bot').astype(int)

# Prepare features and target
X = df.drop('target', axis=1).values
y = df['target'].values

Columns with NaN or infinite values:
rhythm_consistency: NaN count = 20, Inf count = 0


In [9]:
print("\nAfter cleaning:")
print("Any inf values in X:", np.any(np.isinf(X)))
print("Any nan values in X:", np.any(np.isnan(X)))

# Print data info
print("\nDataset shape after cleaning:", df.shape)
print("\nFeature summary:")
print(df.describe())


After cleaning:
Any inf values in X: False
Any nan values in X: False

Dataset shape after cleaning: (30000, 17)

Feature summary:
       avg_keystroke_time  std_keystroke_time  avg_pause_time  std_pause_time  \
count        30000.000000        30000.000000    30000.000000    30000.000000   
mean             0.125013            0.029562        0.299875        0.123102   
std              0.075176            0.020045        0.200917        0.075363   
min              0.045378            0.006540        0.077047        0.033033   
25%              0.050007            0.009859        0.100043        0.049234   
50%              0.108581            0.020703        0.246610        0.091680   
75%              0.199969            0.049281        0.499591        0.196799   
max              0.223962            0.066157        0.592264        0.267877   

       avg_key_hold_time  std_key_hold_time  typing_speed  rhythm_consistency  \
count       30000.000000       30000.000000  30000.000000

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Reshape data for LSTM input (samples, time steps, features)
X_train_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_test_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))


In [12]:
model = Sequential([
    LSTM(128, input_shape=(1, X_train_reshaped.shape[2]), return_sequences=True, kernel_regularizer=l2(0.01)),
    BatchNormalization(),
    Dropout(0.3),
    LSTM(64, kernel_regularizer=l2(0.01)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu', kernel_regularizer=l2(0.01)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

  super().__init__(**kwargs)


In [13]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)


In [14]:
history = model.fit(
    X_train_reshaped, y_train,
    epochs=100,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)


Epoch 1/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - accuracy: 0.6848 - loss: 2.0045 - val_accuracy: 1.0000 - val_loss: 0.3640 - learning_rate: 0.0010
Epoch 2/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.9941 - loss: 0.2590 - val_accuracy: 1.0000 - val_loss: 0.0657 - learning_rate: 0.0010
Epoch 3/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.9996 - loss: 0.0524 - val_accuracy: 1.0000 - val_loss: 0.0168 - learning_rate: 0.0010
Epoch 4/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 1.0000 - loss: 0.0123 - val_accuracy: 1.0000 - val_loss: 0.0039 - learning_rate: 0.0010
Epoch 5/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 1.0000 - loss: 0.0032 - val_accuracy: 1.0000 - val_loss: 0.0015 - learning_rate: 0.0010
Epoch 6/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

In [26]:
model.save('lstm_model.h5')



In [15]:
# Make predictions
y_pred = model.predict(X_test_reshaped)
y_pred_classes = (y_pred > 0.5).astype(int).flatten()


[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step


In [16]:
def predict_bot(new_data):
    # Ensure new_data is a 2D array
    if new_data.ndim == 1:
        new_data = new_data.reshape(1, -1)

    # Scale the new data
    new_data_scaled = scaler.transform(new_data)

    # Reshape for LSTM input
    new_data_reshaped = new_data_scaled.reshape((new_data_scaled.shape[0], 1, new_data_scaled.shape[1]))

    # Make prediction
    prediction = model.predict(new_data_reshaped)

    # Interpret prediction
    is_bot = prediction > 0.5
    confidence = prediction if is_bot else 1 - prediction

    return is_bot[0][0], confidence[0][0]


**PREDICTION**


In [18]:
import numpy as np
import pandas as pd

# Assuming predict_bot function is defined elsewhere
def predict_bot(features):
    # Replace this function with your actual model prediction logic
    # Example mock implementation
    is_bot = np.random.choice([True, False])  # Randomly deciding for the example
    confidence = np.random.rand()  # Random confidence score for the example
    return is_bot, confidence

# Function to read features from a CSV file
def read_features_from_csv(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path)

    # Assuming the CSV has a single row with the features in order
    features = df.iloc[0].values  # Get the first row as a numpy array

    return features

# Input CSV file path
csv_file_path = (r'C:\Users\Padmajaa\OneDrive - SSN Trust\INTEL CBE\features.csv' ) # Update this to your actual file path

# Read features from CSV
real_input = read_features_from_csv(csv_file_path)

# Ensure the features are in the correct format
real_input = np.array(real_input)

# Predict if it's a bot and get confidence
is_bot, confidence = predict_bot(real_input)

# Print the results
print(f"Is bot: {is_bot}, Confidence: {confidence:.2f}")


Is bot: False, Confidence: 0.86


In [24]:
print(f"Is bot: {is_bot}")

# Open loading.html if it's not a bot
if is_bot:
    webbrowser.open('loading.html')
else:
    webbrowser.open('Main_gh.html')

Is bot: False
