In [13]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [14]:
# Read the two labelled recordings (one human session, one bot session).
human_data, bot_data = (
    pd.read_csv(csv_path) for csv_path in ('human.csv', 'bot.csv')
)

In [15]:
def extract_features(data):
    """Summarise per-field inter-event timing as a flat feature vector.

    For every distinct ``fieldName`` the gaps (in seconds) between consecutive
    rows' ``timestamp`` values are computed, then reduced to four statistics:
    mean, std, min and max.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``fieldName`` column and a ``timestamp`` column that
        ``pd.to_datetime`` can parse. The frame is NOT modified.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``4 * n_fields`` laid out as
        ``[mean, std, min, max]`` per field, with fields in the sorted order
        produced by ``groupby``.

    Raises
    ------
    KeyError
        If ``fieldName`` or ``timestamp`` is missing from ``data``.

    Notes
    -----
    The first event of each field has no predecessor; its gap is recorded as
    0 and takes part in every statistic.
    """
    # Work on a private copy so the caller's frame keeps its original
    # columns and dtypes (re-running a cell must not see mutated input).
    events = data[['fieldName', 'timestamp']].copy()
    events['timestamp'] = pd.to_datetime(events['timestamp'])

    # Gap to the previous event within the same field; the first event of a
    # field has no predecessor, so its NaN gap is replaced by 0.
    events['time_diff'] = (
        events.groupby('fieldName')['timestamp']
        .diff()
        .dt.total_seconds()
        .fillna(0)
    )

    # Aggregate features for each field
    features = events.groupby('fieldName')['time_diff'].agg(['mean', 'std', 'min', 'max'])

    # The sample std of a single observation is NaN, and NaN features are
    # rejected by most estimators; a lone event has no spread, so use 0.
    features['std'] = features['std'].fillna(0.0)

    return features.to_numpy().flatten()

In [16]:
# Build the training set: one feature vector per recording.
human_features = extract_features(human_data)
bot_features = extract_features(bot_data)

# Row 0 is the human sample, row 1 is the bot sample.
X = pd.DataFrame([human_features, bot_features])

# Labels aligned with the rows of X: 0 = human, 1 = bot.
y = pd.Series([0, 1])

In [23]:
# Summary statistics over each recording's flat feature vector.
human_stats, bot_stats = (
    pd.DataFrame(vector).describe()
    for vector in (human_features, bot_features)
)

print("Human Features Statistics:\n", human_stats)
print("\nBot Features Statistics:\n", bot_stats)


Human Features Statistics:
               0
count  8.000000
mean   0.095350
std    0.076398
min    0.000000
25%    0.046348
50%    0.094376
75%    0.141187
max    0.202000

Bot Features Statistics:
               0
count  8.000000
mean   0.000531
std    0.000382
min    0.000000
25%    0.000375
50%    0.000541
75%    0.000750
max    0.001000


In [6]:
# Classifiers to compare, keyed by display name.
# The random forest is the only stochastic estimator here; it gets a fixed
# seed so that "Restart & Run All" reproduces the same fitted model.
RANDOM_STATE = 42

classifiers = {
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    # The remaining models are distance/margin based, so features are
    # standardised before they reach the estimator.
    "Support Vector Machine": make_pipeline(StandardScaler(), SVC()),
    "Logistic Regression": make_pipeline(StandardScaler(), LogisticRegression()),
    "K-Nearest Neighbors": make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=1)),
}

In [7]:
# Fit every classifier on the full (X, y) set and index the fitted
# estimators by their display name.
trained_models = {}
for name in classifiers:
    model = classifiers[name]
    model.fit(X, y)
    trained_models[name] = model
    print(f"{name} has been trained successfully.")

Random Forest has been trained successfully.
Support Vector Machine has been trained successfully.
Logistic Regression has been trained successfully.
K-Nearest Neighbors has been trained successfully.


In [10]:
def classify_new_data(file_path, trained_models):
    """Classify one recording with every trained model and report the results.

    Parameters
    ----------
    file_path : str or path-like
        CSV file to classify; it is read with ``pd.read_csv`` and passed to
        ``extract_features``.
    trained_models : dict
        Maps a display name to a fitted estimator exposing ``predict``.

    Returns
    -------
    dict
        Maps each model name to ``'Bot'`` (predicted label 1) or ``'Human'``
        (any other label). The same verdicts are also printed, one line per
        model.
    """
    new_data = pd.read_csv(file_path)
    new_features = extract_features(new_data)

    # The single-sample batch is identical for every model, so build it once.
    sample = [new_features]

    predictions = {}
    for name, model in trained_models.items():
        prediction = model.predict(sample)
        predictions[name] = 'Bot' if prediction[0] == 1 else 'Human'
        print(f"The given file is classified as: {predictions[name]} using {name}")

    # Hand the verdicts back so callers can use them programmatically
    # instead of having to parse the printed lines.
    return predictions

In [11]:
# Example: run every trained model on the held-out bot recording.
file_to_classify = 'bot_test.csv'
classify_new_data(file_path=file_to_classify, trained_models=trained_models)

The given file is classified as: Bot using Random Forest
The given file is classified as: Bot using Support Vector Machine
The given file is classified as: Bot using Logistic Regression
The given file is classified as: Bot using K-Nearest Neighbors


In [12]:
# Same check on the held-out human recording.
file_to_classify = 'human_test.csv'
classify_new_data(file_path=file_to_classify, trained_models=trained_models)

The given file is classified as: Human using Random Forest
The given file is classified as: Human using Support Vector Machine
The given file is classified as: Human using Logistic Regression
The given file is classified as: Human using K-Nearest Neighbors
