## Logistic Regression

In [1]:
import pandas as pd
# Absolute location of the prepared dataset on the local machine.
csv_file_path = 'C:\\Users\\rp7248\\Desktop\\Csv files\\final_dataset.csv'

# Parse the CSV into the DataFrame `df`.
df = pd.read_csv(filepath_or_buffer=csv_file_path)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_curve, auc
from sklearn.preprocessing import StandardScaler

# Load your dataset
# df = pd.read_csv('path_to_your_dataset.csv')  # Replace with your dataset path

# Preprocessing steps...

# Splitting the dataset into features (X) and target (y)
X = df.iloc[:, 1:-1]  # every column except the first and the last
y = df.iloc[:, -1]    # the last column is the target

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Feature scaling: the scaler is fitted on the training rows only and the
# same transformation is then applied to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the parameters grid for Logistic Regression
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'penalty': ['l1', 'l2'],       # The types of penalty ('l2' is ridge, 'l1' is lasso)
    'solver': ['liblinear', 'saga']  # Solvers that support both l1 and l2 penalties
}

# Create Logistic Regression model.
# max_iter is raised from the default of 100 because 'saga' frequently stops
# before converging at that limit, which would make the grid comparison
# unreliable. random_state is fixed because both 'liblinear' and 'saga'
# shuffle the data, so results would otherwise differ between runs.
model = LogisticRegression(max_iter=1000, random_state=42)

# Grid search: 3-fold cross-validation over every combination in param_grid,
# ranked by ROC-AUC and run on all available cores.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='roc_auc', verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best ROC-AUC score found: ", grid_search.best_score_)

# Using the best model
best_model = grid_search.best_estimator_

# Predict probabilities; column 1 is taken as the positive-class score
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Compute ROC Curve and AUC on the held-out test rows
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Save FPR, TPR, and thresholds to CSV file for later plotting
roc_data = pd.DataFrame({'False Positive Rate': fpr, 'True Positive Rate': tpr, 'Thresholds': thresholds})
roc_data.to_csv('C:\\Users\\rp7248\\Desktop\\Csv files\\roc_data_LogisticRegression_best.csv', index=False)

# Generate and print the classification report
report = classification_report(y_test, best_model.predict(X_test))
print(report)

# Print AUC
print(f"AUC: {roc_auc}")


## LightGBM

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_curve, auc
from sklearn.preprocessing import StandardScaler

# Load your dataset
# df = pd.read_csv('path_to_your_dataset.csv')  # Replace with your dataset path

# Preprocessing steps...

# Splitting the dataset into features (X) and target (y)
X = df.iloc[:, 1:-1]  # every column except the first and the last
y = df.iloc[:, -1]    # the last column is the target

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Feature scaling: the scaler is fitted on the training rows only and the
# same transformation is then applied to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the parameters grid
param_grid = {
    'num_leaves': [31, 50, 70],
    'max_depth': [10, 20, 30],
    'learning_rate': [0.01, 0.05, 0.1],
    'lambda_l1': [0.0, 0.1, 0.5, 1.0]
}

# Create LightGBM model
model = LGBMClassifier()

# Grid search: 3-fold cross-validation over every combination in param_grid,
# ranked by ROC-AUC and run on all available cores.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='roc_auc', verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best ROC-AUC score found: ", grid_search.best_score_)

# Using the best model
best_model = grid_search.best_estimator_

# Predict probabilities; column 1 is taken as the positive-class score
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Compute ROC Curve and AUC on the held-out test rows
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Save FPR, TPR, and thresholds to CSV file for later plotting
roc_data = pd.DataFrame({'False Positive Rate': fpr, 'True Positive Rate': tpr, 'Thresholds': thresholds})
roc_data.to_csv('C:\\Users\\rp7248\\Desktop\\Csv files\\roc_data_LightGBM_best.csv', index=False)

# Generate and print the classification report
report = classification_report(y_test, best_model.predict(X_test))
print(report)

# Print AUC (the value was computed above but previously never reported)
print(f"AUC: {roc_auc}")


## Random Forest

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_curve, auc
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Load your dataset
# df = pd.read_csv('path_to_your_dataset.csv')  # Replace with your dataset path

# Preprocessing steps...

# Splitting the dataset into features (X) and target (y)
X = df.iloc[:, 1:-1]  # every column except the first and the last
y = df.iloc[:, -1]    # the last column is the target

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Feature scaling: the scaler is fitted on the training rows only and the
# same transformation is then applied to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter search space.
# 'auto' is no longer listed for max_features: scikit-learn deprecated it in
# 1.1 and removed it in 1.3, and for a classifier it was only an alias for
# 'sqrt', so the old list contained the same setting twice. 'log2' is offered
# instead so that the parameter is actually explored.
param_dist = {
    'n_estimators': np.arange(100, 501, 100),
    'max_depth': np.arange(3, 21),
    'min_samples_split': np.arange(2, 11),
    'bootstrap': [True, False],
    'max_features': ['sqrt', 'log2'],
}

# Random Forest Classifier model
rf = RandomForestClassifier(random_state=42)

# Randomized Search: 10 sampled configurations, each scored with 5-fold CV
rand_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=10, cv=5, random_state=42)
rand_search.fit(X_train_scaled, y_train)

# Best model from random search
best_rf = rand_search.best_estimator_

# Cross-validated score of the selected configuration on the training rows
cv_accuracy = cross_val_score(best_rf, X_train_scaled, y_train, cv=5)
print(f'Cross-validated accuracy: {np.mean(cv_accuracy)}')

# Predict probabilities for ROC curve; column 1 is taken as the positive-class score
y_pred_proba = best_rf.predict_proba(X_test_scaled)[:, 1]

# Compute ROC Curve and AUC on the held-out test rows
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Save FPR, TPR, and thresholds to CSV file for later plotting
roc_data = pd.DataFrame({'False Positive Rate': fpr, 'True Positive Rate': tpr, 'Thresholds': thresholds})
roc_data.to_csv('C:\\Users\\rp7248\\Desktop\\Csv files\\roc_data_RandomForest_best.csv', index=False)

# Generate and print the classification report
report = classification_report(y_test, best_rf.predict(X_test_scaled))
print(report)

# Print AUC
print(f"AUC: {roc_auc}")


## Neural Network (NN)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Conv1D, Flatten, Dense
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier

# Read the dataset from disk for this cell.
data = pd.read_csv('C:\\Users\\rp7248\\Desktop\\Csv files\\final_dataset.csv')

# Features: every column except the first and the last.
X = data.iloc[:, 1:-1]
# Target: the last column.
y = data.iloc[:, -1]

# (Missing-value handling and categorical encoding would go here.)

# 80/20 train/test partition with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features: fit on the training rows, reuse on the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Conv1D input layout is [samples, time steps, features]; append a trailing
# axis of length 1 so each original column becomes one time step.
X_train = X_train.reshape(X_train.shape + (1,))
X_test = X_test.reshape(X_test.shape + (1,))

# Model factory handed to the scikit-learn wrapper
def create_model(neurons=64, learning_rate=0.01):
    """Build and compile the 1-D convolutional network.

    The input length is read from the module-level ``X_train`` at call time.

    Args:
        neurons: Number of units in the hidden dense layer.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output,
        trained with binary cross-entropy and reporting accuracy.
    """
    n_steps = X_train.shape[1]
    layers = [
        Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(n_steps, 1)),
        Conv1D(filters=64, kernel_size=3, activation='relu'),
        Flatten(),
        Dense(neurons, activation='relu'),
        Dense(1, activation='sigmoid'),  # Adjust this according to your problem
    ]
    network = Sequential(layers)

    # Compile with an Adam optimizer using the requested learning rate
    network.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Expose the Keras factory through the scikit-learn estimator interface
model = KerasClassifier(build_fn=create_model, verbose=0)

# Candidate values explored by the grid search
param_grid = dict(
    neurons=[32, 64, 128],
    batch_size=[16, 32, 64],
    epochs=[10],
    learning_rate=[0.001, 0.01, 0.1],
)

# Exhaustive search with 3-fold cross-validation, run in a single process
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, cv=3)

# Fitting trains one network per (combination, fold) pair, so this is slow
grid_result = grid.fit(X_train, y_train)

# Report the winning configuration, then every configuration's CV statistics
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"Mean: {mean}, Stdev: {stdev} with: {param}")

# Score the underlying Keras model of the best estimator on the test rows
best_model = grid_result.best_estimator_.model
loss, accuracy = best_model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc, classification_report, f1_score
from keras.models import Sequential
from keras.layers import Conv1D, Flatten, Dense
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier

# Read the dataset from disk for this cell
data = pd.read_csv('C:\\Users\\rp7248\\Desktop\\Csv files\\final_dataset.csv')
# Features: every column except the first and the last
X = data.iloc[:, 1:-1]
# Target: the last column
y = data.iloc[:, -1]

# 80/20 train/test partition with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features (fit on the training rows, reuse on the test rows),
# then append a trailing axis of length 1 so the arrays are 3-D for Conv1D
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train = X_train.reshape(X_train.shape + (1,))
X_test = X_test.reshape(X_test.shape + (1,))

# Factory for the convolutional classifier
def create_model(neurons=64, learning_rate=0.01):
    """Build and compile the 1-D convolutional network.

    The input length is read from the module-level ``X_train`` at call time.

    Args:
        neurons: Number of units in the hidden dense layer.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output,
        trained with binary cross-entropy and reporting accuracy.
    """
    stack = (
        Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
        Conv1D(filters=64, kernel_size=3, activation='relu'),
        Flatten(),
        Dense(neurons, activation='relu'),
        Dense(1, activation='sigmoid'),  # For binary classification
    )
    net = Sequential()
    for layer in stack:
        net.add(layer)

    adam = Adam(learning_rate=learning_rate)
    net.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return net

# Use the best parameters from GridSearch
best_params = {'batch_size': 64, 'epochs': 10, 'learning_rate': 0.001, 'neurons': 64}

# Keep part of the training data aside for choosing the decision threshold.
# Tuning the threshold on the very rows the network was fitted on rewards
# memorisation and yields an over-optimistic threshold and F1, so the model is
# fitted on X_fit and the threshold is selected on the unseen X_val instead.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Recreate the best model and train it on the fitting portion of the training data
best_model = create_model(neurons=best_params['neurons'], learning_rate=best_params['learning_rate'])
best_model.fit(X_fit, y_fit, batch_size=best_params['batch_size'], epochs=best_params['epochs'], verbose=0)

# Function to find the best threshold
def find_best_threshold(model, X_val, y_val):
    """Pick the probability cut-off that maximises F1 on the given data.

    Args:
        model: Fitted model whose ``predict`` returns one score per sample.
        X_val: Inputs to score.
        y_val: True binary labels for ``X_val``.

    Returns:
        A ``(threshold, f1)`` pair: the best cut-off among 0.01, 0.02, ...,
        0.99 and the F1 score it achieves. Ties go to the lowest threshold.
    """
    y_pred_proba = model.predict(X_val).ravel()
    thresholds = np.arange(0.01, 1, 0.01)
    # A sample is labelled positive when its score is strictly above the cut-off
    f1_scores = [f1_score(y_val, np.where(y_pred_proba > t, 1, 0)) for t in thresholds]
    best_idx = np.argmax(f1_scores)
    return thresholds[best_idx], f1_scores[best_idx]

# Find the best threshold on the held-out validation rows
best_threshold, best_f1 = find_best_threshold(best_model, X_val, y_val)
print(f"Best Threshold: {best_threshold}, F1-Score: {best_f1}")

# Predict probabilities for the test data
y_pred_proba = best_model.predict(X_test).ravel()

# Use the best threshold found
y_pred_classes = np.where(y_pred_proba > best_threshold, 1, 0)

# Compute ROC Curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Save FPR, TPR, and thresholds to CSV file for later plotting
roc_data = pd.DataFrame({'False Positive Rate': fpr, 'True Positive Rate': tpr, 'Thresholds': thresholds})
roc_data.to_csv('C:\\Users\\rp7248\\Desktop\\Csv files\\roc_data_NN_best.csv', index=False)

# Generate and print the classification report
report = classification_report(y_test, y_pred_classes)
print(report)

# Print AUC
print(f"AUC: {roc_auc}")