In [None]:
!pip install transformers

In [None]:
#import necessary libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans

# Display DataFrame

In [None]:
# Read the string-pair dataset and discard the 'Unnamed: 0' column
# (presumably a row index written out when the CSV was saved — verify against the file).
df = pd.read_csv('/content/bigdata.csv').drop(columns=['Unnamed: 0'])
df

Unnamed: 0,Original_strings,Modified_strings,Labels
0,GoQwX4sz,GoQde4sz,1
1,hGRf3pBF,hGRf3pBF,0
2,0Jqza7pZ,0Jqza7pZ,0
3,UUcV7H8c,C0cV7H8c,1
4,NvUn4yoD,NvUn4yoD,0
...,...,...,...
994,CHafU46k,CHafU46k,0
995,tEUA9rec,tEUA9rec,0
996,TkWKrM3u,TkWKrM3u,0
997,hWUWbjHZ,hWUWbjHZ,0


In [None]:
import torch
from transformers import BertTokenizer, BertModel
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load BERT tokenizer and model
# Both come from the pretrained 'bert-base-uncased' checkpoint; the model is moved to `device`.
# `tokenizer`, `model` and `device` are read as globals by encode_text() below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Define function to encode text using BERT
def encode_text(text):
    """Embed `text` with BERT and return the vector at the first token position.

    Relies on the module-level `tokenizer`, `model` and `device`.

    Args:
        text: The string to embed. It is tokenized with special tokens added
            and truncated to at most 512 token ids.

    Returns:
        A NumPy array holding, for a batch of one, the slice at sequence
        position 0 of the model's first output (with special tokens enabled
        this is presumably the [CLS] position). The caller squeezes axis 1,
        so the array is expected to have shape (1, hidden_size).
    """
    # Text -> token ids (special tokens included, capped at 512 ids)
    token_ids = tokenizer.encode(text, truncation=True, max_length=512, add_special_tokens=True)

    # Wrap the ids in a batch of one and place it on the same device as the model
    batch = torch.tensor([token_ids]).to(device)

    # Inference only: no gradients are needed
    with torch.no_grad():
        model_output = model(batch)

    # Keep only sequence position 0 of the first output, as a CPU NumPy array
    return model_output[0][:, 0, :].cpu().numpy()


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Embed every modified string in row order; each entry is one encode_text() result.
length = len(df['Modified_strings'])
embeddings = [encode_text(modified) for modified in df['Modified_strings']]

In [None]:
# Stack the per-string embeddings and drop the singleton axis each one carries,
# leaving a 2-D array with one row per string.
output_tensors1 = np.asarray(embeddings).squeeze(axis=1)
output_tensors1.shape

(999, 768)

In [None]:
# Reduce the embedding dimensionality with PCA. A float n_components between 0 and 1 asks
# scikit-learn to keep as many components as are needed to explain that share of the
# variance (95% here).
# NOTE(review): the PCA is fitted on every row, including rows that a later cell assigns to
# the test split — consider fitting it on the training rows only.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(output_tensors1)
X_pca.shape

(999, 207)

# Data Pre-processing

In [None]:
# Separate the target column from the remaining (string) columns.
Y = df['Labels']
X = df.drop(columns='Labels')

In [None]:
#Define a function to convert string to integers
def str2int(s, chars):
    """Encode `s` as an integer in base len(chars), least-significant digit first.

    Each character's digit value is its position in `chars`; the first
    character of `s` is the lowest-order digit. An empty `s` gives 0.

    Args:
        s: The string to encode.
        chars: The digit alphabet; a character's index is its digit value.

    Returns:
        The integer value of `s`.

    Raises:
        ValueError: If a character of `s` does not occur in `chars`
            (raised by `chars.index`).
    """
    base = len(chars)
    # Positional sum: digit value times base raised to the character's position
    return sum(chars.index(ch) * base ** place for place, ch in enumerate(s))

In [None]:
#Apply the function on the columns
# Digit alphabet for str2int: 0-9, then a-z, then A-Z
chars = "0123456789" + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

# Encode both string columns row by row; str() coerces each cell to text first
actual = [str2int(str(original), chars) for original in X['Original_strings']]
changed = [str2int(str(modified), chars) for modified in X['Modified_strings']]

In [None]:
#Display modified dataframe
# One row per sample: (encoded original, encoded modified, label)
numeric_columns = ['Original_strings', 'Modified_strings', 'Labels']
numeric_rows = list(zip(actual, changed, Y))
numeric_df = pd.DataFrame(numeric_rows, columns=numeric_columns)
numeric_df

Unnamed: 0,Original_strings,Modified_strings,Labels
0,124851461976698,124850792513346,1
1,146510759002217,146510759002217,0
2,216245066005686,216245066005686,0
3,42753287939648,42753287936158,1
4,138737388616539,138737388616539,0
...,...,...,...
994,70777589159872,70777589159872,0
995,43079455943741,43079455943741,0
996,105863223417335,105863223417335,0
997,217278484214205,217278484214205,0


# Training Models

In [None]:
#Split the dataset into test and train
# X_numeric (the integer-encoded strings) is kept for reference only: the split below pairs
# the PCA-reduced BERT features in X_pca with the labels, and those are what the models see.
X_numeric = numeric_df.drop(columns = ['Labels'])
Y_numeric = numeric_df['Labels']

# 70/30 split, stratified on the label so both parts keep the class ratio.
# random_state pins the shuffle; without it every run produces a different split and the
# scores reported further down cannot be reproduced.
x_train_numeric, x_test_numeric, y_train_numeric, y_test_numeric = train_test_split(
    X_pca, Y_numeric, test_size=0.3, stratify=Y_numeric, random_state=42
)

In [None]:
#Check label count in test data
# Tally each class in the held-out labels
one_count = int((y_test_numeric == 1).sum())
zero_count = int((y_test_numeric == 0).sum())

print("Count of label 1 is",one_count)
print("Count of label 0 is",zero_count)

Count of label 1 is 103
Count of label 0 is 197


# Decision Tree Classifier

In [None]:
#Base model of decision tree classifier
# Untuned decision tree as a baseline.
# The estimator is bound to `dt_model` rather than `model`: encode_text() reads the BERT
# network from the global `model`, so rebinding that name here would break any later
# call to encode_text(). random_state makes the fitted tree reproducible.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(x_train_numeric, y_train_numeric)
y_pred = dt_model.predict(x_test_numeric)
score = accuracy_score(y_test_numeric, y_pred)
print("The accuracy of the base decision tree is",score)

The accuracy of the base decision tree is 0.57


In [None]:
#Hyper-parameter tuning using GridSearchCV
from sklearn.model_selection import GridSearchCV

# Search space: 14 depths x 13 split sizes x 8 leaf sizes x 2 criteria = 2912 candidates
params = {'max_depth' : np.arange(1,15),
              'min_samples_split' : np.arange(2,15),
              'min_samples_leaf' : np.arange(2,10),
              'criterion' :['gini', 'entropy']
             }
# 5-fold cross-validation over the whole grid. n_jobs=-1 runs the fits on all available
# cores, as the other grid searches in this notebook do; the tree is seeded, so the
# selected parameters do not depend on how the work is scheduled.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    cv=5,
    verbose=True,
    n_jobs=-1,
)
grid_search.fit(x_train_numeric, y_train_numeric)

Fitting 5 folds for each of 2912 candidates, totalling 14560 fits


In [None]:
# Optimized Decision Tree
# Predict on the held-out split with the grid search object (scikit-learn refits the best
# parameter combination on the full training data by default) and report accuracy.
y_pred = grid_search.predict(x_test_numeric)
score = accuracy_score(y_test_numeric, y_pred)
print("The accuracy of GridSearchCV optimized decision tree is",score)

# Per-class precision / recall / F1; arguments are (y_true, y_pred)
print("\n",classification_report(y_test_numeric, y_pred))

The accuracy of GridSearchCV optimized decision tree is 0.6566666666666666

               precision    recall  f1-score   support

           0       0.66      1.00      0.79       197
           1       0.00      0.00      0.00       103

    accuracy                           0.66       300
   macro avg       0.33      0.50      0.40       300
weighted avg       0.43      0.66      0.52       300



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Gaussian Naive Bayes Classifier

In [None]:
#Base Gaussian NB Classifier
# Untuned Gaussian naive Bayes baseline (fit() returns the fitted estimator itself)
clf = GaussianNB().fit(x_train_numeric, y_train_numeric)
y = clf.predict(x_test_numeric)
print("Accuracy of base Gaussian NB Classifier:", accuracy_score(y_test_numeric, y))

Accuracy of base Gaussian NB Classifier: 0.5466666666666666


In [None]:
#Hyper-parameter tuning using GridSearchCV

#Define the parameter grid: 100 log-spaced var_smoothing values from 1e0 down to 1e-9
param_grid_nb = {'var_smoothing': np.logspace(0,-9, num=100)}

#Fit the model and print best estimator
nbModel_grid = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid_nb, verbose=1, cv=10, n_jobs=-1)
nbModel_grid.fit(x_train_numeric, y_train_numeric)
print("Best Gaussian NB Model:", nbModel_grid.best_estimator_)

# Score the refit best estimator on the held-out split; metric arguments are (y_true, y_pred)
gaussian_y_pred = nbModel_grid.predict(x_test_numeric)
print("Accuracy of optimized Gaussian NB Classifier:",accuracy_score(y_test_numeric, gaussian_y_pred))

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
Best Gaussian NB Model: GaussianNB(var_smoothing=0.3511191734215131)
Accuracy of optimized Gaussian NB Classifier: 0.65


In [None]:
# classification_report expects (y_true, y_pred). With the arguments reversed, precision and
# recall trade places and the support column counts predictions instead of true labels.
print("Classification Report for optimized Gaussian NB\n",classification_report(y_test_numeric, gaussian_y_pred))

Classification Report for optimized Gaussian NB
               precision    recall  f1-score   support

           0       0.98      0.66      0.79       294
           1       0.02      0.33      0.04         6

    accuracy                           0.65       300
   macro avg       0.50      0.49      0.41       300
weighted avg       0.96      0.65      0.77       300



# Random Forest Classifier

In [None]:
#Tuned Random Forest Classifier
# Seeded base forest so repeated runs build the same trees
rfc=RandomForestClassifier(random_state=42)

#parameter grid: 2 forest sizes x 5 depths x 2 criteria = 20 candidates
param_grid = { 
    'n_estimators': [200, 500],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

#Fit the model with optimum parameters
# 5-fold cross-validation; n_jobs=-1 runs the fits on all available cores, as the other grid
# searches in this notebook do (the forest is seeded, so the selection is unchanged).
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, n_jobs=-1)
CV_rfc.fit(x_train_numeric, y_train_numeric)

In [None]:
#Display best parameters
# (the grid combination that the random-forest search selected during cross-validation)
CV_rfc.best_params_

{'criterion': 'entropy', 'max_depth': 8, 'n_estimators': 200}

In [None]:
# Predict on the held-out split with the grid search object (scikit-learn refits the best
# parameter combination on the full training data by default), then report accuracy and
# per-class metrics. Metric arguments are (y_true, y_pred).
rtf_y_pred = CV_rfc.predict(x_test_numeric)
print("Accuracy of optimized random forest classifier",accuracy_score(y_test_numeric, rtf_y_pred))
print("\n",classification_report(y_test_numeric, rtf_y_pred))

Accuracy of optimized random forest classifier 0.6566666666666666

               precision    recall  f1-score   support

           0       0.66      1.00      0.79       197
           1       0.00      0.00      0.00       103

    accuracy                           0.66       300
   macro avg       0.33      0.50      0.40       300
weighted avg       0.43      0.66      0.52       300



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# XGBoost

In [None]:
#Initialize model
xgb = XGBClassifier(n_estimators=1000, learning_rate=0.05)

#Fit the model
# Early stopping needs a monitoring set. It is carved out of the training data so the test
# split is never seen while fitting: monitoring on the test split lets it pick the number
# of boosting rounds, which makes the reported test score optimistic.
x_fit_xgb, x_val_xgb, y_fit_xgb, y_val_xgb = train_test_split(
    x_train_numeric, y_train_numeric, test_size=0.2, stratify=y_train_numeric, random_state=42
)
# NOTE(review): newer xgboost releases expect early_stopping_rounds on the XGBClassifier
# constructor rather than on fit(). It stays on fit() here because `xgb` is reused later as
# the GridSearchCV base estimator, which fits without an eval_set — confirm against the
# installed xgboost version before changing.
xgb.fit(x_fit_xgb, y_fit_xgb, early_stopping_rounds=5, eval_set=[(x_val_xgb, y_val_xgb)], verbose=False)
predictions = xgb.predict(x_test_numeric)

# For 0/1 labels the mean absolute error is the misclassification rate (1 - accuracy)
print("Mean Absolute Error : " + str(mean_absolute_error(y_test_numeric, predictions)))
print("XGBoost accuracy",accuracy_score(y_test_numeric, predictions))



Mean Absolute Error : 0.38666666666666666
XGBoost accuracy 0.6133333333333333


In [None]:
#define parameter grid
# Search space: 8 depths x 4 ensemble sizes x 3 learning rates = 96 candidates
parameters = {
    'max_depth': range(2, 10),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}

#Fit the model with optimum parameters
CV_xgb = GridSearchCV(estimator=xgb, param_grid=parameters, cv= 5)
CV_xgb.fit(x_train_numeric, y_train_numeric)
pred_xgb = CV_xgb.predict(x_test_numeric)

# Metrics take (y_true, y_pred). With the arguments reversed, classification_report swaps
# precision and recall and reports support for the predictions instead of the true labels.
print("Accuracy of optimized XGBoost Model", accuracy_score(y_test_numeric, pred_xgb))
print("\n", classification_report(y_test_numeric, pred_xgb))

Accuracy of optimized XGBoost Model 0.6633333333333333

               precision    recall  f1-score   support

           0       0.98      0.67      0.79       290
           1       0.06      0.60      0.11        10

    accuracy                           0.66       300
   macro avg       0.52      0.63      0.45       300
weighted avg       0.95      0.66      0.77       300



# Shallow MLP

In [None]:
from sklearn.neural_network import MLPClassifier

# Two hidden layers (32 then 16 units). With early_stopping=True, 20% of the training data
# is set aside internally as the validation set that decides when to stop.
mod1 = MLPClassifier(
    hidden_layer_sizes=(32, 16),
    learning_rate_init=0.001,
    early_stopping=True,
    validation_fraction=0.2,
)
mod1.fit(x_train_numeric, y_train_numeric)
pred = mod1.predict(x_test_numeric)
print("Accuracy score of MLP is", accuracy_score(y_test_numeric, pred))

Accuracy score of MLP is 0.6533333333333333


In [None]:
#Define a base estimator
# Early-stopping MLP; every other hyper-parameter comes from the grid below
est = MLPClassifier(early_stopping=True, validation_fraction=0.2)

#Define parameter grid
parameter_space = {
    'hidden_layer_sizes': [(128, 64), (64, 32), (32, 16), (16, 8)],
    'activation': ['tanh', 'relu', 'logistic'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant', 'adaptive'],
}

# 5-fold cross-validated search over the grid, using all available cores
mlp_clf = GridSearchCV(estimator=est, param_grid=parameter_space, cv=5, n_jobs=-1)
mlp_clf.fit(x_train_numeric, y_train_numeric)

#Predict labels
pred_mlp = mlp_clf.predict(x_test_numeric)

In [None]:
#Display best MLP
# Predict on the held-out split with the estimator the grid search selected
mlp_pred = mlp_clf.best_estimator_.predict(x_test_numeric)
acc_score = accuracy_score(y_test_numeric, mlp_pred)

# Show the selected configuration next to its test accuracy
print("Best MLP Model:",mlp_clf.best_estimator_)
print("The accuracy of optimized MLP is", acc_score)

# Per-class precision / recall / F1; arguments are (y_true, y_pred)
print("\n",classification_report(y_test_numeric, mlp_pred))

Best MLP Model: MLPClassifier(alpha=0.05, early_stopping=True, hidden_layer_sizes=(16, 8),
              learning_rate='adaptive', validation_fraction=0.2)
The accuracy of optimized MLP is 0.6566666666666666

               precision    recall  f1-score   support

           0       0.66      1.00      0.79       197
           1       0.00      0.00      0.00       103

    accuracy                           0.66       300
   macro avg       0.33      0.50      0.40       300
weighted avg       0.43      0.66      0.52       300



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
