In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import faiss
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import re
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.utils import resample
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from scipy.sparse import hstack, csr_matrix
from scipy.stats import uniform, randint


In [21]:
data = pd.read_csv('../YouTube_Datasets/features_youtube.csv')

In [22]:
data['Label'].value_counts()

Label
0    593
1     76
Name: count, dtype: int64

In [23]:
from sklearn.utils import resample

class_0 = data[data['Label'] == 0]
class_1 = data[data['Label'] == 1]

class_0_sampled = resample(class_0, n_samples=76, random_state=42)
class_1_sampled = resample(class_1, n_samples=76, random_state=42)

balanced_data = pd.concat([class_0_sampled, class_1_sampled])

data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

print(balanced_data['Label'].value_counts())


Label
0    76
1    76
Name: count, dtype: int64


In [24]:
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)
data.drop_duplicates(inplace=True)

### Validating using Bert + TFIDF Feature combination

In [25]:
def convert_embedding(embedding):
    if isinstance(embedding, np.ndarray):
        return embedding
    elif isinstance(embedding, list):
        return np.array(embedding, dtype=np.float32)
    elif isinstance(embedding, str):
        embedding = embedding.strip("[]")
        embedding = np.array([float(x) for x in embedding.split()], dtype=np.float32)
        return embedding
    else:
        return np.zeros(768, dtype=np.float32)

data['sbert_embedding'] = data['sbert_embedding'].apply(convert_embedding)

embeddings = np.stack(data['sbert_embedding'].values)

print("Embeddings Shape:", embeddings.shape)

Embeddings Shape: (118, 384)


In [26]:
faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
faiss_index.add(embeddings)

In [27]:
import joblib

tfidf = joblib.load('../Models/tfidf_bert_vectorizer.joblib')
X_tfidf = tfidf.transform(data['Preprocessed_content'])

In [28]:
# smote = SMOTE(random_state=42)
# X, Y = smote.fit_resample(X, Y)

In [29]:
X = hstack([
    X_tfidf,
    embeddings,
    ])

Y = data['Label']

In [30]:
LR = joblib.load('../Models/tfidf_bert_model.joblib')

In [31]:
logistic_pred = LR.predict(X)
print("Logistic Regression Performance:")
print(classification_report(Y, logistic_pred))

Logistic Regression Performance:
              precision    recall  f1-score   support

           0       0.76      0.72      0.74        72
           1       0.60      0.65      0.62        46

    accuracy                           0.69       118
   macro avg       0.68      0.69      0.68       118
weighted avg       0.70      0.69      0.70       118



### Validating using only TFIDF Feature input

In [32]:
import joblib

tfidf = joblib.load('../Models/only_tfidf_vectorizer.joblib')
X_tfidf = tfidf.transform(data['Preprocessed_content'])

In [33]:
# smote = SMOTE(random_state=42)
# X, Y = smote.fit_resample(X, Y)

In [34]:
X = X_tfidf

Y = data['Label']

In [35]:
LR = joblib.load('../Models/only_tfidf_model.joblib')

In [36]:
logistic_pred = LR.predict(X)
print("Logistic Regression Performance:")
print(classification_report(Y, logistic_pred))

Logistic Regression Performance:
              precision    recall  f1-score   support

           0       0.55      0.15      0.24        72
           1       0.38      0.80      0.51        46

    accuracy                           0.41       118
   macro avg       0.46      0.48      0.38       118
weighted avg       0.48      0.41      0.35       118



### Validating using only Bert embeddings Feature input

In [37]:
def convert_embedding(embedding):
    if isinstance(embedding, np.ndarray):
        return embedding
    elif isinstance(embedding, list):
        return np.array(embedding, dtype=np.float32)
    elif isinstance(embedding, str):
        embedding = embedding.strip("[]")
        embedding = np.array([float(x) for x in embedding.split()], dtype=np.float32)
        return embedding
    else:
        return np.zeros(768, dtype=np.float32)

data['sbert_embedding'] = data['sbert_embedding'].apply(convert_embedding)

embeddings = np.stack(data['sbert_embedding'].values)

print("Embeddings Shape:", embeddings.shape)

Embeddings Shape: (118, 384)


In [38]:
faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
faiss_index.add(embeddings)

In [42]:
X = embeddings

Y = data['Label']

In [43]:
LR = joblib.load('../Models/only_bert_model.joblib')

In [44]:
logistic_pred = LR.predict(X)
print("Logistic Regression Performance:")
print(classification_report(Y, logistic_pred))

Logistic Regression Performance:
              precision    recall  f1-score   support

           0       0.78      0.72      0.75        72
           1       0.61      0.67      0.64        46

    accuracy                           0.70       118
   macro avg       0.69      0.70      0.69       118
weighted avg       0.71      0.70      0.71       118

