In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the preprocessed data from a CSV file
df = pd.read_csv('preprocesssed_data.csv')

# Ensure your CSV file has at least 'text' and 'label' columns
if 'text' not in df.columns or 'label' not in df.columns:
    raise ValueError("The input CSV file must contain 'text' and 'label' columns.")

# Split the data into training (70%) and remaining (30%)
train_data, remaining_data = train_test_split(df, train_size=0.7, random_state=42, stratify=df['label'])

# Split the remaining data into testing (20% of total) and validation (10% of total)
test_data, val_data = train_test_split(remaining_data, test_size=1/3, random_state=42, stratify=remaining_data['label'])

# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(train_data['text'])
X_test = tfidf_vectorizer.transform(test_data['text'])
X_val = tfidf_vectorizer.transform(val_data['text'])

# Get the labels
y_train = train_data['label']
y_test = test_data['label']
y_val = val_data['label']

# Initialize the Logistic Regression model
log_reg_model = LogisticRegression(random_state=42)

# Define the hyperparameters and their values for grid search
param_grid = {
    'C': [0.1, 1.0, 10.0],
    'solver': ['liblinear', 'lbfgs']
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=log_reg_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit the model with GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best estimator
best_log_reg_model = grid_search.best_estimator_

# Predict on the test data
y_test_pred = best_log_reg_model.predict(X_test)

# Evaluate the model on the test data
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)

# Print the performance metrics for the test data
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Test Accuracy: {test_accuracy:.2f}')
print('Test Classification Report:')
print(test_report)
print('Test Confusion Matrix:')
print(test_conf_matrix)

# Predict on the validation data
y_val_pred = best_log_reg_model.predict(X_val)

# Evaluate the model on the validation data
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)

# Print the performance metrics for the validation data
print(f'\nValidation Accuracy: {val_accuracy:.2f}')
print('Validation Classification Report:')
print(val_report)
print('Validation Confusion Matrix:')
print(val_conf_matrix)

# Evaluate on the training data to check for overfitting
y_train_pred = best_log_reg_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
train_report = classification_report(y_train, y_train_pred)
train_conf_matrix = confusion_matrix(y_train, y_train_pred)

# Print the performance metrics for the training data
print(f'\nTraining Accuracy: {train_accuracy:.2f}')
print('Training Classification Report:')
print(train_report)
print('Training Confusion Matrix:')
print(train_conf_matrix)


Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best Parameters: {'C': 10.0, 'solver': 'lbfgs'}
Test Accuracy: 0.98
Test Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3403
           1       0.96      0.62      0.75       173

    accuracy                           0.98      3576
   macro avg       0.97      0.81      0.87      3576
weighted avg       0.98      0.98      0.98      3576

Test Confusion Matrix:
[[3398    5]
 [  66  107]]

Validation Accuracy: 0.98
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1701
           1       0.94      0.59      0.72        87

    accuracy                           0.98      1788
   macro avg       0.96      0.79      0.86      1788
weighted avg       0.98      0.98      0.98      1788

Validation Confusion Matrix:
[[1698    3]
 [  36   51]]

Training Accuracy: 0.9

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, confusion_matrix

# Load the preprocessed CSV file
df = pd.read_csv('preprocesssed_data.csv')

# Split the data into training, validation, and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Convert the labels to numerical values
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)
y_test = le.transform(y_test)

# Tokenize the text data
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Convert text data to sequences
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(X_test)

# Pad the sequences to have the same length
max_length = 200
padded_train = pad_sequences(X_train, maxlen=max_length)
padded_val = pad_sequences(X_val, maxlen=max_length)
padded_test = pad_sequences(X_test, maxlen=max_length)

# Define the LSTM model
def lstm_model(lstm_units, dropout, batch_size):
    model = Sequential()
    model.add(Embedding(input_dim=5000, output_dim=128))
    model.add(LSTM(lstm_units, dropout=dropout))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
    return model

# Define the hyperparameters for grid search
lstm_hyperparameters = {
    'lstm_units': [32, 64],
    'dropout': [0.1, 0.2],
    'batch_size': [32, 64]
}

# Perform grid search for LSTM model
best_lstm_model = None
best_lstm_accuracy = 0
for lstm_units in lstm_hyperparameters['lstm_units']:
    for dropout in lstm_hyperparameters['dropout']:
        for batch_size in lstm_hyperparameters['batch_size']:
            model = lstm_model(lstm_units, dropout, batch_size)
            model.fit(padded_train, y_train, epochs=5, batch_size=batch_size, validation_data=(padded_val, y_val), verbose=0)
            train_accuracy = model.evaluate(padded_train, y_train)[1]
            val_accuracy = model.evaluate(padded_val, y_val)[1]
            if val_accuracy > best_lstm_accuracy:
                best_lstm_model = model
                best_lstm_accuracy = val_accuracy

# Generate classification report for the best LSTM model
y_pred_lstm = best_lstm_model.predict(padded_test)
y_pred_lstm = (y_pred_lstm > 0.5).astype(int)
print("LSTM Model Classification Report:")
print(classification_report(y_test, y_pred_lstm, target_names=['fake', 'real']))
print("Training Accuracy:", train_accuracy)
print("Validation Accuracy:", val_accuracy)
print("Test Accuracy:", best_lstm_model.evaluate(padded_test, y_test)[1])
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lstm))

[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 46ms/step - accuracy: 0.9979 - loss: 0.0115
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 45ms/step - accuracy: 0.9659 - loss: 0.1202
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 43ms/step - accuracy: 0.9977 - loss: 0.0088
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 44ms/step - accuracy: 0.9805 - loss: 0.0931
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 41ms/step - accuracy: 0.9960 - loss: 0.0151
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 42ms/step - accuracy: 0.9828 - loss: 0.0921
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 50ms/step - accuracy: 0.9956 - loss: 0.0153
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 62ms/step - accuracy: 0.9792 - loss: 0.0955
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 52ms/step - accuracy: 0.9984 - loss: 0.0060
[1m90/90[0m

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load the preprocessed CSV file
df = pd.read_csv('preprocesssed_data.csv')

# Split the data into training, validation, and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Convert the text data to numerical features using CountVectorizer
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)
X_test_vectorized = vectorizer.transform(X_test)

# Define the hyperparameters for grid search
rf_hyperparameters = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15]
}

# Perform grid search for Random Forest model
best_rf_model = None
best_rf_accuracy = 0
for n_estimators in rf_hyperparameters['n_estimators']:
    for max_depth in rf_hyperparameters['max_depth']:
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        model.fit(X_train_vectorized, y_train)
        train_accuracy = model.score(X_train_vectorized, y_train)
        val_accuracy = model.score(X_val_vectorized, y_val)
        if val_accuracy > best_rf_accuracy:
            best_rf_model = model
            best_rf_accuracy = val_accuracy

# Generate classification report for the best Random Forest model
y_pred_rf = best_rf_model.predict(X_test_vectorized)
print("Random Forest Model Classification Report:")
print(classification_report(y_test, y_pred_rf, target_names=['fake', 'real']))
print("Training Accuracy:", train_accuracy)
print("Validation Accuracy:", val_accuracy)
print("Test Accuracy:", best_rf_model.score(X_test_vectorized, y_test))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

Random Forest Model Classification Report:
              precision    recall  f1-score   support

        fake       0.96      1.00      0.98      3394
        real       1.00      0.14      0.25       182

    accuracy                           0.96      3576
   macro avg       0.98      0.57      0.61      3576
weighted avg       0.96      0.96      0.94      3576

Training Accuracy: 0.9590106624715958
Validation Accuracy: 0.9542118140510311
Test Accuracy: 0.9563758389261745
Confusion Matrix:
[[3394    0]
 [ 156   26]]
