In [None]:
# Mount Google Drive so the dataset under /content/drive is readable.
# This only works inside Google Colab (google.colab is not available elsewhere).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Location of the labelled review CSV on the mounted Drive
dataset_path = "/content/drive/MyDrive/FakeReviewDetectionProject/Dataset_Labelled_Extracted/Amazon_100f.csv"

# Read the whole file into a DataFrame
df = pd.read_csv(dataset_path)

# List the columns that are available for feature extraction
print(df.columns)

Index(['Review', 'Rating', 'Cleaned_Review', 'Review_Length', 'Sentence_Count',
       'Word_Diversity', 'Stopword_Ratio', 'Punctuation_Count',
       'Extreme_Rating', 'Deviation_From_Avg', 'Tokenized_Review', 'Embedding',
       'Fake_Label'],
      dtype='object')


In [None]:
import numpy as np

def convert_embedding(embedding_str, dim=100):
    """Parse a stringified embedding into a fixed-size float32 vector.

    Parameters
    ----------
    embedding_str : str or array-like
        Whitespace-separated numbers, optionally wrapped in square brackets
        (e.g. ``"[0.1 0.2 ...]"``). A numeric list/tuple/ndarray of length
        ``dim`` is also accepted and returned as float32, so applying this
        function to an already-converted column does not fail.
    dim : int, default 100
        Number of components a valid embedding must have.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(dim,)`` and dtype float32. A zero vector is returned
        when the value is missing (None/NaN or any other non-string scalar),
        cannot be parsed as numbers, or does not have exactly ``dim`` entries.
    """
    fallback = np.zeros(dim, dtype=np.float32)

    # Already-parsed vectors: validate the shape instead of calling str methods.
    if isinstance(embedding_str, (np.ndarray, list, tuple)):
        try:
            vector = np.asarray(embedding_str, dtype=np.float32)
        except (ValueError, TypeError):
            return fallback
        return vector if vector.shape == (dim,) else fallback

    # Non-string scalars (e.g. float NaN or None) have no .strip(); treat them
    # as missing rather than raising.
    if not isinstance(embedding_str, str):
        return fallback

    try:
        # Drop surrounding whitespace first so the brackets are really at the
        # ends, then remove the brackets and split on any whitespace.
        values = [float(token) for token in embedding_str.strip().strip("[]").split()]
    except ValueError:
        return fallback  # at least one token was not a number

    # Enforce the expected dimensionality
    if len(values) != dim:
        return fallback

    return np.array(values, dtype=np.float32)

# Apply conversion to every row.
# The 'Embedding' column is overwritten in place: after this line it holds the
# values returned by convert_embedding and the raw CSV strings are no longer
# available in df.
df['Embedding'] = df['Embedding'].apply(convert_embedding)

In [None]:
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Download the VADER lexicon required by SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

# Initialize Sentiment Analyzer
# NOTE(review): `sid` is not used in any of the cells visible here — confirm
# it is needed further down or remove it.
sid = SentimentIntensityAnalyzer()

# Preview the first rows of the DataFrame (this cell does not modify df)
df.head()


In [None]:
# Show the column index of df
df.columns

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import numpy as np

# Hand-crafted numeric columns that accompany the text embedding
feature_columns = [
    'Review_Length',
    'Sentence_Count',
    'Word_Diversity',
    'Stopword_Ratio',
    'Punctuation_Count',
    'Extreme_Rating',
    'Deviation_From_Avg',
]
X_features = df[feature_columns].values

# Stack the per-row embedding vectors into one 2-D matrix (one row per review)
X_embeddings = np.vstack(df['Embedding'].values)

# Final design matrix: hand-crafted columns first, embedding columns after
X = np.concatenate([X_features, X_embeddings], axis=1)

# Targets, plus the raw ratings that are carried through the split so they
# stay aligned with the test rows
y = df['Fake_Label'].values
original_ratings = df['Rating'].values

# 80/20 train/test split with a fixed seed; done before any resampling
X_train, X_test, y_train, y_test, Rating_train, Rating_test = train_test_split(
    X,
    y,
    original_ratings,
    test_size=0.2,
    random_state=42,
)

# SMOTE oversampling to a 50-50 class balance is currently disabled;
# the original lines are kept below for reference.
# smote = SMOTE(sampling_strategy=1.0, random_state=42)
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

def _rerating_accuracy(ratings, predicted_labels, neutral_rating=3, max_rating=5):
    """Score how far the re-rated reviews move away from their original ratings.

    Every review predicted as fake (label 1) has its rating replaced by
    ``neutral_rating``; all other reviews keep their rating. The score is
    ``1 - mean(|original - adjusted| / max_rating)``, so 1.0 means that no
    rating was changed.

    NOTE(review): the adjusted ratings are compared with the original ratings,
    not with a ground-truth rating — confirm this is the intended metric.
    """
    # np.asarray makes the computation purely positional, so a pandas Series
    # with a shuffled index (as produced by train_test_split) is handled
    # correctly instead of being indexed by label.
    ratings = np.asarray(ratings, dtype=float)
    predicted_labels = np.asarray(predicted_labels)
    adjusted = np.where(predicted_labels == 1, neutral_rating, ratings)
    return 1 - np.mean(np.abs(ratings - adjusted) / max_rating)


def classify_VETT_AL(X_train, X_test, y_train, y_test, Rating_test,
                     neutral_rating=3, max_rating=5):
    """Train a linear-kernel SVM, evaluate it on the test split and print metrics.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for the training and test rows.
    y_train, y_test : array-like
        Labels for the training and test rows; label 1 is the positive class
        for precision/recall/F1 and the class that triggers re-rating.
    Rating_test : array-like
        Original ratings of the test rows, aligned positionally with ``X_test``.
    neutral_rating : number, default 3
        Rating assigned to reviews predicted as label 1 when re-rating.
    max_rating : number, default 5
        Divisor used to normalise rating differences.

    Returns
    -------
    clf : sklearn.svm.SVC
        The fitted classifier.
    y_pred : numpy.ndarray
        Predicted labels for ``X_test``.
    history : dict
        Keys ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1_score'`` and
        ``'re_rating_accuracy'``.
    """
    # nan_to_num replaces NaN with 0 and +/-inf with large finite numbers.
    # NOTE(review): NaN training labels are turned into 0 here while y_test is
    # left untouched — confirm that labels are never missing.
    X_train = np.nan_to_num(X_train)
    y_train = np.nan_to_num(y_train)
    X_test = np.nan_to_num(X_test)

    # Standardize data; the scaler is fitted on the training rows only and
    # then applied to the test rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train SVM model
    clf = SVC(kernel='linear')
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_test)

    # Compute evaluation metrics
    acc = accuracy_score(y_test, y_pred)
    pre = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    re_rating_acc = _rerating_accuracy(Rating_test, y_pred,
                                       neutral_rating=neutral_rating,
                                       max_rating=max_rating)

    # The re-rating score is returned as well as printed so callers can use it.
    history = {
        'accuracy': acc,
        'precision': pre,
        'recall': recall,
        'f1_score': f1,
        're_rating_accuracy': re_rating_acc
    }

    print(f'Accuracy: {acc:.4f}')
    print(f'Precision: {pre:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')
    print(f'Re-Rating Accuracy: {re_rating_acc:.4f}')

    return clf, y_pred, history


In [None]:
# Sanity check that classify_VETT_AL is defined in this kernel; this prints the
# function object's repr only and does not run the classifier.
print(classify_VETT_AL)

<function classify_VETT_AL at 0x79fbdc69f4c0>


In [None]:
# Train and evaluate on the split created above (X_train/X_test are the
# un-resampled arrays returned by train_test_split).
clf, y_pred, history = classify_VETT_AL(X_train, X_test, y_train, y_test, Rating_test)

Accuracy: 0.9302
Precision: 0.9295
Recall: 0.8263
F1 Score: 0.8748
Re-Rating Accuracy: 0.9108
