In [2]:
import pandas as pd
import numpy as np

# Read the CSV file
df1 = pd.read_csv('normalized_dataset.csv')

# Get the number of columns
num_columns = len(df1.columns)

print("Number of columns in the CSV file:", num_columns)

# Read the CSV file
df2 = pd.read_csv('large-mergedDataset.csv')

# Get the number of columns
num_columns = len(df2.columns)

print("Number of columns in the CSV file:", num_columns)

Number of columns in the CSV file: 186
Number of columns in the CSV file: 15


In [3]:

# Merge the DataFrames based on 'id' column
merged_df = pd.merge(df1, df2, on='id')

# Print the shape of the merged DataFrame to confirm the merge was successful
print("Shape of merged DataFrame:", merged_df.shape)

merged_df.to_csv('merged_dataset.csv', index=False)



Shape of merged DataFrame: (19538, 200)


In [4]:
# Read the CSV file
df3 = pd.read_csv('merged_dataset.csv')

# Get the number of columns
num_columns = len(df3.columns)

print("Number of columns in the CSV file:", num_columns)

Number of columns in the CSV file: 200


In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import KeyedVectors

# Function to preprocess text
def preprocess_text(text):
    # Check if the text is NaN (handles missing values)
    if isinstance(text, str):
        # Convert to lowercase
        text = text.lower()

        # Remove URLs and specific patterns (if any)
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'www\S+', '', text)

        # Remove punctuation and non-word characters
        text = re.sub(r'[^\w\s]', '', text)

        # Tokenize the text
        tokens = word_tokenize(text)

        # Remove stop words
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]

        # Lemmatize the words
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

        # Join tokens back to a cleaned text
        cleaned_text = ' '.join(tokens)
    else:
        cleaned_text = ''  # Replace NaN with an empty string
    return cleaned_text

# Preprocess 'targetTitle' column and update it in the DataFrame
df3['targetTitle'] = df3['targetTitle'].apply(preprocess_text)

# Save the updated DataFrame to a new CSV file
df3.to_csv('preprocess_merged.csv', index=False)

# Print the shape of the preprocessed DataFrame
print("Shape of preprocessed DataFrame:", df3.shape)


Shape of preprocessed DataFrame: (19538, 200)


In [6]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import KeyedVectors

# Load the DataFrame with the preprocessed 'targetTitle' column
df4 = pd.read_csv('preprocess_merged.csv')

# Load GloVe word vectors
glove_path = 'glove.6B.100d.txt'
glove_model = KeyedVectors.load_word2vec_format(glove_path, binary=False, no_header=True)

# Function to get GloVe word embeddings
def get_glove_embeddings(text):
    words = text.split()
    embeddings = [glove_model[word] if word in glove_model else np.zeros(100) for word in words]
    if not embeddings:
        return np.zeros(100)  # Assign a default embedding for empty lists (out-of-vocabulary words)
    return np.mean(embeddings, axis=0)

# Apply GloVe embeddings to the preprocessed 'targetTitle' column
df4['targetTitle_glove'] = df3['targetTitle'].apply(get_glove_embeddings)

# Save the DataFrame with embeddings to a new CSV file
df4.to_csv('preprocess_merged_with_glove.csv', index=False)

# Print the shape of the DataFrame with GloVe embeddings
print("Shape of DataFrame with GloVe embeddings:", df4.shape)


Shape of DataFrame with GloVe embeddings: (19538, 201)


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the preprocess_merged.csv file
df5 = pd.read_csv('preprocess_merged_with_glove.csv')

# Separate features (X) and class labels (Y)
X = df5["targetTitle_glove"]
Y = df5["truthClass"]
ids = df5["id"]

from keras.utils.np_utils import to_categorical
classes_list = ["no-clickbait","clickbait"]
label_index = Y.apply(classes_list.index)
label1 = np.asarray(label_index)
label = to_categorical(np.asarray(label1))
y=label1
y.shape
y

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [8]:


# Perform train-test split on X, Y, and ids
X_temp, X_test, y_temp, y_test, ids_temp, ids_test = train_test_split(X, Y, ids, test_size=0.2, random_state=42)
X_dev, X_train, y_dev, y_train, ids_dev, ids_train = train_test_split(X_temp, y_temp, ids_temp, test_size=0.5, random_state=42)

# Create DataFrames for X_train, X_dev, and X_test including 'id'
X_train_df = df5[df5["id"].isin(ids_train)]
X_dev_df = df5[df5["id"].isin(ids_dev)]
X_test_df = df5[df5["id"].isin(ids_test)]

# Create DataFrames for y_train, y_dev, and y_test including 'id'
y_train_df = df5[df5["id"].isin(ids_train)]
y_dev_df = df5[df5["id"].isin(ids_dev)]
y_test_df = df5[df5["id"].isin(ids_test)]

# Save DataFrames to CSV files including 'id'
X_train_df.to_csv('X_train.csv', index=False)
X_dev_df.to_csv('X_dev.csv', index=False)
X_test_df.to_csv('X_test.csv', index=False)

y_train_df.to_csv('y_train.csv', index=False)
y_dev_df.to_csv('y_dev.csv', index=False)
y_test_df.to_csv('y_test.csv', index=False)


In [9]:


# Load the preprocessed DataFrame with GloVe embeddings
X_dev_df = pd.read_csv('X_dev.csv')

# Extract the 'targetTitle_glove' column as a numpy array
targetTitle_glove_values = np.array(X_dev_df['targetTitle_glove'].apply(lambda x: np.fromstring(x[1:-1], sep=' ')).tolist())

# Get the value of the first row of 'targetTitle_glove'
first_row_value = targetTitle_glove_values[1]

print("Value of the first row of 'targetTitle_glove':")
print(first_row_value.shape)


Value of the first row of 'targetTitle_glove':
(100,)


In [10]:
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Dense
from sklearn.model_selection import train_test_split

# Load the preprocessed DataFrame with GloVe embeddings
df5 = pd.read_csv('preprocess_merged_with_glove.csv')

# Load the preprocessed 'X_dev' DataFrame with 'targetTitle_glove' column
X_dev_df = pd.read_csv('X_dev.csv')
X_dev_glove = np.array(X_dev_df['targetTitle_glove'].apply(lambda x: np.fromstring(x[1:-1], sep=' ')).tolist())


# Assuming 'targetTitle_glove' has a size of 100 (embedding dimension)
input_dim = 100

# Define the autoencoder architecture
def build_autoencoder(input_dim, bottleneck_dim):
    # Encoder
    input_layer = Input(shape=(input_dim,))
    encoded = Dense(100, activation='relu')(input_layer)
    encoded = Dense(75, activation='relu')(encoded)
    bottleneck = Dense(bottleneck_dim, activation='relu')(encoded)

    # Decoder
    decoded = Dense(75, activation='relu')(bottleneck)
    decoded = Dense(100, activation='relu')(decoded)
    output_layer = Dense(input_dim, activation='linear')(decoded)

    # Autoencoder model
    autoencoder = Model(inputs=input_layer, outputs=output_layer)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')

    return autoencoder

# Build the autoencoder
bottleneck_dim = 50
autoencoder = build_autoencoder(input_dim, bottleneck_dim)

# Train the autoencoder
autoencoder.fit(X_dev_glove, X_dev_glove, epochs=50, batch_size=32, validation_split=0.1)

# Display the summary of the autoencoder model
autoencoder.summary()



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100)]             0         
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                

In [11]:

# Load the trained autoencoder model (after the training step)
autoencoder = build_autoencoder(input_dim, bottleneck_dim)

# Get the encoder part of the autoencoder model
encoder = Model(inputs=autoencoder.input, outputs=autoencoder.layers[-3].output)

# Obtain the encoded representations of X_dev
encoded_X_dev = encoder.predict(X_dev_glove)

print("Encoded representations (compressed embeddings) of X_dev:")
print(encoded_X_dev)


Encoded representations (compressed embeddings) of X_dev:
[[0.08853386 0.11070979 0.         ... 0.02190581 0.00929128 0.        ]
 [0.03209248 0.01915034 0.0448814  ... 0.         0.03933427 0.        ]
 [0.         0.08483752 0.         ... 0.         0.0275916  0.        ]
 ...
 [0.02287939 0.11505627 0.         ... 0.04448852 0.05177414 0.        ]
 [0.11031618 0.00932035 0.         ... 0.         0.         0.        ]
 [0.11285195 0.         0.01888596 ... 0.         0.         0.03435314]]


In [12]:

import json


# Save the autoencoder weights to a JSON file
autoencoder.save_weights('autoencoder_weights.h5')
with open('autoencoder_summary.json', 'w') as f:
    json.dump(autoencoder.summary(), f)


Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 100)]             0         
                                                                 
 dense_6 (Dense)             (None, 100)               10100     
                                                                 
 dense_7 (Dense)             (None, 75)                7575      
                                                                 
 dense_8 (Dense)             (None, 50)                3800      
                                                                 
 dense_9 (Dense)             (None, 75)                3825      
                                                                 
 dense_10 (Dense)            (None, 100)               7600      
                                                                 
 dense_11 (Dense)            (None, 100)               1010

In [13]:


# Load the preprocessed DataFrame with GloVe embeddings and the corresponding IDs
X_train_df = pd.read_csv('X_train.csv')
X_test_df = pd.read_csv('X_test.csv')

# Extract the 'id' column from the DataFrames
X_train_ids = X_train_df['id']
X_test_ids = X_test_df['id']

# Convert 'targetTitle_glove' column into numpy arrays for X_train and X_test
X_train_glove = np.array(X_train_df['targetTitle_glove'].apply(lambda x: np.fromstring(x[1:-1], sep=' ')).tolist())
X_test_glove = np.array(X_test_df['targetTitle_glove'].apply(lambda x: np.fromstring(x[1:-1], sep=' ')).tolist())


# Load the trained autoencoder model (after the training step)
autoencoder = build_autoencoder(input_dim + 1, bottleneck_dim)  # Add 1 for the 'class label' column

# Get the encoder part of the autoencoder model
encoder = Model(inputs=autoencoder.input, outputs=autoencoder.layers[-3].output)

# Concatenate 'targetTitle_glove' with 'class label' columns for X_train and X_test
X_train_combined = np.hstack((X_train_glove, np.array(X_train_df['class label']).reshape(-1, 1)))
X_test_combined = np.hstack((X_test_glove, np.array(X_test_df['class label']).reshape(-1, 1)))

# Obtain the encoded representations of X_train_combined and X_test_combined
encoded_X_train_combined = encoder.predict(X_train_combined)
encoded_X_test_combined = encoder.predict(X_test_combined)

# Extract 50 features from the bottleneck layer for X_train_combined and X_test_combined
extracted_features_X_train = encoded_X_train_combined[:, :50]
extracted_features_X_test = encoded_X_test_combined[:, :50]

# Create DataFrames for extracted features including 'id' column
X_train_features_df = pd.DataFrame(data=np.hstack((X_train_ids.values.reshape(-1, 1), extracted_features_X_train)),
                                   columns=['id'] + [f'feature_{i+1}' for i in range(50)])

X_test_features_df = pd.DataFrame(data=np.hstack((X_test_ids.values.reshape(-1, 1), extracted_features_X_test)),
                                  columns=['id'] + [f'feature_{i+1}' for i in range(50)])

# Save the extracted features to CSV files
X_train_features_df.to_csv('X_train_features.csv', index=False)
X_test_features_df.to_csv('X_test_features.csv', index=False)




In [15]:
#Load the datasets
X_train = pd.read_csv('X_train.csv')
X_train_features = pd.read_csv('X_train_features.csv')

# Select the specified columns from X_train DataFrame
selected_columns = ['sim between postText and Title',
                    'Pt_Readability of postText',
                    'Paragraph Ratio of formal and informal word',
                    'entroy',
                    'senti_score_absolute',
                    'lexical Diversity',
                    'title variance',                                                    
                    'postText variance', 
                    'Pt_Readability of postText',
                    'Pt_POS 2-gram NNP',
                    'Pt_Number of DT',
                    'Pt_Readability of postText',
                    'Number of NNP',
                    'POS 2-gram NNP NNP',
                    'paragraph Readability',
                    'readability of title',
                    'paragraphs Number of NNP',
                    'lexical Diversity',
                    'title variance',
                    'postText variance',
                    'class label',
                    'id']
selected_features = X_train[selected_columns]

# Merge the selected features with X_train_features DataFrame based on 'id'
X_train_final = pd.merge(X_train_features, selected_features, on='id')

#Save the merged DataFrame as a new CSV file
X_train_final.to_csv('X_train_final.csv', index=False)


In [16]:
#Load the datasets
X_test = pd.read_csv('X_test.csv')
X_test_features = pd.read_csv('X_test_features.csv')

#Select the specified columns from X_test DataFrame
selected_columns = ['sim between postText and Title',
                    'Pt_Readability of postText',
                    'Paragraph Ratio of formal and informal word',
                    'entroy',
                    'senti_score_absolute',
                    'lexical Diversity',
                    'title variance',                                                    
                    'postText variance', 
                    'Pt_Readability of postText',
                    'Pt_POS 2-gram NNP',
                    'Pt_Number of DT',
                    'Pt_Readability of postText',
                    'Number of NNP',
                    'POS 2-gram NNP NNP',
                    'paragraph Readability',
                    'readability of title',
                    'paragraphs Number of NNP',
                    'lexical Diversity',
                    'title variance',
                    'postText variance',
                    'class label',
                    'id']
selected_features = X_test[selected_columns]

#Merge the selected features with X_test_features DataFrame based on 'id'
X_test_final = pd.merge(X_test_features, selected_features, on='id')

#Save the merged DataFrame as a new CSV file
X_test_final.to_csv('X_test_final.csv', index=False)


In [17]:
import pandas as pd

# Load the training and testing datasets
X_train_final = pd.read_csv('X_train_final.csv')
X_test_final = pd.read_csv('X_test_final.csv')

# Extract features (X) and labels (y)
y_train = X_train_final['class label']
y_test = X_test_final['class label']
col_drop=['id', 'class label']
X_train = X_train_final.drop(col_drop, axis=1)
X_test = X_test_final.drop(col_drop, axis=1)



In [18]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize classifiers
svm_classifier = SVC()
random_forest_classifier = RandomForestClassifier()
logistic_regression_classifier = LogisticRegression()
knn_classifier = KNeighborsClassifier()
naive_bayes_classifier = GaussianNB()
gradient_boosting_classifier = GradientBoostingClassifier()
decision_tree_classifier = DecisionTreeClassifier()
adaboost_classifier = AdaBoostClassifier()

# Train and predict with each classifier
classifiers = [
    ('SVM', svm_classifier),
    ('Random Forest', random_forest_classifier),
    ('Logistic Regression', logistic_regression_classifier),
    ('KNN', knn_classifier),
    ('Naive Bayes', naive_bayes_classifier),
    ('Gradient Boosting', gradient_boosting_classifier),
    ('Decision Tree', decision_tree_classifier),
    ('AdaBoost', adaboost_classifier)
]

for name, classifier in classifiers:
    print(f"Training {name}...")
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}")
    print(f"{name} Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"{name} Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("=" * 50)


Training SVM...
SVM Accuracy: 0.8593
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.97      0.91      1801
           1       0.86      0.51      0.64       587

    accuracy                           0.86      2388
   macro avg       0.86      0.74      0.78      2388
weighted avg       0.86      0.86      0.85      2388

SVM Confusion Matrix:
[[1753   48]
 [ 288  299]]
Training Random Forest...
Random Forest Accuracy: 0.8509
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.98      0.91      1801
           1       0.86      0.47      0.61       587

    accuracy                           0.85      2388
   macro avg       0.86      0.72      0.76      2388
weighted avg       0.85      0.85      0.83      2388

Random Forest Confusion Matrix:
[[1757   44]
 [ 312  275]]
Training Logistic Regression...
Logistic Regression Accuracy: 0.8593
Logistic Re

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNN Accuracy: 0.8224
KNN Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.95      0.89      1801
           1       0.73      0.44      0.55       587

    accuracy                           0.82      2388
   macro avg       0.78      0.69      0.72      2388
weighted avg       0.81      0.82      0.81      2388

KNN Confusion Matrix:
[[1705   96]
 [ 328  259]]
Training Naive Bayes...
Naive Bayes Accuracy: 0.4192
Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.24      0.39      1801
           1       0.29      0.95      0.45       587

    accuracy                           0.42      2388
   macro avg       0.62      0.60      0.42      2388
weighted avg       0.78      0.42      0.40      2388

Naive Bayes Confusion Matrix:
[[ 441 1360]
 [  27  560]]
Training Gradient Boosting...
Gradient Boosting Accuracy: 0.8790
Gradient Boosting Classification Report