<a href="https://colab.research.google.com/github/Taniya2711/Data-Analysis-project/blob/main/COLINGExpReplication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import nltk
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Install Stanza
!pip install stanza
import stanza
from nltk.corpus import brown
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

# Step 1: Download Required Datasets
print("Downloading datasets...")
nltk.download('brown')
nltk.download('punkt')

# Step 2: Prepare Brown Corpus
def prepare_brown_corpus():
    data = {'paragraph': [], 'label': []}

    fiction_categories = ["fiction"]
    # Correct non-fiction categories
    non_fiction_categories = ["learned", "belles_lettres", "government"]

    for category in fiction_categories + non_fiction_categories:
        category_data = brown.paras(categories=category)
        label = 1 if category in fiction_categories else 0

        for paragraph in category_data:
            sentences = [" ".join(sent) for sent in paragraph]
            paragraph_text = " ".join(sentences)

            if len(sentences) >= 5 and len(sentences) <= 6:
                data['paragraph'].append(paragraph_text)
                data['label'].append(label)

    return pd.DataFrame(data)


# Prepare Brown Corpus Data
brown_df = prepare_brown_corpus()
print("Brown Corpus prepared with", len(brown_df), "paragraphs.")

# Step 3: Preprocessing - Tokenization and Parsing using Stanza
print("Initializing Stanza pipeline...")
stanza.download('en')
nlp = stanza.Pipeline('en')

# Tokenization and Parsing Helper Function
def preprocess_paragraph(paragraph):
    doc = nlp(paragraph)
    tokens, pos_tags, dependencies = [], [], []

    for sentence in doc.sentences:
        for word in sentence.words:
            tokens.append(word.text)
            pos_tags.append(word.upos)
            dependencies.append((word.head, word.deprel))

    return {
        'tokens': tokens,
        'pos_tags': pos_tags,
        'dependencies': dependencies
    }

# Apply Preprocessing
print("Preprocessing paragraphs...")
brown_df['processed'] = brown_df['paragraph'].apply(preprocess_paragraph)

# Step 4: Feature Extraction - Adding Linguistically Motivated Features
def calculate_character_diversity(paragraph):
    characters = list(paragraph.replace(" ", ""))
    unique_chars = set(characters)
    return len(unique_chars) / len(characters)

def calculate_lexical_density(tokens, pos_tags):
    content_words = {'NOUN', 'VERB', 'ADJ', 'ADV'}
    content_count = sum(1 for pos in pos_tags if pos in content_words)
    return content_count / len(tokens) if len(tokens) > 0 else 0

def extract_features(data):
    features = []

    for _, row in data.iterrows():
        processed = row['processed']
        tokens, pos_tags = processed['tokens'], processed['pos_tags']

        char_diversity = calculate_character_diversity(row['paragraph'])
        lexical_density = calculate_lexical_density(tokens, pos_tags)

        features.append([char_diversity, lexical_density])

    return np.array(features)

print("Extracting features...")
X_features = extract_features(brown_df)
y = brown_df['label'].values

# Step 5: Train Logistic Regression with Recursive Feature Elimination (RFE)
def train_logistic_regression(X, y):
    model = LogisticRegression(solver='liblinear', penalty='l1')
    rfe = RFE(model, n_features_to_select=2)
    X_rfe = rfe.fit_transform(X, y)

    X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.3, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
    print("Classification Report:\n", classification_report(y_test, y_pred))

# Train Logistic Regression
print("Training logistic regression...")
train_logistic_regression(X_features, y)

# Step 6: CNN Model Implementation
def train_cnn(X, y):
    embedding_dim = 100

    # Tokenize the paragraphs
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(brown_df['paragraph'])
    sequences = tokenizer.texts_to_sequences(brown_df['paragraph'])

    # Pad sequences to have the same length
    max_len = max(len(seq) for seq in sequences)
    X = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_len)

    # Define the vocabulary size
    vocab_size = len(tokenizer.word_index) + 1

    # Generate GloVe embeddings (placeholder for pre-trained embeddings)
    embedding_matrix = np.random.rand(vocab_size, embedding_dim)

    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len, weights=[embedding_matrix], trainable=True),
        Conv1D(filters=128, kernel_size=3, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'AUC'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

    y_pred = (model.predict(X_test) > 0.5).astype("int32")
    print("CNN Accuracy:", accuracy_score(y_test, y_pred))
    print("CNN F1 Score:", f1_score(y_test, y_pred, average='weighted'))

# Train CNN Model
print("Training CNN model...")
train_cnn(X_features, y)

# Step 7: BERT Model Implementation
def train_bert(paragraphs, labels):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    inputs = tokenizer(paragraphs.tolist(), return_tensors="tf", padding=True, truncation=True, max_length=512)
    dataset = tf.data.Dataset.from_tensor_slices((dict(inputs), labels)).batch(8) # Reduced batch size to 8

    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    model.fit(dataset, epochs=3)

    predictions = tf.argmax(model.predict(dataset)[0], axis=1)
    print("BERT Accuracy:", accuracy_score(labels, predictions))
    print("BERT F1 Score:", f1_score(labels, predictions, average='weighted'))


Downloading datasets...


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Brown Corpus prepared with 852 paragraphs.
Initializing Stanza pipeline...


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...
INFO:stanza:File exists: /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined_charlm           |
| lemma        | combined_nocharlm         |
| constituency | ptb3-revised_charlm       |
| depparse     | combined_charlm           |
| sentiment    | sstplus_charlm            |
| ner          | ontonotes-ww-multi_charlm |

INFO:stanza:Using device: cuda
INFO:stanza:Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: pos
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
  state = torch.load(filename, lambda storage

Preprocessing paragraphs...
Extracting features...
Training logistic regression...
Accuracy: 0.859375
F1 Score: 0.8213081991493241
Classification Report:
               precision    recall  f1-score   support

           0       0.85      1.00      0.92       209
           1       1.00      0.23      0.38        47

    accuracy                           0.86       256
   macro avg       0.93      0.62      0.65       256
weighted avg       0.88      0.86      0.82       256

Training CNN model...
Epoch 1/5




[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 126ms/step - AUC: 0.5557 - accuracy: 0.6665 - loss: 0.6609 - val_AUC: 0.8027 - val_accuracy: 0.8164 - val_loss: 0.4756
Epoch 2/5
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - AUC: 0.5600 - accuracy: 0.7815 - loss: 0.5170 - val_AUC: 0.8745 - val_accuracy: 0.8164 - val_loss: 0.4523
Epoch 3/5
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - AUC: 0.6786 - accuracy: 0.7813 - loss: 0.4898 - val_AUC: 0.9144 - val_accuracy: 0.8164 - val_loss: 0.4400
Epoch 4/5
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - AUC: 0.7529 - accuracy: 0.7982 - loss: 0.4474 - val_AUC: 0.9291 - val_accuracy: 0.8164 - val_loss: 0.4304
Epoch 5/5
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - AUC: 0.7992 - accuracy: 0.8227 - loss: 0.4110 - val_AUC: 0.9383 - val_accuracy: 0.8164 - val_loss: 0.4009
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [