# Topics In AI Final Report Code

## Lexical Feature Extraction

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from textblob import TextBlob
import string

# Download the NLTK data packages this notebook relies on
nltk.download('punkt')  # sentence/word tokenizer models (sent_tokenize, word_tokenize)
nltk.download('averaged_perceptron_tagger')  # tagger model behind pos_tag
nltk.download('wordnet')  # NOTE(review): no visible code uses WordNet -- confirm it is needed

# Load the dataset; main() below reads its 'memType' and 'story' columns
df = pd.read_csv('hippoCorpusV2.csv')

# Feature extraction functions
def count_words(story):
    """Return the number of whitespace-separated tokens in ``story``."""
    tokens = story.split()
    return len(tokens)

def count_sentences(story):
    """Return how many sentences ``sent_tokenize`` finds in ``story``."""
    sentences = sent_tokenize(story)
    return len(sentences)

def average_word_length(story):
    """Return the mean length of the whitespace-separated tokens, or 0 if there are none."""
    tokens = story.split()
    if not tokens:
        return 0
    total_chars = sum(map(len, tokens))
    return total_chars / len(tokens)

def lexical_diversity(story):
    """Return the ratio of distinct tokens to all tokens (0 for an empty story).

    Tokens are compared exactly as written, so case and attached punctuation
    make tokens distinct.
    """
    tokens = story.split()
    if not tokens:
        return 0
    distinct = set(tokens)
    return len(distinct) / len(tokens)

def punctuation_count(story):
    """Return how many characters of ``story`` appear in ``string.punctuation``."""
    marks = string.punctuation
    return len([ch for ch in story if ch in marks])

def average_sentence_length(story):
    """Return words per sentence: whitespace token count over sentence count (0 if no sentences)."""
    sentences = sent_tokenize(story)
    if not sentences:
        return 0
    return len(story.split()) / len(sentences)

def count_sensory_words(story, sensory_words):
    """Count tokens of ``story`` whose lowercase form is in ``sensory_words``."""
    lowered = (token.lower() for token in word_tokenize(story))
    return sum(1 for token in lowered if token in sensory_words)

def count_first_person_pronouns(story, pronouns):
    """Count tokens of ``story`` whose lowercase form is in ``pronouns``."""
    matches = [token for token in word_tokenize(story) if token.lower() in pronouns]
    return len(matches)

def count_emotion_sentences(stories):
    """Count sentences whose TextBlob sentiment polarity is non-zero.

    Despite the parameter name, ``stories`` is one text; it is split into
    sentences with ``sent_tokenize``.
    """
    emotional = 0
    for sentence in sent_tokenize(stories):
        if TextBlob(sentence).sentiment.polarity != 0:
            emotional += 1
    return emotional

def count_dialogue_tags(story, tags):
    """Count verb tokens whose lowercase form is in ``tags``.

    A token counts only if its POS tag starts with ``'VB'`` and its lowercase
    text is itself an element of ``tags``; inflected forms match only when
    they are listed in ``tags``.
    """
    hits = 0
    for word, tag in pos_tag(word_tokenize(story)):
        if tag.startswith('VB') and word.lower() in tags:
            hits += 1
    return hits

def count_past_tense_verbs(story):
    """Count tokens tagged ``VBD`` or ``VBN`` by ``pos_tag``."""
    past_tags = ['VBD', 'VBN']
    tagged = pos_tag(word_tokenize(story))
    return len([tag for _, tag in tagged if tag in past_tags])

# Function to calculate all features for a list of stories
def calculate_all_features(stories):
    """Compute every lexical feature for each story.

    Args:
        stories: Iterable of story strings; it is traversed once per feature,
            so it must be re-iterable.

    Returns:
        Dict mapping a feature name to a list of values, one per story.
    """
    sensory_words = ['see', 'hear', 'touch', 'taste', 'smell']
    pronouns = ['i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours']
    dialogue_tags = ['say', 'ask', 'reply', 'yell', 'whisper']

    # Features that only need the story text are keyed by function name.
    text_only = (count_words, count_sentences, average_word_length,
                 lexical_diversity, punctuation_count, average_sentence_length)
    features = {fn.__name__: [fn(story) for story in stories] for fn in text_only}

    # The remaining features have explicit column names.
    named = (
        ('sensory_word_count', lambda story: count_sensory_words(story, sensory_words)),
        ('first_person_pronoun_count', lambda story: count_first_person_pronouns(story, pronouns)),
        ('emotion_sentence_count', count_emotion_sentences),
        ('dialogue_tag_count', lambda story: count_dialogue_tags(story, dialogue_tags)),
        ('past_tense_verb_count', count_past_tense_verbs),
    )
    for column, extractor in named:
        features[column] = [extractor(story) for story in stories]

    return features

# Main function to perform analyses and plot results
def main(df):
    """Compute lexical features for the imagined and the recalled stories.

    Args:
        df: DataFrame with a ``memType`` label column and a ``story`` text
            column.

    Returns:
        ``(all_imagined_analysis, all_recalled_analysis)``: the feature
        dictionaries returned by ``calculate_all_features`` for each group.
    """
    imagined_stories = df[df['memType'] == 'imagined']['story']
    recalled_stories = df[df['memType'] == 'recalled']['story']

    all_imagined_analysis = calculate_all_features(imagined_stories)
    all_recalled_analysis = calculate_all_features(recalled_stories)

    # Hand the results back; previously both were discarded on return.
    return all_imagined_analysis, all_recalled_analysis

# Call the main function.
# NOTE(review): this reads the same CSV that was already loaded into `df`
# above; passing `df` to main() would avoid the second read.

df1=pd.read_csv("hippoCorpusV2.csv");
main(df1)


## Classifier model with 2 Lexical Features

In [None]:
def count_words(story):
    """Return the number of whitespace-separated tokens in ``story``."""
    tokens = story.split()
    return len(tokens)

def count_sentences(story):
    """Return how many sentences ``sent_tokenize`` finds in ``story``."""
    sentences = sent_tokenize(story)
    return len(sentences)

In [None]:
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import string
import nltk

# Ensure necessary NLTK downloads
nltk.download('punkt')  # sentence tokenizer models used by sent_tokenize
nltk.download('averaged_perceptron_tagger')  # tagger model used by nltk.pos_tag
nltk.download('wordnet')  # NOTE(review): no visible code uses WordNet -- confirm it is needed

# Initialize GPT-2 model and tokenizer
def init_model():
    """Load the pretrained ``'gpt2'`` language model and its tokenizer.

    Returns:
        ``(model, tokenizer)``, with the model switched to evaluation mode.
    """
    lm = GPT2LMHeadModel.from_pretrained('gpt2')
    tok = GPT2Tokenizer.from_pretrained('gpt2')
    lm.eval()
    return lm, tok

# Function to calculate probabilities
def calculate_probabilities(text, model, tokenizer, history_sizes):
    """Score every sentence of ``text`` with the language model.

    For each sentence and each ``h`` in ``history_sizes`` the model is run on
    a slice of that sentence's tokens: the first ``h`` tokens, or the whole
    sentence when ``h`` is 0 (an ``h`` beyond the sentence length also gives
    the whole sentence). The stored score is ``exp(-loss)`` with the slice
    used as its own labels.

    Sentences longer than 1024 tokens are skipped entirely.

    Args:
        text: Story text; split into sentences with ``sent_tokenize``.
        model: Model called as ``model(ids, labels=ids)`` whose output has a
            ``.loss`` tensor.
        tokenizer: Object whose ``encode(sentence, return_tensors='pt')``
            returns a 2-D tensor with tokens along dimension 1.
        history_sizes: Re-iterable collection of non-negative ints.

    Returns:
        Dict mapping ``'probability_history_size{h}'`` to a list holding one
        score per retained sentence. A score is NaN when the slice contains
        fewer than two tokens.
    """
    probabilities = {f'probability_history_size{h}': [] for h in history_sizes}
    for sentence in sent_tokenize(text):
        tokens = tokenizer.encode(sentence, return_tensors='pt')
        n_tokens = tokens.size(1)
        if n_tokens > 1024:  # GPT-2's maximum context size
            continue  # Skip this sentence

        for h in history_sizes:
            # Adjust the context window for each history size
            context_size = min(h, n_tokens)
            context = tokens[:, :context_size] if context_size > 0 else tokens

            if context.size(1) < 2:
                # A one-token slice has no next-token target, so the LM loss
                # is undefined; record NaN without a wasted forward pass.
                context_probability = float('nan')
            else:
                with torch.no_grad():
                    loss = model(context, labels=context).loss
                context_probability = torch.exp(-loss).item()

            probabilities[f'probability_history_size{h}'].append(context_probability)
    return probabilities
    

# Function to add probabilities to DataFrame
def add_probabilities_to_df(df, index, probabilities):
    """Store the per-story mean of each score list in row ``index`` of ``df``.

    NaN scores are left out of the mean so that one unscorable sentence does
    not turn the whole story's value into NaN. When no usable score remains,
    NaN is stored (a float, so the column stays numeric).

    Args:
        df: DataFrame to update in place; columns are created on demand.
        index: Row label to write to.
        probabilities: Dict mapping column name to a list of float scores.

    Returns:
        The same ``df`` object.
    """
    import math

    for key, value in probabilities.items():
        scored = [p for p in value if not math.isnan(p)]
        df.at[index, key] = sum(scored) / len(scored) if scored else float('nan')
    return df

# Function to calculate Sequentiality scores
def calculate_sequentiality(df, history_sizes):
    """Add a ``Sequentiality_{h}`` column for every non-zero ``h``.

    Each new column is ``probability_history_size{h}`` minus
    ``probability_history_size0``. ``df`` is modified in place and returned.
    """
    for h in (size for size in history_sizes if size != 0):
        with_history = df[f'probability_history_size{h}']
        df[f'Sequentiality_{h}'] = with_history - df['probability_history_size0']
    return df

# Function to extract linguistic features
def extract_linguistic_features(df):
    """Add ``word_count`` and ``sentence_count`` columns computed from ``df['story']``.

    ``df`` is modified in place and returned.
    """
    def _n_words(text):
        return len(text.split())

    def _n_sentences(text):
        return len(sent_tokenize(text))

    df['word_count'] = df['story'].apply(_n_words)
    df['sentence_count'] = df['story'].apply(_n_sentences)
    return df

# Function to train the classification model and validate it
def train_and_validate(X_train, X_val, y_train, y_val):
    """Fit a random forest on the training split and return validation accuracy.

    Missing values in both feature frames are replaced with 0 before use.
    """
    forest = RandomForestClassifier(random_state=42)
    forest.fit(X_train.fillna(0), y_train)
    predictions = forest.predict(X_val.fillna(0))
    return accuracy_score(y_val, predictions)

# Main function to run the entire pipeline
def run_pipeline():
    """Run the whole experiment and return the validation accuracy.

    Reads the first 3500 rows of ``hippoCorpusV2.csv``, scores each story with
    GPT-2, derives the Sequentiality and lexical feature columns, and trains a
    random forest on an 80/20 split.

    Returns:
        Accuracy of the classifier on the held-out 20% of rows.
    """
    # Load the dataset
    df = pd.read_csv("hippoCorpusV2.csv").head(3500)

    # Initialize model
    model, tokenizer = init_model()

    # Process each story and calculate probabilities and Sequentiality scores
    history_sizes = [0, 1, 2, 3, 4, 5]
    for index, row in df.iterrows():
        probabilities = calculate_probabilities(row['story'], model, tokenizer, history_sizes)
        df = add_probabilities_to_df(df, index, probabilities)
    df = calculate_sequentiality(df, history_sizes)

    # Extract linguistic features
    df = extract_linguistic_features(df)

    # Prepare the validation set
    feature_cols = ['word_count', 'sentence_count'] + \
                   [f'probability_history_size{i}' for i in history_sizes] + \
                   [f'Sequentiality_{i}' for i in history_sizes if i != 0]
    X = df[feature_cols]
    y = df['memType']  # Assuming 'memType' is the column indicating recalled or imagined
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train and validate the classification model; return the score so it is
    # not lost when this function's locals go out of scope.
    return train_and_validate(X_train, X_val, y_train, y_val)


# Execute the pipeline; keep the score at module scope for the reporting cell
if __name__ == "__main__":
    accuracy = run_pipeline()

In [None]:
# Report the validation accuracy. Relies on a module-level `accuracy`
# variable having been assigned by an earlier cell.
print(f"Validation Accuracy: {accuracy}")

## Classifier model with 4 Lexical Features

In [None]:
def count_words(story):
    """Return the number of whitespace-separated tokens in ``story``."""
    tokens = story.split()
    return len(tokens)

def count_sentences(story):
    """Return how many sentences ``sent_tokenize`` finds in ``story``."""
    sentences = sent_tokenize(story)
    return len(sentences)

def average_word_length(story):
    """Return the mean length of the whitespace-separated tokens, or 0 if there are none."""
    tokens = story.split()
    if not tokens:
        return 0
    total_chars = sum(map(len, tokens))
    return total_chars / len(tokens)

def punctuation_count(story):
    """Return how many characters of ``story`` appear in ``string.punctuation``."""
    marks = string.punctuation
    return len([ch for ch in story if ch in marks])


In [None]:
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import string
import nltk

# Ensure necessary NLTK downloads
nltk.download('punkt')  # sentence tokenizer models used by sent_tokenize
nltk.download('averaged_perceptron_tagger')  # tagger model used by nltk.pos_tag
nltk.download('wordnet')  # NOTE(review): no visible code uses WordNet -- confirm it is needed

# Initialize GPT-2 model and tokenizer
def init_model():
    """Load the pretrained ``'gpt2'`` language model and its tokenizer.

    Returns:
        ``(model, tokenizer)``, with the model switched to evaluation mode.
    """
    lm = GPT2LMHeadModel.from_pretrained('gpt2')
    tok = GPT2Tokenizer.from_pretrained('gpt2')
    lm.eval()
    return lm, tok

# Function to calculate probabilities
def calculate_probabilities(text, model, tokenizer, history_sizes):
    """Score every sentence of ``text`` with the language model.

    For each sentence and each ``h`` in ``history_sizes`` the model is run on
    a slice of that sentence's tokens: the first ``h`` tokens, or the whole
    sentence when ``h`` is 0 (an ``h`` beyond the sentence length also gives
    the whole sentence). The stored score is ``exp(-loss)`` with the slice
    used as its own labels.

    Sentences longer than 1024 tokens are skipped entirely.

    Args:
        text: Story text; split into sentences with ``sent_tokenize``.
        model: Model called as ``model(ids, labels=ids)`` whose output has a
            ``.loss`` tensor.
        tokenizer: Object whose ``encode(sentence, return_tensors='pt')``
            returns a 2-D tensor with tokens along dimension 1.
        history_sizes: Re-iterable collection of non-negative ints.

    Returns:
        Dict mapping ``'probability_history_size{h}'`` to a list holding one
        score per retained sentence. A score is NaN when the slice contains
        fewer than two tokens.
    """
    probabilities = {f'probability_history_size{h}': [] for h in history_sizes}
    for sentence in sent_tokenize(text):
        tokens = tokenizer.encode(sentence, return_tensors='pt')
        n_tokens = tokens.size(1)
        if n_tokens > 1024:  # GPT-2's maximum context size
            continue  # Skip this sentence

        for h in history_sizes:
            # Adjust the context window for each history size
            context_size = min(h, n_tokens)
            context = tokens[:, :context_size] if context_size > 0 else tokens

            if context.size(1) < 2:
                # A one-token slice has no next-token target, so the LM loss
                # is undefined; record NaN without a wasted forward pass.
                context_probability = float('nan')
            else:
                with torch.no_grad():
                    loss = model(context, labels=context).loss
                context_probability = torch.exp(-loss).item()

            probabilities[f'probability_history_size{h}'].append(context_probability)
    return probabilities
    

# Function to add probabilities to DataFrame
def add_probabilities_to_df(df, index, probabilities):
    """Store the per-story mean of each score list in row ``index`` of ``df``.

    NaN scores are left out of the mean so that one unscorable sentence does
    not turn the whole story's value into NaN. When no usable score remains,
    NaN is stored (a float, so the column stays numeric).

    Args:
        df: DataFrame to update in place; columns are created on demand.
        index: Row label to write to.
        probabilities: Dict mapping column name to a list of float scores.

    Returns:
        The same ``df`` object.
    """
    import math

    for key, value in probabilities.items():
        scored = [p for p in value if not math.isnan(p)]
        df.at[index, key] = sum(scored) / len(scored) if scored else float('nan')
    return df

# Function to calculate Sequentiality scores
def calculate_sequentiality(df, history_sizes):
    """Add a ``Sequentiality_{h}`` column for every non-zero ``h``.

    Each new column is ``probability_history_size{h}`` minus
    ``probability_history_size0``. ``df`` is modified in place and returned.
    """
    for h in (size for size in history_sizes if size != 0):
        with_history = df[f'probability_history_size{h}']
        df[f'Sequentiality_{h}'] = with_history - df['probability_history_size0']
    return df

# Function to extract linguistic features
def extract_linguistic_features(df):
    """Add four lexical feature columns computed from ``df['story']``.

    Columns: ``word_count``, ``sentence_count``, ``avg_word_length`` and
    ``punctuation_count``. ``df`` is modified in place and returned.
    """
    def _n_words(text):
        return len(text.split())

    def _n_sentences(text):
        return len(sent_tokenize(text))

    def _mean_word_length(text):
        return np.mean([len(word) for word in text.split()]) if text.split() else 0

    def _n_punctuation(text):
        return sum(1 for char in text if char in string.punctuation)

    df['word_count'] = df['story'].apply(_n_words)
    df['sentence_count'] = df['story'].apply(_n_sentences)
    df['avg_word_length'] = df['story'].apply(_mean_word_length)
    df['punctuation_count'] = df['story'].apply(_n_punctuation)
    return df

# Function to train the classification model and validate it
def train_and_validate(X_train, X_val, y_train, y_val):
    """Fit a random forest on the training split and return validation accuracy.

    Missing values in both feature frames are replaced with 0 before use.
    """
    forest = RandomForestClassifier(random_state=42)
    forest.fit(X_train.fillna(0), y_train)
    predictions = forest.predict(X_val.fillna(0))
    return accuracy_score(y_val, predictions)

# Main function to run the entire pipeline
def run_pipeline():
    """Run the whole experiment and return the validation accuracy.

    Reads the first 3500 rows of ``hippoCorpusV2.csv``, scores each story with
    GPT-2, derives the Sequentiality and lexical feature columns, and trains a
    random forest on an 80/20 split.

    Returns:
        Accuracy of the classifier on the held-out 20% of rows.
    """
    # Load the dataset
    df = pd.read_csv("hippoCorpusV2.csv").head(3500)

    # Initialize model
    model, tokenizer = init_model()

    # Process each story and calculate probabilities and Sequentiality scores
    history_sizes = [0, 1, 2, 3, 4, 5]
    for index, row in df.iterrows():
        probabilities = calculate_probabilities(row['story'], model, tokenizer, history_sizes)
        df = add_probabilities_to_df(df, index, probabilities)
    df = calculate_sequentiality(df, history_sizes)

    # Extract linguistic features
    df = extract_linguistic_features(df)

    # Prepare the validation set
    feature_cols = ['word_count', 'sentence_count', 'avg_word_length', 'punctuation_count'] + \
                   [f'probability_history_size{i}' for i in history_sizes] + \
                   [f'Sequentiality_{i}' for i in history_sizes if i != 0]
    X = df[feature_cols]
    y = df['memType']  # Assuming 'memType' is the column indicating recalled or imagined
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train and validate the classification model; return the score so it is
    # not lost when this function's locals go out of scope.
    return train_and_validate(X_train, X_val, y_train, y_val)


# Execute the pipeline; keep the score at module scope for the reporting cell
if __name__ == "__main__":
    accuracy = run_pipeline()

In [None]:
# Report the validation accuracy. Relies on a module-level `accuracy`
# variable having been assigned by an earlier cell.
print(f"Validation Accuracy: {accuracy}")

## Classifier model with 6 Lexical Features

In [None]:
def count_words(story):
    """Return the number of whitespace-separated tokens in ``story``."""
    tokens = story.split()
    return len(tokens)

def count_sentences(story):
    """Return how many sentences ``sent_tokenize`` finds in ``story``."""
    sentences = sent_tokenize(story)
    return len(sentences)

def average_word_length(story):
    """Return the mean length of the whitespace-separated tokens, or 0 if there are none."""
    tokens = story.split()
    if not tokens:
        return 0
    total_chars = sum(map(len, tokens))
    return total_chars / len(tokens)

def lexical_diversity(story):
    """Return the ratio of distinct tokens to all tokens (0 for an empty story)."""
    tokens = story.split()
    if not tokens:
        return 0
    distinct = set(tokens)
    return len(distinct) / len(tokens)

def punctuation_count(story):
    """Return how many characters of ``story`` appear in ``string.punctuation``."""
    marks = string.punctuation
    return len([ch for ch in story if ch in marks])

def average_sentence_length(story):
    """Return words per sentence: whitespace token count over sentence count (0 if no sentences)."""
    sentences = sent_tokenize(story)
    if not sentences:
        return 0
    return len(story.split()) / len(sentences)

In [None]:
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import string
import nltk

# Ensure necessary NLTK downloads
nltk.download('punkt')  # sentence tokenizer models used by sent_tokenize
nltk.download('averaged_perceptron_tagger')  # tagger model used by nltk.pos_tag
nltk.download('wordnet')  # NOTE(review): no visible code uses WordNet -- confirm it is needed

# Initialize GPT-2 model and tokenizer
def init_model():
    """Load the pretrained ``'gpt2'`` language model and its tokenizer.

    Returns:
        ``(model, tokenizer)``, with the model switched to evaluation mode.
    """
    lm = GPT2LMHeadModel.from_pretrained('gpt2')
    tok = GPT2Tokenizer.from_pretrained('gpt2')
    lm.eval()
    return lm, tok

# Function to calculate probabilities
def calculate_probabilities(text, model, tokenizer, history_sizes):
    """Score every sentence of ``text`` with the language model.

    For each sentence and each ``h`` in ``history_sizes`` the model is run on
    a slice of that sentence's tokens: the first ``h`` tokens, or the whole
    sentence when ``h`` is 0 (an ``h`` beyond the sentence length also gives
    the whole sentence). The stored score is ``exp(-loss)`` with the slice
    used as its own labels.

    Sentences longer than 1024 tokens are skipped entirely.

    Args:
        text: Story text; split into sentences with ``sent_tokenize``.
        model: Model called as ``model(ids, labels=ids)`` whose output has a
            ``.loss`` tensor.
        tokenizer: Object whose ``encode(sentence, return_tensors='pt')``
            returns a 2-D tensor with tokens along dimension 1.
        history_sizes: Re-iterable collection of non-negative ints.

    Returns:
        Dict mapping ``'probability_history_size{h}'`` to a list holding one
        score per retained sentence. A score is NaN when the slice contains
        fewer than two tokens.
    """
    probabilities = {f'probability_history_size{h}': [] for h in history_sizes}
    for sentence in sent_tokenize(text):
        tokens = tokenizer.encode(sentence, return_tensors='pt')
        n_tokens = tokens.size(1)
        if n_tokens > 1024:  # GPT-2's maximum context size
            continue  # Skip this sentence

        for h in history_sizes:
            # Adjust the context window for each history size
            context_size = min(h, n_tokens)
            context = tokens[:, :context_size] if context_size > 0 else tokens

            if context.size(1) < 2:
                # A one-token slice has no next-token target, so the LM loss
                # is undefined; record NaN without a wasted forward pass.
                context_probability = float('nan')
            else:
                with torch.no_grad():
                    loss = model(context, labels=context).loss
                context_probability = torch.exp(-loss).item()

            probabilities[f'probability_history_size{h}'].append(context_probability)
    return probabilities
    

# Function to add probabilities to DataFrame
def add_probabilities_to_df(df, index, probabilities):
    """Store the per-story mean of each score list in row ``index`` of ``df``.

    NaN scores are left out of the mean so that one unscorable sentence does
    not turn the whole story's value into NaN. When no usable score remains,
    NaN is stored (a float, so the column stays numeric).

    Args:
        df: DataFrame to update in place; columns are created on demand.
        index: Row label to write to.
        probabilities: Dict mapping column name to a list of float scores.

    Returns:
        The same ``df`` object.
    """
    import math

    for key, value in probabilities.items():
        scored = [p for p in value if not math.isnan(p)]
        df.at[index, key] = sum(scored) / len(scored) if scored else float('nan')
    return df

# Function to calculate Sequentiality scores
def calculate_sequentiality(df, history_sizes):
    """Add a ``Sequentiality_{h}`` column for every non-zero ``h``.

    Each new column is ``probability_history_size{h}`` minus
    ``probability_history_size0``. ``df`` is modified in place and returned.
    """
    for h in (size for size in history_sizes if size != 0):
        with_history = df[f'probability_history_size{h}']
        df[f'Sequentiality_{h}'] = with_history - df['probability_history_size0']
    return df

# Function to extract linguistic features
def extract_linguistic_features(df):
    """Add six lexical feature columns computed from ``df['story']``.

    Columns, in creation order: ``lexical_diversity``, ``punctuation_count``,
    ``avg_sentence_length``, ``word_count``, ``sentence_count`` and
    ``avg_word_length``. ``df`` is modified in place and returned.
    """
    def _diversity(text):
        words = text.split()
        return len(set(words)) / len(words) if words else 0

    def _n_punctuation(text):
        return sum(1 for char in text if char in string.punctuation)

    def _mean_sentence_length(text):
        sentences = nltk.sent_tokenize(text)
        if not sentences:
            return 0
        return np.mean([len(sentence.split()) for sentence in sentences])

    def _mean_word_length(text):
        words = text.split()
        return np.mean([len(word) for word in words]) if words else 0

    df['lexical_diversity'] = df['story'].apply(_diversity)
    df['punctuation_count'] = df['story'].apply(_n_punctuation)
    df['avg_sentence_length'] = df['story'].apply(_mean_sentence_length)
    df['word_count'] = df['story'].apply(lambda text: len(text.split()))
    df['sentence_count'] = df['story'].apply(lambda text: len(sent_tokenize(text)))
    df['avg_word_length'] = df['story'].apply(_mean_word_length)
    return df

# Function to train the classification model and validate it
def train_and_validate(X_train, X_val, y_train, y_val):
    """Fit a random forest on the training split and return validation accuracy.

    Missing values in both feature frames are replaced with 0 before use.
    """
    forest = RandomForestClassifier(random_state=42)
    forest.fit(X_train.fillna(0), y_train)
    predictions = forest.predict(X_val.fillna(0))
    return accuracy_score(y_val, predictions)

# Main function to run the entire pipeline
def run_pipeline():
    """Run the whole experiment and return the validation accuracy.

    Reads the first 3500 rows of ``hippoCorpusV2.csv``, scores each story with
    GPT-2, derives the Sequentiality and lexical feature columns, and trains a
    random forest on an 80/20 split.

    Returns:
        Accuracy of the classifier on the held-out 20% of rows.
    """
    # Load the dataset
    df = pd.read_csv("hippoCorpusV2.csv").head(3500)

    # Initialize model
    model, tokenizer = init_model()

    # Process each story and calculate probabilities and Sequentiality scores
    history_sizes = [0, 1, 2, 3, 4, 5]
    for index, row in df.iterrows():
        probabilities = calculate_probabilities(row['story'], model, tokenizer, history_sizes)
        df = add_probabilities_to_df(df, index, probabilities)
    df = calculate_sequentiality(df, history_sizes)

    # Extract linguistic features
    df = extract_linguistic_features(df)

    # Prepare the validation set
    feature_cols = ['word_count', 'sentence_count', 'avg_word_length', 'lexical_diversity', 'punctuation_count','avg_sentence_length'] + \
                   [f'probability_history_size{i}' for i in history_sizes] + \
                   [f'Sequentiality_{i}' for i in history_sizes if i != 0]
    X = df[feature_cols]
    y = df['memType']  # Assuming 'memType' is the column indicating recalled or imagined
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train and validate the classification model; return the score so it is
    # not lost when this function's locals go out of scope.
    return train_and_validate(X_train, X_val, y_train, y_val)


# Execute the pipeline; keep the score at module scope for the reporting cell
if __name__ == "__main__":
    accuracy = run_pipeline()

In [None]:
# Report the validation accuracy. Relies on a module-level `accuracy`
# variable having been assigned by an earlier cell.
print(f"Validation Accuracy: {accuracy}")

## Classifier model with 8 Lexical Features

In [None]:
def count_words(story):
    """Return the number of whitespace-separated tokens in ``story``."""
    tokens = story.split()
    return len(tokens)

def count_sentences(story):
    """Return how many sentences ``sent_tokenize`` finds in ``story``."""
    sentences = sent_tokenize(story)
    return len(sentences)

def average_word_length(story):
    """Return the mean length of the whitespace-separated tokens, or 0 if there are none."""
    tokens = story.split()
    if not tokens:
        return 0
    total_chars = sum(map(len, tokens))
    return total_chars / len(tokens)

def lexical_diversity(story):
    """Return the ratio of distinct tokens to all tokens (0 for an empty story)."""
    tokens = story.split()
    if not tokens:
        return 0
    distinct = set(tokens)
    return len(distinct) / len(tokens)

def punctuation_count(story):
    """Return how many characters of ``story`` appear in ``string.punctuation``."""
    marks = string.punctuation
    return len([ch for ch in story if ch in marks])

def average_sentence_length(story):
    """Return words per sentence: whitespace token count over sentence count (0 if no sentences)."""
    sentences = sent_tokenize(story)
    if not sentences:
        return 0
    return len(story.split()) / len(sentences)

def count_sensory_words(story, sensory_words):
    """Count tokens of ``story`` whose lowercase form is in ``sensory_words``."""
    lowered = (token.lower() for token in word_tokenize(story))
    return sum(1 for token in lowered if token in sensory_words)

def count_first_person_pronouns(story, pronouns):
    """Count tokens of ``story`` whose lowercase form is in ``pronouns``."""
    matches = [token for token in word_tokenize(story) if token.lower() in pronouns]
    return len(matches)

In [None]:
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import string
import nltk

# Ensure necessary NLTK downloads
nltk.download('punkt')  # tokenizer models used by sent_tokenize / word_tokenize
nltk.download('averaged_perceptron_tagger')  # tagger model used by nltk.pos_tag
nltk.download('wordnet')  # NOTE(review): no visible code uses WordNet -- confirm it is needed

# Initialize GPT-2 model and tokenizer
def init_model():
    """Load the pretrained ``'gpt2'`` language model and its tokenizer.

    Returns:
        ``(model, tokenizer)``, with the model switched to evaluation mode.
    """
    lm = GPT2LMHeadModel.from_pretrained('gpt2')
    tok = GPT2Tokenizer.from_pretrained('gpt2')
    lm.eval()
    return lm, tok

# Function to calculate probabilities
def calculate_probabilities(text, model, tokenizer, history_sizes):
    """Score every sentence of ``text`` with the language model.

    For each sentence and each ``h`` in ``history_sizes`` the model is run on
    a slice of that sentence's tokens: the first ``h`` tokens, or the whole
    sentence when ``h`` is 0 (an ``h`` beyond the sentence length also gives
    the whole sentence). The stored score is ``exp(-loss)`` with the slice
    used as its own labels.

    Sentences longer than 1024 tokens are skipped entirely.

    Args:
        text: Story text; split into sentences with ``sent_tokenize``.
        model: Model called as ``model(ids, labels=ids)`` whose output has a
            ``.loss`` tensor.
        tokenizer: Object whose ``encode(sentence, return_tensors='pt')``
            returns a 2-D tensor with tokens along dimension 1.
        history_sizes: Re-iterable collection of non-negative ints.

    Returns:
        Dict mapping ``'probability_history_size{h}'`` to a list holding one
        score per retained sentence. A score is NaN when the slice contains
        fewer than two tokens.
    """
    probabilities = {f'probability_history_size{h}': [] for h in history_sizes}
    for sentence in sent_tokenize(text):
        tokens = tokenizer.encode(sentence, return_tensors='pt')
        n_tokens = tokens.size(1)
        if n_tokens > 1024:  # GPT-2's maximum context size
            continue  # Skip this sentence

        for h in history_sizes:
            # Adjust the context window for each history size
            context_size = min(h, n_tokens)
            context = tokens[:, :context_size] if context_size > 0 else tokens

            if context.size(1) < 2:
                # A one-token slice has no next-token target, so the LM loss
                # is undefined; record NaN without a wasted forward pass.
                context_probability = float('nan')
            else:
                with torch.no_grad():
                    loss = model(context, labels=context).loss
                context_probability = torch.exp(-loss).item()

            probabilities[f'probability_history_size{h}'].append(context_probability)
    return probabilities
    

# Function to add probabilities to DataFrame
def add_probabilities_to_df(df, index, probabilities):
    """Store the per-story mean of each score list in row ``index`` of ``df``.

    NaN scores are left out of the mean so that one unscorable sentence does
    not turn the whole story's value into NaN. When no usable score remains,
    NaN is stored (a float, so the column stays numeric).

    Args:
        df: DataFrame to update in place; columns are created on demand.
        index: Row label to write to.
        probabilities: Dict mapping column name to a list of float scores.

    Returns:
        The same ``df`` object.
    """
    import math

    for key, value in probabilities.items():
        scored = [p for p in value if not math.isnan(p)]
        df.at[index, key] = sum(scored) / len(scored) if scored else float('nan')
    return df

# Function to calculate Sequentiality scores
def calculate_sequentiality(df, history_sizes):
    """Add a ``Sequentiality_{h}`` column for every non-zero ``h``.

    Each new column is ``probability_history_size{h}`` minus
    ``probability_history_size0``. ``df`` is modified in place and returned.
    """
    for h in (size for size in history_sizes if size != 0):
        with_history = df[f'probability_history_size{h}']
        df[f'Sequentiality_{h}'] = with_history - df['probability_history_size0']
    return df

# Function to extract linguistic features
def extract_linguistic_features(df):
    """Add eight lexical feature columns computed from ``df['story']``.

    Columns, in creation order: ``word_count``, ``sentence_count``,
    ``avg_word_length``, ``lexical_diversity``, ``punctuation_count``,
    ``sensory_word_count``, ``first_person_pronoun_count`` and
    ``past_tense_verb_count``. ``df`` is modified in place and returned.

    Vocabulary counts work on whitespace-separated tokens, so a word with
    punctuation attached (e.g. ``"see,"``) is not counted.
    """
    sensory_vocab = {'see', 'hear', 'touch', 'taste', 'smell', 'sight', 'sound', 'texture', 'aroma', 'flavor'}
    pronoun_vocab = {'i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours'}

    def _count_in(text, vocab):
        # Case-insensitive for every vocabulary; the sensory count used to
        # compare without lowercasing, unlike the pronoun count.
        return sum(word.lower() in vocab for word in text.split())

    df['word_count'] = df['story'].apply(lambda x: len(x.split()))
    df['sentence_count'] = df['story'].apply(lambda x: len(sent_tokenize(x)))
    df['avg_word_length'] = df['story'].apply(lambda x: np.mean([len(word) for word in x.split()]) if x.split() else 0)
    df['lexical_diversity'] = df['story'].apply(lambda x: len(set(x.split())) / len(x.split()) if x.split() else 0)
    df['punctuation_count'] = df['story'].apply(lambda x: sum(1 for char in x if char in string.punctuation))
    df['sensory_word_count'] = df['story'].apply(lambda x: _count_in(x, sensory_vocab))
    df['first_person_pronoun_count'] = df['story'].apply(lambda x: _count_in(x, pronoun_vocab))
    # Only simple past (tags starting with 'VBD') is counted here.
    df['past_tense_verb_count'] = df['story'].apply(lambda x: sum(tag.startswith('VBD') for word, tag in nltk.pos_tag(nltk.word_tokenize(x))))
    return df

# Function to train the classification model and validate it
def train_and_validate(X_train, X_val, y_train, y_val):
    """Fit a random forest on the training split and return validation accuracy.

    Missing values in both feature frames are replaced with 0 before use.
    """
    forest = RandomForestClassifier(random_state=42)
    forest.fit(X_train.fillna(0), y_train)
    predictions = forest.predict(X_val.fillna(0))
    return accuracy_score(y_val, predictions)

# Main function to run the entire pipeline
def run_pipeline():
    """Run the whole experiment and return the validation accuracy.

    Reads the first 3500 rows of ``hippoCorpusV2.csv``, scores each story with
    GPT-2, derives the Sequentiality and lexical feature columns, and trains a
    random forest on an 80/20 split using all eight lexical features.

    Returns:
        Accuracy of the classifier on the held-out 20% of rows.
    """
    # Load the dataset
    df = pd.read_csv("hippoCorpusV2.csv").head(3500)

    # Initialize model
    model, tokenizer = init_model()

    # Process each story and calculate probabilities and Sequentiality scores
    history_sizes = [0, 1, 2, 3, 4, 5]
    for index, row in df.iterrows():
        probabilities = calculate_probabilities(row['story'], model, tokenizer, history_sizes)
        df = add_probabilities_to_df(df, index, probabilities)
    df = calculate_sequentiality(df, history_sizes)

    # Extract linguistic features
    df = extract_linguistic_features(df)

    # Prepare the validation set. The last three lexical columns were computed
    # by extract_linguistic_features but previously never given to the model.
    feature_cols = ['word_count', 'sentence_count', 'avg_word_length', 'lexical_diversity', 'punctuation_count',
                    'sensory_word_count', 'first_person_pronoun_count', 'past_tense_verb_count'] + \
                   [f'probability_history_size{i}' for i in history_sizes] + \
                   [f'Sequentiality_{i}' for i in history_sizes if i != 0]
    X = df[feature_cols]
    y = df['memType']  # Assuming 'memType' is the column indicating recalled or imagined
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train and validate the classification model; return the score so it is
    # not lost when this function's locals go out of scope.
    return train_and_validate(X_train, X_val, y_train, y_val)


# Execute the pipeline; keep the score at module scope for the reporting cell
if __name__ == "__main__":
    accuracy = run_pipeline()

In [None]:
# Report the validation accuracy. Relies on a module-level `accuracy`
# variable having been assigned by an earlier cell.
print(f"Validation Accuracy: {accuracy}")

## Classifier model with 10 Lexical Features

In [None]:
def count_words(story):
    """Return the number of whitespace-separated tokens in ``story``."""
    tokens = story.split()
    return len(tokens)

def count_sentences(story):
    """Return how many sentences ``sent_tokenize`` finds in ``story``."""
    sentences = sent_tokenize(story)
    return len(sentences)

def average_word_length(story):
    """Return the mean length of the whitespace-separated tokens, or 0 if there are none."""
    tokens = story.split()
    if not tokens:
        return 0
    total_chars = sum(map(len, tokens))
    return total_chars / len(tokens)

def lexical_diversity(story):
    """Return the ratio of distinct tokens to all tokens (0 for an empty story)."""
    tokens = story.split()
    if not tokens:
        return 0
    distinct = set(tokens)
    return len(distinct) / len(tokens)

def punctuation_count(story):
    """Return how many characters of ``story`` appear in ``string.punctuation``."""
    marks = string.punctuation
    return len([ch for ch in story if ch in marks])

def average_sentence_length(story):
    """Return words per sentence: whitespace token count over sentence count (0 if no sentences)."""
    sentences = sent_tokenize(story)
    if not sentences:
        return 0
    return len(story.split()) / len(sentences)

def count_sensory_words(story, sensory_words):
    """Count tokens of ``story`` whose lowercase form is in ``sensory_words``."""
    lowered = (token.lower() for token in word_tokenize(story))
    return sum(1 for token in lowered if token in sensory_words)

def count_first_person_pronouns(story, pronouns):
    """Count tokens of ``story`` whose lowercase form is in ``pronouns``."""
    matches = [token for token in word_tokenize(story) if token.lower() in pronouns]
    return len(matches)

def count_emotion_sentences(stories):
    """Count sentences whose TextBlob sentiment polarity is non-zero.

    Despite the parameter name, ``stories`` is one text; it is split into
    sentences with ``sent_tokenize``.
    """
    emotional = 0
    for sentence in sent_tokenize(stories):
        if TextBlob(sentence).sentiment.polarity != 0:
            emotional += 1
    return emotional

def count_dialogue_tags(story, tags):
    """Count verb tokens whose lowercase form is in ``tags``.

    A token counts only if its POS tag starts with ``'VB'`` and its lowercase
    text is itself an element of ``tags``.
    """
    hits = 0
    for word, tag in pos_tag(word_tokenize(story)):
        if tag.startswith('VB') and word.lower() in tags:
            hits += 1
    return hits

def count_past_tense_verbs(story):
    """Count tokens tagged ``VBD`` or ``VBN`` by ``pos_tag``."""
    past_tags = ['VBD', 'VBN']
    tagged = pos_tag(word_tokenize(story))
    return len([tag for _, tag in tagged if tag in past_tags])

In [None]:
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import string
import nltk

# Ensure necessary NLTK downloads
nltk.download('punkt')  # tokenizer models used by sent_tokenize / word_tokenize
nltk.download('averaged_perceptron_tagger')  # tagger model used by nltk.pos_tag
nltk.download('wordnet')  # NOTE(review): no visible code uses WordNet -- confirm it is needed

# Initialize GPT-2 model and tokenizer
def init_model():
    """Load the pretrained ``'gpt2'`` language model and its tokenizer.

    Returns:
        ``(model, tokenizer)``, with the model switched to evaluation mode.
    """
    lm = GPT2LMHeadModel.from_pretrained('gpt2')
    tok = GPT2Tokenizer.from_pretrained('gpt2')
    lm.eval()
    return lm, tok

# Function to calculate probabilities
def calculate_probabilities(text, model, tokenizer, history_sizes):
    """Score every sentence of ``text`` with the language model.

    For each sentence and each ``h`` in ``history_sizes`` the model is run on
    a slice of that sentence's tokens: the first ``h`` tokens, or the whole
    sentence when ``h`` is 0 (an ``h`` beyond the sentence length also gives
    the whole sentence). The stored score is ``exp(-loss)`` with the slice
    used as its own labels.

    Sentences longer than 1024 tokens are skipped entirely.

    Args:
        text: Story text; split into sentences with ``sent_tokenize``.
        model: Model called as ``model(ids, labels=ids)`` whose output has a
            ``.loss`` tensor.
        tokenizer: Object whose ``encode(sentence, return_tensors='pt')``
            returns a 2-D tensor with tokens along dimension 1.
        history_sizes: Re-iterable collection of non-negative ints.

    Returns:
        Dict mapping ``'probability_history_size{h}'`` to a list holding one
        score per retained sentence. A score is NaN when the slice contains
        fewer than two tokens.
    """
    probabilities = {f'probability_history_size{h}': [] for h in history_sizes}
    for sentence in sent_tokenize(text):
        tokens = tokenizer.encode(sentence, return_tensors='pt')
        n_tokens = tokens.size(1)
        if n_tokens > 1024:  # GPT-2's maximum context size
            continue  # Skip this sentence

        for h in history_sizes:
            # Adjust the context window for each history size
            context_size = min(h, n_tokens)
            context = tokens[:, :context_size] if context_size > 0 else tokens

            if context.size(1) < 2:
                # A one-token slice has no next-token target, so the LM loss
                # is undefined; record NaN without a wasted forward pass.
                context_probability = float('nan')
            else:
                with torch.no_grad():
                    loss = model(context, labels=context).loss
                context_probability = torch.exp(-loss).item()

            probabilities[f'probability_history_size{h}'].append(context_probability)
    return probabilities
    

# Function to add probabilities to DataFrame
def add_probabilities_to_df(df, index, probabilities):
    """Store the per-story mean of each score list in row ``index`` of ``df``.

    NaN scores are left out of the mean so that one unscorable sentence does
    not turn the whole story's value into NaN. When no usable score remains,
    NaN is stored (a float, so the column stays numeric).

    Args:
        df: DataFrame to update in place; columns are created on demand.
        index: Row label to write to.
        probabilities: Dict mapping column name to a list of float scores.

    Returns:
        The same ``df`` object.
    """
    import math

    for key, value in probabilities.items():
        scored = [p for p in value if not math.isnan(p)]
        df.at[index, key] = sum(scored) / len(scored) if scored else float('nan')
    return df

# Function to calculate Sequentiality scores
def calculate_sequentiality(df, history_sizes):
    """Add a ``Sequentiality_{h}`` column for every non-zero ``h``.

    Each new column is ``probability_history_size{h}`` minus
    ``probability_history_size0``. ``df`` is modified in place and returned.
    """
    for h in (size for size in history_sizes if size != 0):
        with_history = df[f'probability_history_size{h}']
        df[f'Sequentiality_{h}'] = with_history - df['probability_history_size0']
    return df

# Function to extract linguistic features
def extract_linguistic_features(df):
    """Add ten lexical feature columns computed from ``df['story']``.

    Columns, in creation order: ``punctuation_count``, ``sensory_word_count``,
    ``first_person_pronoun_count``, ``word_count``, ``sentence_count``,
    ``past_tense_verb_count``, ``emotion_word_count``, ``dialogue_tag_count``,
    ``avg_word_length`` and ``lexical_diversity``. ``df`` is modified in place
    and returned.

    Vocabulary counts work on whitespace-separated tokens, so a word with
    punctuation attached (e.g. ``"said,"``) is not counted.
    """
    sensory_vocab = {'see', 'hear', 'touch', 'taste', 'smell', 'sight', 'sound', 'texture', 'aroma', 'flavor'}
    pronoun_vocab = {'i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours'}
    emotion_vocab = {'happy', 'sad', 'angry', 'joyful', 'depressed', 'excited', 'fearful', 'anxious', 'content', 'disappointed'}
    dialogue_vocab = {'said', 'asked', 'replied', 'shouted', 'whispered', 'murmured', 'screamed', 'yelled', 'muttered', 'uttered', 'exclaimed'}

    def _count_in(text, vocab):
        # Case-insensitive for every vocabulary; the sensory count used to
        # compare without lowercasing, unlike the other three.
        return sum(word.lower() in vocab for word in text.split())

    df['punctuation_count'] = df['story'].apply(lambda x: sum(1 for char in x if char in string.punctuation))
    df['sensory_word_count'] = df['story'].apply(lambda x: _count_in(x, sensory_vocab))
    df['first_person_pronoun_count'] = df['story'].apply(lambda x: _count_in(x, pronoun_vocab))
    df['word_count'] = df['story'].apply(lambda x: len(x.split()))
    df['sentence_count'] = df['story'].apply(lambda x: len(sent_tokenize(x)))
    # Only simple past (tags starting with 'VBD') is counted here.
    df['past_tense_verb_count'] = df['story'].apply(lambda x: sum(tag.startswith('VBD') for word, tag in nltk.pos_tag(nltk.word_tokenize(x))))
    df['emotion_word_count'] = df['story'].apply(lambda x: _count_in(x, emotion_vocab))
    df['dialogue_tag_count'] = df['story'].apply(lambda x: _count_in(x, dialogue_vocab))
    df['avg_word_length'] = df['story'].apply(lambda x: np.mean([len(word) for word in x.split()]) if x.split() else 0)
    df['lexical_diversity'] = df['story'].apply(lambda x: len(set(x.split())) / len(x.split()) if x.split() else 0)

    return df

# Function to train the classification model and validate it
def train_and_validate(X_train, X_val, y_train, y_val):
    """Fit a random forest on the training split and return validation accuracy.

    Missing values in both feature frames are replaced with 0 before use.
    """
    forest = RandomForestClassifier(random_state=42)
    forest.fit(X_train.fillna(0), y_train)
    predictions = forest.predict(X_val.fillna(0))
    return accuracy_score(y_val, predictions)

# Main function to run the entire pipeline
def run_pipeline():
    """Run the whole experiment and return the validation accuracy.

    Reads the first 3500 rows of ``hippoCorpusV2.csv``, scores each story with
    GPT-2, derives the Sequentiality and lexical feature columns, and trains a
    random forest on an 80/20 split using all ten lexical features.

    Returns:
        Accuracy of the classifier on the held-out 20% of rows.
    """
    # Load the dataset
    df = pd.read_csv("hippoCorpusV2.csv").head(3500)

    # Initialize model
    model, tokenizer = init_model()

    # Process each story and calculate probabilities and Sequentiality scores
    history_sizes = [0, 1, 2, 3, 4, 5]
    for index, row in df.iterrows():
        probabilities = calculate_probabilities(row['story'], model, tokenizer, history_sizes)
        df = add_probabilities_to_df(df, index, probabilities)
    df = calculate_sequentiality(df, history_sizes)

    # Extract linguistic features
    df = extract_linguistic_features(df)

    # Prepare the validation set. 'sensory_word_count' and 'dialogue_tag_count'
    # were computed by extract_linguistic_features but previously left out.
    feature_cols = ['word_count', 'sentence_count', 'avg_word_length', 'lexical_diversity', 'punctuation_count',
                    'past_tense_verb_count', 'emotion_word_count', 'first_person_pronoun_count',
                    'sensory_word_count', 'dialogue_tag_count'] + \
                   [f'probability_history_size{i}' for i in history_sizes] + \
                   [f'Sequentiality_{i}' for i in history_sizes if i != 0]
    X = df[feature_cols]
    y = df['memType']  # Assuming 'memType' is the column indicating recalled or imagined
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train and validate the classification model; return the score so it is
    # not lost when this function's locals go out of scope.
    return train_and_validate(X_train, X_val, y_train, y_val)


# Execute the pipeline; keep the score at module scope for later reporting
if __name__ == "__main__":
    accuracy = run_pipeline()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
def load_data(filepath):
    """Read the CSV at ``filepath`` into a DataFrame and return it."""
    frame = pd.read_csv(filepath)
    return frame

def preprocess_data(dataframe, feature_cols, target, class_map):
    """Select, clean and encode the modelling columns.

    Args:
        dataframe: Source DataFrame; it is not modified.
        feature_cols: List of feature column names to keep.
        target: Name of the label column.
        class_map: Dict mapping raw label values to encoded classes.

    Returns:
        New DataFrame holding ``feature_cols`` with missing values replaced by
        the column median, and ``target`` encoded through ``class_map``. Rows
        whose label is not a key of ``class_map`` are dropped.
    """
    # Work on an explicit copy: assigning into a column slice of the caller's
    # frame triggers SettingWithCopyWarning and may not stick.
    features = dataframe[feature_cols + [target]].copy()

    # Unmapped labels would become NaN targets after .map() and break the
    # stratified split and the classifier, so drop those rows up front.
    features = features[features[target].isin(list(class_map))].copy()

    # NOTE(review): medians are fitted on all retained rows, i.e. before the
    # train/validation/test split performed by the caller.
    imputer = SimpleImputer(strategy='median')
    features[feature_cols] = imputer.fit_transform(features[feature_cols])
    features[target] = features[target].map(class_map)
    return features

def split_data(features, target, test_size, stratify, random_state):
    """Split the data into train, validation and test parts.

    A ``test_size`` share is first held out (stratified by ``stratify``); that
    hold-out is then halved, stratified by its own labels, into validation and
    test sets. Both splits use ``random_state``.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, target,
        test_size=test_size, stratify=stratify, random_state=random_state,
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout,
        test_size=0.5, stratify=y_holdout, random_state=random_state,
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

def standardize_and_poly_transform(X_train, X_val, degree):
    """Standardize the features, then expand them with polynomial terms.

    Both transformers are fitted on ``X_train`` only and then applied to
    ``X_val``. The polynomial expansion omits the bias column.

    Returns:
        ``(X_train_poly, X_val_poly)``.
    """
    train, val = X_train, X_val
    steps = (
        StandardScaler(),
        PolynomialFeatures(degree=degree, include_bias=False),
    )
    for step in steps:
        train = step.fit_transform(train)
        val = step.transform(val)
    return train, val

def perform_grid_search(X_train, y_train, param_grid, cv_splits, scoring_method):
    """Grid-search a random forest and return the best fitted estimator.

    The forest uses ``random_state=42``; candidates from ``param_grid`` are
    compared with ``StratifiedKFold(cv_splits)`` under ``scoring_method``.
    """
    folds = StratifiedKFold(cv_splits)
    base_forest = RandomForestClassifier(random_state=42)
    search = GridSearchCV(base_forest, param_grid, cv=folds, scoring=scoring_method)
    search.fit(X_train, y_train)
    return search.best_estimator_

def evaluate_model(model, X_val, y_val):
    """Return the accuracy of ``model``'s predictions for ``X_val`` against ``y_val``."""
    return accuracy_score(y_val, model.predict(X_val))

# Define constants and parameters
DATA_FILEPATH = "hippoCorpusV2.csv"
# Columns fed to the model. NOTE(review): probability_history_size1 and
# Sequentiality_1 are not listed here, unlike the other history sizes.
FEATURE_COLUMNS = ["probability_history_size0", "probability_history_size2","probability_history_size3", "probability_history_size4", "probability_history_size5","Sequentiality_2", "Sequentiality_3", "Sequentiality_4", "Sequentiality_5","word_count", "sentence_count", "avg_word_length", "lexical_diversity","first_person_pronoun_count", "past_tense_verb_count", "emotion_word_count","dialogue_tag_count","punctuation_count", "avg_sentence_length", "sensory_word_count"]
TARGET_COLUMN = 'memType'
# Rows are expected to carry one of these two labels
CLASS_MAPPING = {'imagined': 0, 'recalled': 1}
# Share held out by split_data's first split; split_data then halves it into
# validation and test sets
TEST_SIZE = 0.4
RANDOM_STATE = 42
POLY_DEGREE = 2
# Random-forest hyperparameters tried by perform_grid_search
PARAM_GRID = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
CV_SPLITS = 5
SCORING_METHOD = 'accuracy'

# Main script execution
if __name__ == "__main__":
    # Data loading and preprocessing
    # NOTE(review): DATA_FILEPATH is the same file run_pipeline() reads as raw
    # input, and no visible code writes the computed feature columns back to
    # disk -- confirm the CSV really contains every name in FEATURE_COLUMNS.
    story_data = load_data(DATA_FILEPATH)
    processed_data = preprocess_data(story_data, FEATURE_COLUMNS, TARGET_COLUMN, CLASS_MAPPING)
    
    # Data splitting (the test portion returned by split_data is discarded here)
    X_train, X_val, _, y_train, y_val, _ = split_data(processed_data[FEATURE_COLUMNS], processed_data[TARGET_COLUMN], TEST_SIZE, processed_data[TARGET_COLUMN], RANDOM_STATE)
    
    # Feature scaling and transformation
    X_train_transformed, X_val_transformed = standardize_and_poly_transform(X_train, X_val, POLY_DEGREE)
    
    # Model training and hyperparameter tuning
    best_model = perform_grid_search(X_train_transformed, y_train, PARAM_GRID, CV_SPLITS, SCORING_METHOD)
    
    # Model evaluation
    validation_accuracy = evaluate_model(best_model, X_val_transformed, y_val)
    print(f"Validation Accuracy: {validation_accuracy:.2f}")


In [None]:
# NOTE(review): the cell above stores its score in `validation_accuracy` and
# prints it itself; `accuracy` here must have been assigned at module level by
# an earlier cell.
print(f"Validation Accuracy: {accuracy}")