<a href="https://colab.research.google.com/github/Sambosis/TryAgain/blob/main/createdata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import spacy
import re
import pandas as pd
import numpy as np
import os
import json
import nltk
import warnings
import tqdm
from spacy import displacy
from collections import defaultdict
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from collections import defaultdict

def extract_verb_object_pairs(doc):
    """Collect ``(verb lemma, direct-object phrase)`` tuples from a parsed doc.

    Each token whose ``pos_`` contains ``"VERB"`` is inspected. When it has at
    least one child with dependency label ``dobj``, the subtree of the first
    such child is joined into a phrase (surrounding whitespace stripped) and
    paired with the verb's lemma. Verbs without a direct object are skipped.

    Args:
        doc: An iterable of spaCy-style tokens (typically a ``Doc``).

    Returns:
        A list of ``(lemma, object_phrase)`` tuples in token order.
    """
    pairs = []
    for tok in doc:
        if "VERB" not in tok.pos_:
            continue
        direct_objects = [kid for kid in tok.children if kid.dep_ == 'dobj']
        if not direct_objects:
            continue
        # Only the first direct object is considered.
        object_text = ''.join(
            part.text_with_ws for part in direct_objects[0].subtree
        )
        pairs.append((tok.lemma_, object_text.strip()))
    return pairs


def load_texts_from_directory(directory_path):
    """Read every ``.txt`` file found under *directory_path*, recursively.

    Files are decoded as UTF-8 instead of the platform's default encoding, and
    directories and files are visited in sorted order so the returned list
    (and anything concatenated from it) is reproducible across runs and
    filesystems.

    Args:
        directory_path: Root directory to walk.

    Returns:
        A list containing the full contents of each ``.txt`` file, one string
        per file. Files with any other extension are ignored.
    """
    all_texts = []
    for root, dirs, files in os.walk(directory_path):
        # Sorting ``dirs`` in place makes os.walk descend in a stable order.
        dirs.sort()
        for filename in sorted(files):
            if filename.endswith('.txt'):
                path = os.path.join(root, filename)
                with open(path, 'r', encoding='utf-8') as f:
                    all_texts.append(f.read())
    return all_texts

def sentiment_score(text):
    """Return the ``compound`` polarity score that ``sia`` assigns to *text*.

    Relies on the module-level ``sia`` analyzer having been created before
    the first call.
    """
    return sia.polarity_scores(text)['compound']



def preprocess_text(raw_text):
    """Normalise raw corpus text before further processing.

    The steps run in this order (the order matters):

    1. CRLF and LF line breaks are replaced by a single space each.
    2. A space is inserted between a lowercase letter and a following
       uppercase letter, to split words that were glued together.
    3. ``@mentions``, ``<...>`` tags and ``@`` followed by whitespace are
       removed.
    4. Tokenised, backslash-escaped contractions (a space, then an escaped
       apostrophe suffix such as ``'re`` or ``n't``) are re-attached to the
       preceding word.
    5. Remaining backslashes, forward slashes and digits are removed.

    Args:
        raw_text: The text to clean.

    Returns:
        The cleaned text. Runs of spaces left behind by removals are not
        collapsed.
    """
    # CRLF has to be handled before bare LF; otherwise the LF pass consumes
    # the "\n" and leaves a stray carriage return in the text.
    cleaned_text = raw_text.replace('\r\n', ' ').replace('\n', ' ')

    # Split concatenated words ("helloWorld" -> "hello World"). One pass is
    # enough: afterwards no lowercase letter is directly followed by an
    # uppercase one.
    cleaned_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', cleaned_text)

    # Strip mentions, markup tags and leftover "@ " fragments.
    cleaned_text = re.sub(r'@\w+', '', cleaned_text)
    cleaned_text = re.sub(r'<[^>]+>', '', cleaned_text)
    cleaned_text = re.sub(r'@\s', '', cleaned_text)

    # Re-attach contractions such as "do n\'t" -> "don't". Every pattern
    # requires a literal backslash, so this must happen BEFORE backslashes
    # are stripped below or nothing would ever match.
    contractions = {
        r" n\\'t": "n't",
        r" \\'re": "'re",
        r" \\'s": "'s",
        r" \\'d": "'d",
        r" \\'ll": "'ll",
        r" \\'t": "'t",
        r" \\'ve": "'ve",
        r" \\'m": "'m",
    }
    for contraction, replacement in contractions.items():
        cleaned_text = re.sub(contraction, replacement, cleaned_text)

    # Drop whatever backslashes, slashes and digits remain.
    cleaned_text = re.sub(r'\\', '', cleaned_text)
    cleaned_text = re.sub(r'/', '', cleaned_text)
    cleaned_text = re.sub(r'\d', '', cleaned_text)
    return cleaned_text

def extract_verb_phrases_spacy(doc):
    """Build "verb + direct object" phrases from a parsed doc.

    For every token whose ``pos_`` contains ``"VERB"`` and that has a child
    with dependency label ``dobj``, the verb's own text is concatenated with
    the text of the first such child's subtree. Verbs without a direct object
    produce no phrase.

    Args:
        doc: An iterable of spaCy-style tokens (typically a ``Doc``).

    Returns:
        A list of whitespace-stripped phrases in token order.
    """
    phrases = []
    for tok in doc:
        if "VERB" not in tok.pos_:
            continue
        direct_objects = [kid for kid in tok.children if kid.dep_ == 'dobj']
        if not direct_objects:
            continue
        # Verb surface form first, then the first direct object's subtree.
        object_text = ''.join(
            part.text_with_ws for part in direct_objects[0].subtree
        )
        phrases.append((tok.text_with_ws + object_text).strip())
    return phrases

def get_associated_verbs(direct_object, grouped_data):
    """List the verbs whose object collection contains *direct_object*.

    Args:
        direct_object: The object phrase to look up.
        grouped_data: Mapping of verb -> collection of object phrases.

    Returns:
        The matching verbs, in the mapping's iteration order. No frequency
        ranking is applied.
    """
    matching_verbs = []
    for verb, objs in grouped_data.items():
        if direct_object in objs:
            matching_verbs.append(verb)
    return matching_verbs

def extract_and_group_verb_phrases_spacy(doc):
    """Group direct-object phrases by the lemma of the verb governing them.

    For every token whose ``pos_`` contains ``"VERB"`` and that has a child
    with dependency label ``dobj``, the first such child's subtree is joined
    into a phrase and filed under the verb's lemma. Verbs without a direct
    object create no entry.

    Args:
        doc: An iterable of spaCy-style tokens (typically a ``Doc``).

    Returns:
        A ``defaultdict(list)`` mapping verb lemma -> list of stripped object
        phrases, in token order.
    """
    grouped = defaultdict(list)
    for tok in doc:
        if "VERB" not in tok.pos_:
            continue
        direct_objects = [kid for kid in tok.children if kid.dep_ == 'dobj']
        if not direct_objects:
            continue
        object_text = ''.join(
            part.text_with_ws for part in direct_objects[0].subtree
        )
        grouped[tok.lemma_].append(object_text.strip())
    return grouped

def split_text_into_chunks(text, max_length):
    """Split *text* on single spaces into chunks shorter than *max_length*.

    Words are packed greedily: a word joins the current chunk as long as the
    resulting chunk stays strictly shorter than ``max_length`` characters;
    otherwise the current chunk is closed and the word starts a new one.
    Words are never cut, so a single word of ``max_length`` characters or
    more becomes an oversized chunk of its own.

    Joining the returned chunks with ``' '`` reproduces *text*.

    Args:
        text: The text to split.
        max_length: Exclusive upper bound on chunk length, in characters.

    Returns:
        A list of chunk strings. It always has at least one element; for an
        empty *text* that element is the empty string.
    """
    chunks = []
    current_chunk = []
    # Length of the current chunk plus one separator per word it holds.
    current_length = 0
    for word in text.split(' '):
        if current_length + len(word) < max_length:
            current_chunk.append(word)
            current_length += len(word) + 1
            continue
        # Close the current chunk only if it holds something; otherwise an
        # over-long first word would yield a spurious empty chunk.
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        current_chunk = [word]
        current_length = len(word) + 1
    chunks.append(' '.join(current_chunk))
    return chunks

def process_chunk_and_extract_data(chunk):
    """Turn one text chunk into a DataFrame of verb/object sentence records.

    The chunk is parsed with the module-level ``nlp`` pipeline, its
    verb-object pairs are extracted, and the sentences matching those pairs
    (with their neighbours and sentiment scores) become the rows.

    Args:
        chunk: The text to analyse.

    Returns:
        A ``pandas.DataFrame`` built from the list of records returned by
        ``extract_sentences_with_verb_object_pairs``.
    """
    parsed = nlp(chunk)
    records = extract_sentences_with_verb_object_pairs(
        parsed,
        extract_verb_object_pairs(parsed),
    )
    return pd.DataFrame(records)


def extract_sentences_with_verb_object_pairs(doc, verb_obj_pairs):
    """Build one record per (sentence, verb-object pair) match.

    A pair matches a sentence when both its verb string and its object
    string occur as substrings of the sentence text. Each record holds the
    matching sentence, the sentences immediately before and after it (empty
    strings at the document edges), the pair itself, and the
    ``sentiment_score`` of each of the three sentences.

    NOTE(review): ``process_chunk_and_extract_data`` passes verb *lemmas*
    here (from ``extract_verb_object_pairs``), so an inflected verb form in
    the sentence may fail the substring test -- confirm this is intended.

    Args:
        doc: Parsed document exposing ``.sents``; each sentence exposes
            ``.text``.
        verb_obj_pairs: Iterable of ``(verb, obj)`` string pairs.

    Returns:
        A list of dicts with the keys "Preceding Sentence",
        "Action Sentence", "Following Sentence", "Verb", "Object",
        "Pre-Sentiment Score", "Action Sentiment Score" and
        "Post-Sentiment Score", ordered by sentence and then by pair.
    """
    sentences_with_vo = []
    sents_list = list(doc.sents)  # a list allows neighbour lookup by index
    last_idx = len(sents_list) - 1

    for idx, sent in enumerate(sents_list):
        action_sent = sent.text
        matches = [
            (verb, obj)
            for verb, obj in verb_obj_pairs
            if verb in action_sent and obj in action_sent
        ]
        if not matches:
            continue

        # Context and sentiment depend only on the sentence, not on the
        # pair, so they are computed once per sentence.
        preceding_sent = sents_list[idx - 1].text if idx > 0 else ''
        following_sent = sents_list[idx + 1].text if idx < last_idx else ''
        pre_sentiment = sentiment_score(preceding_sent)
        action_sentiment = sentiment_score(action_sent)
        post_sentiment = sentiment_score(following_sent)

        for verb, obj in matches:
            sentences_with_vo.append({
                "Preceding Sentence": preceding_sent.strip(),
                "Action Sentence": action_sent.strip(),
                "Following Sentence": following_sent.strip(),
                "Verb": verb,
                "Object": obj,
                "Pre-Sentiment Score": pre_sentiment,
                "Action Sentiment Score": action_sentiment,
                "Post-Sentiment Score": post_sentiment,
            })

    return sentences_with_vo



In [None]:
# Pipeline objects used as globals by the helper functions:
# `nlp` for parsing and `sia` for sentiment scoring.
nlp = spacy.load("en_core_web_sm")
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Read every .txt file under the data folder and merge them into one string.
texts = load_texts_from_directory('/content/drive/MyDrive/data')
raw_text = ''.join(texts)

# NOTE(review): this creates/truncates cleantext.txt but nothing is written
# to it. Confirm whether the text was meant to be saved here; otherwise this
# statement can be removed.
with open("cleantext.txt", "w") as file:
    pass


In [None]:
# Normalise the merged corpus (line breaks, @mentions, tags, slashes, digits)
# before it is split into chunks.
cleaned_text = preprocess_text(raw_text)

In [None]:
# Upper bound, in characters, for each chunk that is later parsed by `nlp`.
# NOTE(review): spaCy's `nlp.max_length` presumably defaults to 1,000,000
# characters -- confirm before raising this value.
MAX_LENGTH = 1000000  # Adjust this based on your needs
chunks = split_text_into_chunks(cleaned_text, MAX_LENGTH)

In [None]:
# One DataFrame per chunk, then a single table with a fresh 0..n-1 index.
all_dataframes = [process_chunk_and_extract_data(chunk) for chunk in chunks]

full_df = pd.concat(all_dataframes, ignore_index=True)
full_df.to_csv("action_states.csv")
# Trailing expression so the notebook renders the table.
full_df

In [None]:
# Also save the table under /content/drive/MyDrive (presumably the mounted
# Google Drive folder in Colab -- the drive must be mounted first).
full_df.to_csv("/content/drive/MyDrive/action_states.csv")