In [None]:
import pandas as pd
import spacy
from itertools import chain
# from distinct_n import distinct_n_sentence_level  

In [2]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

In [3]:
# Function to calculate sentence length
def sentence_length(text):
    """Return the number of whitespace-separated tokens in ``text``.

    Despite the name, the whole string is measured, not a single sentence.
    """
    words = text.split()
    return len(words)



In [None]:
# For general, overall comparison
# Read data.csv and walk its 'value' column, measuring each text.
df = pd.read_csv('/projects/humansVsLLMs/data/data.csv')
for text in df['value']:
    # NOTE(review): sentence_len is overwritten on every iteration and never
    # collected, so only the last text's length survives the loop.
    sentence_len = sentence_length(text)

In [None]:
# For gender wise comparison
# Builds `response_texts`, a flat list of strings taken from two columns of the
# demographics CSV, restricted to one GenderIdentity value.
path = '/projects/humansVsLLMs/data'
df = pd.read_csv(f"{path}/data_leaders_with_demographics_semantics.csv")  # Replace with your file
# Keep only rows whose GenderIdentity equals 'Female'; edit the literal to select another group.
df = df[df['GenderIdentity'] == 'Female']
# Non-null 'firstTaskGoal' entries, coerced to str.
response_texts_uniqueness = df['firstTaskGoal'].dropna().astype(str).to_list()
print(len(response_texts_uniqueness))
# Non-null 'addFirstRelGoal' entries, coerced to str.
response_texts_belongingness = df['addFirstRelGoal'].dropna().astype(str).to_list()
print(len(response_texts_belongingness))
# Concatenate both lists (belongingness first); NaNs were dropped per column,
# so the two lists may differ in length.
response_texts = response_texts_belongingness + response_texts_uniqueness
print(len(response_texts))

In [4]:
# Function to calculate Parts of Speech (POS) distribution
def pos_distribution(text):
    """Return the share of tokens in ``text`` that fall in each tracked POS class.

    The text is parsed with the module-level spaCy pipeline ``nlp`` and each
    token's coarse tag (``token.pos_``) is counted.

    Args:
        text: The string to analyse.

    Returns:
        dict mapping each of "NOUN", "VERB", "ADJ", "ADV", "PRON", "DET",
        "ADP", "CONJ", "NUM", "PUNCT" to ``count / total_tokens``. Tokens with
        any other tag still count towards ``total_tokens``, so the values need
        not sum to 1. Every value is 0 when the text yields no tokens.
    """
    tracked = ("NOUN", "VERB", "ADJ", "ADV", "PRON", "DET", "ADP", "CONJ", "NUM", "PUNCT")
    # spaCy pipelines label coordinating conjunctions "CCONJ" (Universal
    # Dependencies v2), so a bucket keyed only on the legacy "CONJ" tag would
    # always stay at zero. Fold CCONJ into the existing "CONJ" key so the
    # output columns keep their names.
    aliases = {"CCONJ": "CONJ"}

    doc = nlp(text)
    pos_counts = {pos: 0 for pos in tracked}
    total_tokens = len(doc)

    # Guard against division by zero for empty input.
    if total_tokens == 0:
        return pos_counts

    for token in doc:
        tag = aliases.get(token.pos_, token.pos_)
        if tag in pos_counts:
            pos_counts[tag] += 1

    return {pos: count / total_tokens for pos, count in pos_counts.items()}

In [5]:
# Function to calculate Words per Conversation Statement
def words_per_statement(text):
    """Average number of whitespace-separated words per sentence of ``text``.

    Sentences come from the module-level spaCy pipeline ``nlp`` (``doc.sents``).
    Returns 0 when the pipeline yields no sentences.
    """
    doc = nlp(text)
    word_counts = [len(span.text.split()) for span in doc.sents]
    if not word_counts:
        return 0
    return sum(word_counts) / len(word_counts)


In [6]:

# Process dataset
def process_text_data(file_path, column=None, texts=None):
    """Compute per-text syntactic features and return them as a DataFrame.

    Exactly one source of text is used:

    * ``column`` given: ``file_path`` is read with ``pd.read_csv``, rows with a
      missing value in ``column`` are dropped, and that column is analysed.
    * ``column`` is None: the strings in ``texts`` are analysed and
      ``file_path`` is ignored.

    Args:
        file_path: Path (or file-like object) of the CSV to read. Only used
            when ``column`` is not None.
        column: Name of the CSV column holding the texts, or None.
        texts: Iterable of strings to analyse when ``column`` is None.

    Returns:
        pandas.DataFrame with one row per text and the columns
        "Sentence Length", "Words per Statement", plus one column per key
        returned by ``pos_distribution``. Empty when there are no texts.

    Raises:
        ValueError: If both ``column`` and ``texts`` are None.
    """
    if column is not None:
        df = pd.read_csv(file_path)
        df = df.dropna(subset=[column])  # Remove rows where the text column is empty
        # Coerce to str so a cell that read_csv parsed as a number does not
        # break the str-based helpers below.
        texts = df[column].astype(str)
    elif texts is None:
        raise ValueError("process_text_data needs either `column` or `texts`")

    results = []
    for text in texts:
        sentence_len = sentence_length(text)
        pos_dist = pos_distribution(text)
        words_per_stmt = words_per_statement(text)

        results.append({
            "Sentence Length": sentence_len,
            "Words per Statement": words_per_stmt,
            **pos_dist  # Expanding POS distribution dictionary
        })

    # One row per analysed text.
    return pd.DataFrame(results)

In [None]:
if __name__ == "__main__":
      # Analyse the in-memory `response_texts` list (built in the gender-wise
      # cell) instead of a CSV column, then save the feature table.
      # This call requires a process_text_data that accepts the `texts` keyword
      # and builds its result from it when column is None.
      input_file = "/projects/humansVsLLMs/data/data.csv"  # Replace with actual file path
      result_df = process_text_data(input_file, column=None, texts=response_texts)
      result_df.to_csv("/projects/humansVsLLMs/results/text_analysis_results_new.csv", index=False)


# LLM-generated outputs

In [None]:
import pandas as pd
# Merge the four per-prompt response CSVs of one model into a single CSV.
# `model` is interpolated into both the input file names and the output file name.
model = 'gemini'
path = "/projects/humansVsLLMs/results/4-Dimensions-0Shot-GeneratedResponses"
# List of CSV file paths
csv_files = [f"{path}/generated_responses_Generated_Prompt_Uniqueness_{model}_zeroShot.csv",
             f"{path}/generated_responses_Generated_Prompt_Belongingness_{model}_zeroShot.csv",
             f"{path}/generated_responses_Generated_Prompt_Appreciation_{model}_zeroShot.csv", 
             f"{path}/generated_responses_Generated_Prompt_OrgEfforts_{model}_zeroShot.csv"]

# Initialize an empty list to store DataFrames
dataframes = []

# Read each CSV file and append to the list
for file in csv_files:
    df = pd.read_csv(file)  # Read CSV file
    dataframes.append(df)   # Append DataFrame to list

# Concatenate all DataFrames into a single DataFrame
# ignore_index=True renumbers rows 0..n-1 instead of repeating each file's own index.
merged_df = pd.concat(dataframes, ignore_index=True)

# Display the final DataFrame
print(merged_df.shape)

# Save as csv
merged_df.to_csv(f'/projects/humansVsLLMs/results/0-shot-generated-responses/{model}_generated_responses.csv', index=False)

In [None]:
if __name__ == "__main__":
    # Run the syntactic analysis on the 'Response' column of the merged CSV
    # for `model` and write the per-text feature table.
    model = 'gemini'
    input_file = f"/projects/humansVsLLMs/results/0-shot-generated-responses/{model}_generated_responses.csv"  # Replace with actual file path
    result_df = process_text_data(input_file, column='Response')
    result_df.to_csv(f"/projects/humansVsLLMs/results/syntactic_analysis/0-shot/{model}_text_analysis.csv", index=False)
    

# Saving Excel data to CSV

In [None]:
# Load the Excel workbook into a DataFrame and preview its first rows.
df = pd.read_excel('/projects/humansVsLLMs/data/Oct22_Jap_goals.xlsx')
df.head()

In [None]:
# (rows, columns) of the loaded workbook.
df.shape

In [None]:
# Keep only the five free-text goal columns; these are the ones stacked into a
# single 'value' column further down.
cols = ['firstTaskGoal', 'addTaskGoals', 'addFirstRelGoal', 'addRelGoals', 'addRelGoalsLater']
df_new = df[cols]
df_new.shape

In [None]:
# Expose the row number as an explicit 'index' column to use as the melt id.
# Guarded so the cell is idempotent: on a fresh kernel reset_index() creates
# 'index' (not 'level_0'), so an unconditional drop of 'level_0' raised
# KeyError, and re-running the cell stacked an extra 'level_0' column.
if 'index' not in df_new.columns:
    df_new = df_new.reset_index()
# Remove a leftover 'level_0' from an earlier double reset, if present.
df_new = df_new.drop(columns=['level_0'], errors='ignore')

# Stack all goal columns into one 'value' Series and discard missing answers.
df_newer = df_new.melt(id_vars=['index']).value.dropna()
print(df_newer.shape)
df_newer.head()

In [22]:
# Write the stacked texts without the index. df_newer is the 'value' column of
# the melt, so the CSV has a single 'value' header, which is the column the
# overall-comparison cell reads back from this same path.
df_newer.to_csv('/projects/humansVsLLMs/data/data.csv', index=False)