In [1]:
import os
import glob
import spacy
import pandas as pd
from gensim.models import LdaModel
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel

# spaCy pipeline used by preprocess_text for lemmas and the stop-word /
# alphabetic token flags.
nlp = spacy.load("pt_core_news_md")

# Directories searched for "*.txt" files; left empty here and must be filled in
# before the cell is run.
directories = [] # This variable should contain the subcorpus directories.

# Passed as the `passes` argument to every LdaModel trained below.
passes = 10  # for LDA

# Feature table that is read back in and extended with a "Topic" column.
csv_path = "all-features-structures-hl2c.csv"

def read_text_files_from_directories(dir_list):
    """Read every ``*.txt`` file found directly inside the given directories.

    Directories are processed in the order given; within a directory the files
    are visited in sorted path order so that the resulting document order (and
    anything derived from it, such as a seeded topic model) is the same on
    every machine.

    Args:
        dir_list: Iterable of directory paths to search (non-recursive).

    Returns:
        A ``(texts, filenames)`` tuple of two parallel lists: the decoded
        contents of each file and the matching base file names.
    """
    texts = []
    filenames = []
    for d in dir_list:
        # glob.glob returns matches in arbitrary, filesystem-dependent order,
        # so sort explicitly to make the output reproducible.
        for fpath in sorted(glob.glob(os.path.join(d, "*.txt"))):
            # errors='ignore' drops bytes that are not valid UTF-8 instead of raising.
            with open(fpath, 'r', encoding='utf-8', errors='ignore') as file:
                content = file.read()
            texts.append(content)
            filenames.append(os.path.basename(fpath))
    return texts, filenames

def preprocess_text(texts):
    """Turn raw texts into lists of lower-cased lemmas.

    Every text is run through the module-level spaCy pipeline ``nlp`` with the
    "ner" and "parser" components disabled. Tokens flagged as stop words and
    tokens that are not purely alphabetic are discarded.

    Args:
        texts: Iterable of raw text strings.

    Returns:
        A list holding one list of lemma strings per input text, in input order.
    """
    lemmatised_docs = []
    for parsed in nlp.pipe(texts, disable=["ner", "parser"]):
        kept_lemmas = []
        for tok in parsed:
            # Skip stop words as well as numbers, punctuation and other non-alphabetic tokens.
            if tok.is_stop or not tok.is_alpha:
                continue
            kept_lemmas.append(tok.lemma_.lower())
        lemmatised_docs.append(kept_lemmas)
    return lemmatised_docs


# Load the raw texts and reduce each one to a list of lemmas.
raw_texts, filenames = read_text_files_from_directories(directories)
processed_docs = preprocess_text(raw_texts)

# Build the vocabulary, then drop terms found in fewer than 5 documents or in
# more than 50% of all documents.
dictionary = Dictionary(processed_docs)
dictionary.filter_extremes(no_below=5, no_above=0.5)

# One bag-of-words vector per document, in the same order as `filenames`.
corpus = list(map(dictionary.doc2bow, processed_docs))

# Candidate numbers of topics to compare
topic_nums = range(2, 11)  # from 2 to 10 topics
coherence_scores = []
# Fitted model for every candidate k, kept so the winner can be reused as-is.
trained_models = {}

# Train one LDA model per candidate and score it with C_v coherence
for k in topic_nums:
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=k, passes=passes, random_state=42)

    # C_v coherence is computed from the tokenised texts, not from the bag-of-words corpus
    coherence_model = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    coherence_scores.append((k, coherence_score))
    trained_models[k] = lda_model

    print(f"Number of Topics: {k} \tCoherence Score (C_v): {coherence_score:.4f}")

# Pick the candidate with the highest coherence (max() keeps the first one on ties)
best_k, best_score = max(coherence_scores, key=lambda x: x[1])
print(f"\nBest number of topics: {best_k} with Coherence Score: {best_score:.4f}")

# Reuse the model already fitted for best_k. Training again with the same
# corpus, arguments and seed would only repeat the work, and reusing it ensures
# the reported coherence score belongs to exactly the model used below.
best_model = trained_models[best_k]

# Print the ten highest-weighted words of each topic in the best model
print("\nTopics for the best model:")
for idx, topic in best_model.print_topics(num_topics=best_k, num_words=10):
    print(f"Topic {idx}: {topic}")

# Assign each document its dominant (most probable) topic
doc_topics = []
for doc_bow in corpus:
    if not doc_bow:
        # Empty text, or every token was removed by the dictionary filter.
        # With no words to go on, the inferred distribution reflects only the
        # model's prior, so picking its maximum would label the document with
        # an arbitrary topic. Leave it unassigned instead.
        doc_topics.append(None)
        continue
    # minimum_probability=0.0 keeps every topic in the returned
    # list of (topic_id, probability) pairs
    topic_distribution = best_model.get_document_topics(doc_bow, minimum_probability=0.0)
    # Topic id of the pair with the highest probability
    dominant_topic = max(topic_distribution, key=lambda x: x[1])[0]
    doc_topics.append(dominant_topic)

# Now we have a list `doc_topics` with the topic assigned to each document
# and a list `filenames` that matches the index of doc_topics.


df = pd.read_csv(csv_path)
# Look-up table: base file name -> dominant topic of that document
filename_to_topic = dict(zip(filenames, doc_topics))

# Attach a topic to every row via its Text_Title; titles without an entry in
# the look-up table end up with a missing value.
df['Topic'] = df['Text_Title'].map(filename_to_topic)

# Move the "Topic" column so that it sits directly after "Proficiency"
col_order = df.columns.tolist()
prof_index = col_order.index('Proficiency')
col_order.remove('Topic')
col_order.insert(prof_index + 1, 'Topic')
df = df.loc[:, col_order]

# Write the extended table to disk
df.to_csv("features-structures-topics-hl2c.csv", index=False)
print("CSV updated with topic assignments.")


Number of Topics: 2 	Coherence Score (C_v): 0.3151
Number of Topics: 3 	Coherence Score (C_v): 0.3367
Number of Topics: 4 	Coherence Score (C_v): 0.3596
Number of Topics: 5 	Coherence Score (C_v): 0.3638
Number of Topics: 6 	Coherence Score (C_v): 0.3850
Number of Topics: 7 	Coherence Score (C_v): 0.3918
Number of Topics: 8 	Coherence Score (C_v): 0.3916
Number of Topics: 9 	Coherence Score (C_v): 0.4250
Number of Topics: 10 	Coherence Score (C_v): 0.4227

Best number of topics: 9 with Coherence Score: 0.4250

Topics for the best model:
Topic 0: 0.044*"dia" + 0.028*"ano" + 0.021*"ir" + 0.020*"ter" + 0.015*"amigo" + 0.014*"casa" + 0.013*"pai" + 0.013*"vida" + 0.012*"bom" + 0.011*"eu"
Topic 1: 0.040*"tecnologia" + 0.039*"biblioteca" + 0.030*"dia" + 0.022*"jovem" + 0.016*"computador" + 0.016*"rede" + 0.016*"achar" + 0.015*"uso" + 0.015*"escrever" + 0.014*"ir"
Topic 2: 0.101*"hotel" + 0.038*"ter" + 0.021*"ficar" + 0.016*"gostar" + 0.015*"comida" + 0.014*"tambem" + 0.012*"ser" + 0.011*"pequ

In [2]:
# Display the table now that the numeric "Topic" column has been added.
df

Unnamed: 0,Text_Title,Country,Proficiency,Topic,Cohesive Complexity Feature: Difficult Connectives per Token,Cohesive Complexity Feature: Easy Connectives per Token,Cohesive Complexity Feature: Mendes Additive Connectives per Token,Cohesive Complexity Feature: Mendes All Connectives per Token,Cohesive Complexity Feature: Mendes Causal Connectives per Token,Cohesive Complexity Feature: Mendes Concessive Connectives per Token,...,Grammatical Complexity: Particípio passado - forma irregular de particípio per verb tokens,Grammatical Complexity: Particípio passado - particípios irregulares per verb tokens,Grammatical Complexity: Particípio passado - particípios regulares per verb tokens,Grammatical Complexity: Pretérito imperfeito do conjuntivo - verbos irregulares per verb tokens,Grammatical Complexity: Pretérito imperfeito do conjuntivo - verbos regulares per verb tokens,Grammatical Complexity: Pretérito imperfeito do indicativo - verbos irregulares per verb tokens,Grammatical Complexity: Pretérito imperfeito do indicativo - verbos regulares per verb tokens,Grammatical Complexity: Pretérito mais-que-perfeito simples do indicativo per verb tokens,Grammatical Complexity: Pretérito perfeito composto do conjuntivo per verb tokens,Grammatical Complexity: Pretérito perfeito simples do indicativo - verbos irregulares per verb tokens
0,AND_20B1AA_0121.txt,AND,B1,,0.015385,0.069231,0.015385,0.092308,0.023077,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.076923,0.0,0.0,0.00000
1,AND_20B1AA_0810.txt,AND,B1,,0.034483,0.080460,0.034483,0.126437,0.022989,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.00000
2,AND_20B1AA_0867.txt,AND,B1,,0.000000,0.041667,0.031250,0.083333,0.010417,0.000000,...,0.0,0.000000,0.115385,0.000000,0.000000,0.0,0.038462,0.0,0.0,0.00000
3,AND_20B1AA_4710.txt,AND,B1,,0.016260,0.056911,0.016260,0.081301,0.016260,0.000000,...,0.0,0.000000,0.080000,0.000000,0.000000,0.0,0.040000,0.0,0.0,0.08000
4,AND_20B1AA_5817.txt,AND,B1,,0.006623,0.086093,0.033113,0.125828,0.006623,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521,UK_22C1AE_82X7.txt,UK,C1,5.0,0.005556,0.094444,0.061111,0.133333,0.005556,0.000000,...,0.0,0.048780,0.024390,0.024390,0.024390,0.0,0.024390,0.0,0.0,0.02439
522,UK_22C1AE_8423.txt,UK,C1,4.0,0.030120,0.096386,0.036145,0.108434,0.018072,0.006024,...,0.0,0.057143,0.142857,0.000000,0.028571,0.0,0.000000,0.0,0.0,0.00000
523,UK_22C1AE_8605.txt,UK,C1,8.0,0.017045,0.085227,0.022727,0.107955,0.011364,0.000000,...,0.0,0.000000,0.083333,0.027778,0.000000,0.0,0.000000,0.0,0.0,0.00000
524,UK_22C1AE_8714.txt,UK,C1,5.0,0.018405,0.073620,0.030675,0.079755,0.006135,0.000000,...,0.0,0.000000,0.083333,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.00000


In [3]:
# Human-readable label for each numeric topic id (0-8).
# NOTE(review): these labels presumably describe the topics of one specific
# 9-topic model run; confirm they still fit whenever the model is retrained.
topic_descriptions = {
    0: "Personal Life and Relationships",
    1: "Technology, Libraries, and Youth",
    2: "Travel and Accommodation",
    3: "Art, Tourism, and Cultural Activities",
    4: "Global and Virtual Realities",
    5: "Books, Culture, and Leisure",
    6: "Nature and Outdoor Photography",
    7: "Tablets, Education, and Everyday Tech",
    8: "Work, Projects, and Global Engagement"
}

# Replace numeric topics with descriptive labels.
# Values that already are one of the labels are kept as they are, so running
# this cell a second time does not turn the whole column into missing values
# (the labels themselves are not keys of `topic_descriptions`).
known_labels = set(topic_descriptions.values())
already_labelled = df["Topic"].where(df["Topic"].isin(known_labels))
mapped_labels = df["Topic"].map(topic_descriptions)
df["Topic"] = mapped_labels.where(mapped_labels.notna(), already_labelled)

# Coarser grouping: each descriptive label belongs to one general topic
general_topic_mapping = {
    "Personal Life and Relationships": "Personal, Cultural, and Recreational Life",
    "Books, Culture, and Leisure": "Personal, Cultural, and Recreational Life",
    "Art, Tourism, and Cultural Activities": "Personal, Cultural, and Recreational Life",
    "Travel and Accommodation": "Personal, Cultural, and Recreational Life",
    "Nature and Outdoor Photography": "Personal, Cultural, and Recreational Life",
    "Tablets, Education, and Everyday Tech": "Technology and Digital Environments",
    "Technology, Libraries, and Youth": "Technology and Digital Environments",
    "Global and Virtual Realities": "Technology and Digital Environments",
    "Work, Projects, and Global Engagement": "Work and Professional Engagement"
}

# Derive the general topic from the (now descriptive) "Topic" column;
# rows without a topic stay without a general topic.
df["General Topic"] = df["Topic"].map(general_topic_mapping)

# Reorder columns so that "General Topic" comes right after "Topic"
col_order = list(df.columns)
topic_index = col_order.index('Topic')
# Take "General Topic" out of its current position ...
col_order.remove('General Topic')
# ... and put it back immediately after "Topic"
col_order.insert(topic_index + 1, 'General Topic')
df = df[col_order]


In [4]:
# Display the table with the descriptive "Topic" and "General Topic" columns.
df

Unnamed: 0,Text_Title,Country,Proficiency,Topic,General Topic,Cohesive Complexity Feature: Difficult Connectives per Token,Cohesive Complexity Feature: Easy Connectives per Token,Cohesive Complexity Feature: Mendes Additive Connectives per Token,Cohesive Complexity Feature: Mendes All Connectives per Token,Cohesive Complexity Feature: Mendes Causal Connectives per Token,...,Grammatical Complexity: Particípio passado - forma irregular de particípio per verb tokens,Grammatical Complexity: Particípio passado - particípios irregulares per verb tokens,Grammatical Complexity: Particípio passado - particípios regulares per verb tokens,Grammatical Complexity: Pretérito imperfeito do conjuntivo - verbos irregulares per verb tokens,Grammatical Complexity: Pretérito imperfeito do conjuntivo - verbos regulares per verb tokens,Grammatical Complexity: Pretérito imperfeito do indicativo - verbos irregulares per verb tokens,Grammatical Complexity: Pretérito imperfeito do indicativo - verbos regulares per verb tokens,Grammatical Complexity: Pretérito mais-que-perfeito simples do indicativo per verb tokens,Grammatical Complexity: Pretérito perfeito composto do conjuntivo per verb tokens,Grammatical Complexity: Pretérito perfeito simples do indicativo - verbos irregulares per verb tokens
0,AND_20B1AA_0121.txt,AND,B1,,,0.015385,0.069231,0.015385,0.092308,0.023077,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.076923,0.0,0.0,0.00000
1,AND_20B1AA_0810.txt,AND,B1,,,0.034483,0.080460,0.034483,0.126437,0.022989,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.00000
2,AND_20B1AA_0867.txt,AND,B1,,,0.000000,0.041667,0.031250,0.083333,0.010417,...,0.0,0.000000,0.115385,0.000000,0.000000,0.0,0.038462,0.0,0.0,0.00000
3,AND_20B1AA_4710.txt,AND,B1,,,0.016260,0.056911,0.016260,0.081301,0.016260,...,0.0,0.000000,0.080000,0.000000,0.000000,0.0,0.040000,0.0,0.0,0.08000
4,AND_20B1AA_5817.txt,AND,B1,,,0.006623,0.086093,0.033113,0.125828,0.006623,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521,UK_22C1AE_82X7.txt,UK,C1,"Books, Culture, and Leisure","Personal, Cultural, and Recreational Life",0.005556,0.094444,0.061111,0.133333,0.005556,...,0.0,0.048780,0.024390,0.024390,0.024390,0.0,0.024390,0.0,0.0,0.02439
522,UK_22C1AE_8423.txt,UK,C1,Global and Virtual Realities,Technology and Digital Environments,0.030120,0.096386,0.036145,0.108434,0.018072,...,0.0,0.057143,0.142857,0.000000,0.028571,0.0,0.000000,0.0,0.0,0.00000
523,UK_22C1AE_8605.txt,UK,C1,"Work, Projects, and Global Engagement",Work and Professional Engagement,0.017045,0.085227,0.022727,0.107955,0.011364,...,0.0,0.000000,0.083333,0.027778,0.000000,0.0,0.000000,0.0,0.0,0.00000
524,UK_22C1AE_8714.txt,UK,C1,"Books, Culture, and Leisure","Personal, Cultural, and Recreational Life",0.018405,0.073620,0.030675,0.079755,0.006135,...,0.0,0.000000,0.083333,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.00000


In [8]:
# Save the labelled table. This is the same path used for the earlier export
# with numeric topic ids, so that file is replaced.
df.to_csv("features-structures-topics-hl2c.csv", index=False)

# Calculating Number of Texts and Average Number of Words

In [16]:
import os
import glob

# Directories to read text files from (left empty here; fill in before running).
directories = []

# Base file names to skip when reading: these files didn't contain any text.
files_to_exclude = {
    "DE_20B1BZ_4833.txt",
    "DE_21B2AE_1MX1.txt",
    "DE_21C1AE_5146.txt",
    "DE_22B1BA_FVJV.txt",
    "UK_21B1AZ_3994.txt",
    "UK_22B1BE_2259.txt",
    "UK_22B1BE_8732.txt"
}

def read_text_files_from_directories(dir_list, exclude_files):
    """Read the ``*.txt`` files of several directories and count them.

    Note: this redefines the one-argument function of the same name that
    appears earlier in the notebook.

    Args:
        dir_list: Iterable of directory paths to search (non-recursive).
        exclude_files: Container of base file names that must be skipped.

    Returns:
        A 4-tuple ``(texts, total_files, file_counts_by_dir, excluded_count)``:
        the decoded contents of every file that was read, how many files were
        read in total, a dict mapping each directory's own name to the number
        of files read from it, and how many files were skipped because their
        name is in ``exclude_files``.
    """
    texts = []
    total_files = 0
    file_counts_by_dir = {}
    excluded_count = 0

    for d in dir_list:
        # normpath strips a trailing separator, so "corpus/B1" and "corpus/B1/"
        # are both reported as "B1" (taking dirname first would report
        # "corpus" for the former).
        dir_name = os.path.basename(os.path.normpath(d))
        dir_file_count = 0

        # Sorted so that the order of `texts` does not depend on the filesystem.
        for fpath in sorted(glob.glob(os.path.join(d, "*.txt"))):
            if os.path.basename(fpath) in exclude_files:
                excluded_count += 1
                continue

            # errors='ignore' drops bytes that are not valid UTF-8 instead of raising.
            with open(fpath, 'r', encoding='utf-8', errors='ignore') as file:
                content = file.read()
            texts.append(content)
            dir_file_count += 1

        total_files += dir_file_count
        # Accumulate so that directories sharing a name do not overwrite each
        # other and the per-directory counts always add up to total_files.
        file_counts_by_dir[dir_name] = file_counts_by_dir.get(dir_name, 0) + dir_file_count

    return texts, total_files, file_counts_by_dir, excluded_count



# Load the texts, leaving out the files listed in files_to_exclude
raw_texts, total_text_count, files_per_dir, excluded_count = read_text_files_from_directories(
    directories, files_to_exclude)

# Number of whitespace-separated words in each text
word_counts = [len(words) for words in map(str.split, raw_texts)]

# Report how many texts were read and skipped, overall and per directory
print(f"Total number of texts processed: {total_text_count}")
print(f"Number of texts excluded: {excluded_count}")
print("\nNumber of texts by directory:")
for dir_name, count in files_per_dir.items():
    print(f"{dir_name}: {count} texts")

# Report the mean word count, unless nothing was read at all
if not word_counts:
    print("\nNo texts found in the specified directories.")
else:
    average_word_count = sum(word_counts) / len(word_counts)
    print(f"\nAverage Word Count: {average_word_count:.2f}")

Total number of texts processed: 472
Number of texts excluded: 7

Number of texts by directory:
DE_B1_TRANSCRIBED: 102 texts
DE_B2_TRANSCRIBED: 100 texts
DE_C1_TRANSCRIBED: 101 texts
UK_B1_TRANSCRIBED: 90 texts
UK_B2_TRANSCRIBED: 37 texts
UK_C1_TRANSCRIBED: 42 texts

Average Word Count: 162.03
