In [19]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk import ngrams
import re
import spacy
import pandas as pd
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [20]:
# Fetch the WordNet corpora that the lemmatizer needs
nltk.download('wordnet')
nltk.download('omw-1.4')

# Read the source CSV and discard its 'f' column in one chained expression
file_path = "konbert-output-ad705d9f.csv"  # Update with your actual file path
df = pd.read_csv(file_path).drop(columns=['f'])

# Shared stemmer / lemmatizer instances used by the text-normalisation code
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

# Function to apply stemming
def stem_text(text):
    """Return ``text`` with every whitespace-separated word Porter-stemmed.

    Args:
        text: A cell value. Non-missing values are converted with ``str``
            before being split on whitespace.

    Returns:
        The stemmed words joined by single spaces, or ``text`` itself when it
        is a missing value (``None`` / ``NaN``).
    """
    # str(NaN) is the literal word "nan"; passing missing cells through
    # untouched keeps them recognisable as missing for dropna()/notna().
    if pd.isna(text):
        return text
    return " ".join(stemmer.stem(word) for word in str(text).split())

# Function to apply lemmatization
def lemmatize_text(text):
    """Return ``text`` with every whitespace-separated word WordNet-lemmatized.

    Args:
        text: A cell value. Non-missing values are converted with ``str``
            before being split on whitespace.

    Returns:
        The lemmatized words joined by single spaces, or ``text`` itself when
        it is a missing value (``None`` / ``NaN``).
    """
    # Same guard as stem_text: never turn a missing cell into the word "nan".
    if pd.isna(text):
        return text
    return " ".join(lemmatizer.lemmatize(word) for word in str(text).split())

# Work on a copy so the frame loaded from disk stays untouched
text_columns = df.select_dtypes(include=['object']).columns
df_transformed = df.copy()

# Each object-dtype column is stemmed first and the result is then lemmatized
for col in text_columns:
    stemmed = df[col].apply(stem_text)
    df_transformed[col] = stemmed.apply(lemmatize_text)

# Write the transformed frame to disk (index column omitted) and confirm
df_transformed.to_csv("transformed_data.csv", index=False)
print("Transformed data saved as 'transformed_data.csv'")


Transformed data saved as 'transformed_data.csv'


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [21]:
# Initialize NLP tools
stemmer = PorterStemmer()
nlp = spacy.load("en_core_web_sm")
nltk.download(['punkt', 'wordnet', 'stopwords'])

# Load datasets
df1 = df_transformed.copy()                      # Dataset 1 (Action Verbs & Categories)
df2 = pd.read_csv('combined_dataset.csv')        # Dataset 2 (Questions & BT Levels)

# Assign unique IDs to each question
df2['Question_ID'] = range(1, len(df2) + 1)


def _normalize_verb(phrase):
    """Lemmatize and stem every word of ``phrase`` and re-join with spaces.

    Words are extracted with the ``\\w+`` pattern and each one is passed
    through spaCy's lemma and the Porter stemmer individually, which is the
    same per-word treatment the question text receives. Returns an empty
    string when ``phrase`` contains no word characters.
    """
    words = re.findall(r'\w+', phrase)
    return ' '.join(stemmer.stem(nlp(word)[0].lemma_) for word in words)


# Extract unique action verbs from df1
action_verbs = pd.unique(df1[['Knowledge', 'Comp.', 'Appl.', 'Analysis', 'Eval.', 'Create']].values.ravel())
action_verbs = [verb.strip().lower() for verb in action_verbs if pd.notna(verb)]
print(action_verbs)

# Normalise the whole phrase, not just its first token: otherwise a two-word
# verb such as 'break down' collapses to 'break'. Empty results are dropped and
# dict.fromkeys removes duplicates while keeping first-seen order, so a verb is
# never matched (and reported) twice for the same question.
normalized_verbs = (_normalize_verb(verb) for verb in action_verbs)
action_verbs = list(dict.fromkeys(verb for verb in normalized_verbs if verb))
print(action_verbs)

# Function to generate bigrams
def generate_bigrams(text):
    """Return the normalised word bigrams of ``text``.

    The text is lower-cased and split into ``\\w+`` words; each word is
    lemmatized with spaCy and then Porter-stemmed before consecutive pairs are
    joined with a single space.

    Args:
        text: The question text. Non-missing values are converted with ``str``.

    Returns:
        A list of "word1 word2" strings. The list is empty when ``text`` is a
        missing value (``None`` / ``NaN``) or contains fewer than two words.
    """
    # A missing cell arrives as NaN (a float), which has no .lower();
    # treat it as a question without any words instead of raising.
    if pd.isna(text):
        return []
    words = re.findall(r'\w+', str(text).lower())  # Tokenize words
    words = [stemmer.stem(nlp(word)[0].lemma_) for word in words]  # Apply stemming & lemmatization
    return [' '.join(bigram) for bigram in ngrams(words, 2)]

# Match action verbs with questions
matches = []
for _, row in df2.iterrows():
    question_id = row['Question_ID']
    question_text = row['QUESTION']
    question_bigrams = generate_bigrams(question_text)
    bt_level = row['BT LEVEL']

    # Prefix every bigram with a space so that " " + verb can only be found at
    # the start of a word. A plain substring test would also fire inside
    # unrelated words (e.g. 'act' inside 'fact', 'core' inside 'score').
    # Matching stays a prefix test rather than an exact one, so a verb stem
    # that is shorter than the question's stem is still recognised.
    padded_bigrams = [' ' + bigram for bigram in question_bigrams]

    for verb in action_verbs:
        needle = ' ' + verb
        if any(needle in bigram for bigram in padded_bigrams):
            matches.append((question_id, question_text, verb, bt_level))

# Create matches DataFrame
matches_df = pd.DataFrame(matches, columns=['Question_ID', 'Question', 'Matched Action Verb', 'BT Level'])

# Function to find category for a matched action verb
def find_category(verb):
    """Return the name of the first ``df1`` column whose values contain ``verb``.

    Every column of ``df1`` is inspected in order. Missing entries are
    dropped and the remaining values are compared as lower-cased strings.

    Args:
        verb: The action verb to look up.

    Returns:
        The matching column name, or ``None`` when no column contains the verb.

    NOTE(review): the comparison is an exact string equality, so a verb whose
    text differs from the value stored in ``df1`` is reported as ``None``.
    """
    def _column_has_verb(column):
        # Cast to str so that non-string cells can be lower-cased as well
        lowered_values = df1[column].dropna().astype(str).str.lower().tolist()
        return verb in lowered_values

    return next((column for column in df1.columns if _column_has_verb(column)), None)


# Look up the category column for every matched verb
matched_verbs = matches_df['Matched Action Verb']
matches_df['Category'] = matched_verbs.apply(find_category)

# Select and order the columns of the final table
output_columns = ['Question_ID', 'Question', 'Matched Action Verb', 'Category', 'BT Level']
final_df = matches_df[output_columns]

# Show the result
print(final_df)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['arrang', 'articul', 'act', 'analyz', 'apprais', 'choos', 'associ', 'adapt', 'argu', 'assembl', 'cite', 'character', 'appli', 'break', 'categor', 'copi', 'back/back up', 'break down', 'assess', 'defin', 'clarifi', 'calcul', 'attach', 'collect', 'describ', 'classifi', 'chang', 'combin', 'draw', 'compar', 'compil', 'duplic', 'contrast', 'conclud', 'compos', 'identifi', 'convert', 'complet', 'consid', 'construct', 'indic', 'defend', 'comput', 'core', 'creat', 'label', 'demonstr', 'correl', 'counsel', 'design', 'list', 'critic', 'develop', 'match', 'differenti', 'debat', 'critiqu', 'devi', 'discus', 'discov', 'deduc', 'estim', 'memor', 'distinguish', 'dramat', 'detect', 'decid', 'evalu', 'name', 'employ', 'diagnos', 'explain', 'order', 'experi', 'diagram', 'facilit', 'outlin', 'express', 'formul', 'quot', 'extend', 'gener', 'determin', 'read', 'extrapol', 'discrimin', 'recal', 'illustr', 'dissect', 'hypothes', 'recit', 'give', 'implement', 'improv', 'recogn', 'give exampl', 'interpret', '

In [22]:
# Write final_df to 'final_df.csv', omitting the DataFrame index column
final_df.to_csv("final_df.csv", index=False)