# Commonsense statements cleaning & preprocessing

## Libraries and setup

Run the following cell to import the necessary libraries and set up the environment.


In [89]:
# Data Processing
import pandas as pd
import numpy as np
import os
import openai
import csv


# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split

from scipy.stats import randint
from statistics import mode

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz


# Pull the OpenAI credentials from environment variables so that no secret is
# stored in the notebook; each lookup yields None when its variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")
openai.organization = os.environ.get("OPENAI_ORGANIZATION")

## Looking into data and preprocessing

We will import the cleaned statements and take a first look at the data. We will also preprocess the data to get it ready for the model.

In [14]:
# Read the two input tables from the working directory.
statement_properties_df = pd.read_csv('statement_properties.csv')
cleaned_statements_df = pd.read_csv('statements.csv')

## Getting the embeddings for the fixed statements via OpenAI API

Run the next cell to get the embeddings from the OpenAI API. This will take a while (roughly 20 minutes). The statements and their embeddings will be saved to `embedded_statements.pkl`, which the cell after it loads.

In [None]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector the OpenAI API produces for ``text``.

    Newlines in ``text`` are replaced with spaces before the request is sent.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``'embedding'`` entry of the first item in the response's ``'data'``.
    """
    single_line = text.replace("\n", " ")
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response['data'][0]
    return first_item['embedding']


# Embed every fixed statement (one get_embedding call per row), then persist
# the frame so the slow step does not have to be repeated.
cleaned_statements_df['embeddings'] = cleaned_statements_df['fixed statement'].apply(
    get_embedding,
    model='text-embedding-ada-002',
)
cleaned_statements_df.to_pickle('embedded_statements.pkl')

In [39]:
# Load the previously saved frame from the pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load a file you produced yourself.
embedded_statements = pd.read_pickle('embedded_statements.pkl')

In [93]:
# Join the property labels to the embedded statements on the row index, then
# turn each embedding into a NumPy array.
# NOTE(review): an index join assumes both frames list the statements in the
# same row order — confirm.
merged_df = pd.merge(
    statement_properties_df,
    embedded_statements,
    left_index=True,
    right_index=True,
)
merged_df['embeddings'] = merged_df['embeddings'].apply(np.array)
merged_df.head(5)

Unnamed: 0,statement_number,statement_x,behavior,everyday,figure_of_speech,judgment,opinion,reasoning,category,elicitation,statement_y,fixed statement,embeddings
0,1,1 plus 1 is 2,0,1,0,0,0,1,Mathematics and logic,category response,1 plus 1 is 2,1 plus 1 equals 2.,"[0.030643712729215622, -0.004393580369651318, ..."
1,2,5 is alot bigger than 1,0,0,0,0,0,0,Mathematics and logic,category response,5 is alot bigger than 1,5 is significantly larger than 1.,"[-9.93039648164995e-05, 0.01369649637490511, 0..."
2,3,a balanced diet and regular exercise is needed...,1,1,0,1,0,1,Health and fitness,category response,a balanced diet and regular exercise is needed...,"To maintain good health, one needs a balanced ...","[0.011200563050806522, 0.004698386415839195, 0..."
3,4,a ball is round,0,1,0,0,0,0,Natural and physical sciences,Concept Net,a ball is round,A ball is round.,"[-0.004033376462757587, -4.661796992877498e-05..."
4,5,a baton twirler doesn't want a broken finger,0,1,0,1,1,0,Human activities,Concept Net,a baton twirler doesn't want a broken finger,A baton twirler wouldn't want to suffer a brok...,"[-0.02298833057284355, 0.006554496940225363, 0..."


In [61]:
# Count the non-null entries in every column for each statement category.
merged_df.groupby('category').count()

Unnamed: 0_level_0,statement_number,statement_x,behavior,everyday,figure_of_speech,judgment,opinion,reasoning,elicitation,statement_y,fixed statement,embeddings
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Culture and the arts,326,326,326,326,326,326,326,326,326,326,326,326
General reference,780,780,780,780,780,780,780,780,780,780,780,780
Geography and places,128,128,128,128,128,128,128,128,128,128,128,128
Health and fitness,266,266,266,266,266,266,266,266,266,266,266,266
History and events,112,112,112,112,112,112,112,112,112,112,112,112
Human activities,622,622,622,622,622,622,622,622,622,622,622,622
Mathematics and logic,93,93,93,93,93,93,93,93,93,93,93,93
Natural and physical sciences,162,162,162,162,162,162,162,162,162,162,162,162
People and self,909,909,909,909,909,909,909,909,909,909,909,909
Philosophy and thinking,322,322,322,322,322,322,322,322,322,322,322,322


Splitting the data into train and test sets (holding out one category at a time as the test set), then training and evaluating a random forest

In [85]:
# Leave-one-category-out evaluation for the 'behavior' label: every category is
# held out once as the test set while a forest is trained on all the others.
for category in merged_df['category'].unique():
    # Build the split mask once instead of re-filtering the frame four times.
    is_held_out = merged_df['category'] == category
    train_rows = merged_df[~is_held_out]
    test_rows = merged_df[is_held_out]

    X_train = train_rows.embeddings
    y_train = train_rows.behavior

    X_test = test_rows.embeddings.to_numpy()
    y_test = test_rows.behavior.to_numpy()

    # A fixed seed makes the reported accuracies repeat from run to run.
    rf = RandomForestClassifier(random_state=42)
    rf.fit(list(X_train), y_train)
    y_pred = rf.predict(list(X_test))
    accuracy = accuracy_score(y_test, y_pred)
    print(category, " Accuracy: ", accuracy)

Mathematics and logic  Accuracy:  0.6451612903225806
Health and fitness  Accuracy:  0.6578947368421053
Natural and physical sciences  Accuracy:  0.6666666666666666
Human activities  Accuracy:  0.747588424437299
General reference  Accuracy:  0.5884615384615385
Religion and belief systems  Accuracy:  0.5986842105263158
People and self  Accuracy:  0.8063806380638063
Technology and applied sciences  Accuracy:  0.5128205128205128
Society and social sciences  Accuracy:  0.9474835886214442
History and events  Accuracy:  0.5267857142857143
Philosophy and thinking  Accuracy:  0.8633540372670807
Culture and the arts  Accuracy:  0.5276073619631901
Geography and places  Accuracy:  0.5390625


In [109]:
def do_global_r_sqaured(actual_values, model_predictions):
    """Print and return the model's "global R-squared" against a mode baseline.

    The baseline predicts the most common value of ``actual_values`` for every
    sample. The score is ``1 - mse_model / mse_baseline``: 0 means the model's
    mean squared error equals the baseline's, and a negative score means the
    model's error is larger than the baseline's.

    Args:
        actual_values: Observed labels (a sized sequence).
        model_predictions: Model outputs, aligned with ``actual_values``.

    Returns:
        The global R-squared, or ``nan`` when the baseline error is zero (every
        actual value equals the mode), because the ratio is then undefined.
    """
    # Calculate MSE for model and baseline
    baseline_predictions = [mode(actual_values)] * len(actual_values)
    mse_model = mean_squared_error(actual_values, model_predictions)
    mse_baseline = mean_squared_error(actual_values, baseline_predictions)

    # Calculate Global R-squared; guard the division so a perfectly predictable
    # label set reports nan instead of raising or silently producing +/-inf.
    if mse_baseline == 0:
        global_r_squared = float('nan')
    else:
        global_r_squared = 1 - (mse_model / mse_baseline)

    print(f'Mean Squared Error (Model): {mse_model}')
    print(f'Mean Squared Error (Baseline): {mse_baseline}')
    print(f'Global R-squared: {global_r_squared}')
    return global_r_squared

In [111]:
outcomes = ['behavior', 'everyday', 'figure_of_speech', 'judgment', 'opinion', 'reasoning']

# The train/test split depends only on the held-out category, not on the
# outcome, so it is built once here instead of once per outcome.
held_out_category = 'Society and social sciences'
is_held_out = merged_df['category'] == held_out_category
train_rows = merged_df[~is_held_out]
test_rows = merged_df[is_held_out]

X_train = train_rows.embeddings
X_test = test_rows.embeddings.to_numpy()
train_features = list(X_train)
test_features = list(X_test)

# Fit one forest per outcome label and report its accuracy and global R-squared
# on the held-out category.
for outcome in outcomes:
    y_train = train_rows[outcome]
    y_test = test_rows[outcome].to_numpy()

    # A fixed seed makes the reported metrics repeat from run to run.
    rf = RandomForestClassifier(random_state=42)
    rf.fit(train_features, y_train)
    y_pred = rf.predict(test_features)
    accuracy = accuracy_score(y_test, y_pred)
    print(outcome, "- Accuracy:", accuracy)
    do_global_r_sqaured(y_test, y_pred)
    print('----------------------------')

behavior - Accuracy: 0.9387308533916849
Mean Squared Error (Model): 0.061269146608315096
Mean Squared Error (Baseline): 0.045951859956236324
Global R-squared: -0.33333333333333326
----------------------------
everyday - Accuracy: 0.6105032822757112
Mean Squared Error (Model): 0.38949671772428884
Mean Squared Error (Baseline): 0.4288840262582057
Global R-squared: 0.09183673469387754
----------------------------
figure_of_speech - Accuracy: 0.9212253829321663
Mean Squared Error (Model): 0.0787746170678337
Mean Squared Error (Baseline): 0.0787746170678337
Global R-squared: 0.0
----------------------------
judgment - Accuracy: 0.9387308533916849
Mean Squared Error (Model): 0.061269146608315096
Mean Squared Error (Baseline): 0.06345733041575492
Global R-squared: 0.034482758620689724
----------------------------
opinion - Accuracy: 0.8971553610503282
Mean Squared Error (Model): 0.10284463894967177
Mean Squared Error (Baseline): 0.10065645514223195
Global R-squared: -0.021739130434782483
----