In [1]:
import pandas as pd

In [7]:
# List all pickle files in ../data/collection and make pandas dataframes from all of them joined together
import os
import glob
import pickle

# Get all pickle files in the collection directory.
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps the
# file order (and therefore the row order of the concatenated dataframe) the same
# on every run and on every machine.
path = '../data/collection'
all_files = sorted(glob.glob(os.path.join(path, "*.pkl")))
all_files

In [17]:

# Read every pickle file into its own dataframe and tag each row with the name of
# the file it came from, so rows stay traceable after concatenation.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
# NOTE(review): assumes every pickle holds a DataFrame (needed for .assign) - confirm.
if not all_files:
    # pd.concat would otherwise fail with a less helpful "No objects to concatenate"
    raise FileNotFoundError(f"No .pkl files found in {path!r}")

df_from_each_file = (
    pd.read_pickle(f).assign(source_file=os.path.basename(f)) for f in all_files
)

# Concatenate all dataframes into one with a fresh 0..n-1 index
df = pd.concat(df_from_each_file, ignore_index=True)



In [18]:
# Display the concatenated dataframe for a visual check
df

## Y002

The World Values Survey index Y002 classifies each response as Materialist, Mixed, or Postmaterialist, based on the combination of goals chosen as the most important and the second most important. In the survey questionnaire, Q154 is the first choice and Q155 is the second choice. The answer codes for both questions are:
```json
{
    "1": "Maintaining order in the nation",
    "2": "Giving people more say in important government decisions",
    "3": "Fighting rising prices",
    "4": "Protecting freedom of speech",
    "-1": "Don't know",
    "-2": "No answer",
    "-4": "Not asked",
    "-5": "Missing; Unknown"
}
```

Here's how the responses are typically mapped:

```
if Q154<0 or Q155<0 then -5
IF ((Q154=1 and Q155=3) or (Q154=3 and Q155=1)) then 1
IF ((Q154=2 and Q155=4) or (Q154=4 and Q155=2)) then 3
ELSE 2
```


Final Mapping Meaning:
```
    1: Materialist
    2: Mixed
    3: Postmaterialist
```

In [61]:
# (first choice, second choice) pairs that yield a "pure" category; any other
# valid pair is classified as Mixed.
_Y002_PURE_PAIRS = {
    (1, 3): 1,  # both materialist goals -> Materialist
    (3, 1): 1,
    (2, 4): 3,  # both postmaterialist goals -> Postmaterialist
    (4, 2): 3,
}


def Y002_transform(ans):
    """Map a (Q154, Q155) answer pair to the Y002 index.

    Args:
        ans: Indexable whose element 0 is the first choice (Q154) and whose
            element 1 is the second choice (Q155).

    Returns:
        -5 when either choice is a negative (missing) code, 1 for Materialist,
        3 for Postmaterialist, and 2 (Mixed) for every other combination.
    """
    first_choice, second_choice = ans[0], ans[1]

    # Negative codes (don't know / no answer / not asked / missing) make the index missing.
    if first_choice < 0 or second_choice < 0:
        return -5

    return _Y002_PURE_PAIRS.get((first_choice, second_choice), 2)
    
# Sanity-check every branch of the mapping before it is applied to real data.
assert Y002_transform((1, 3)) == 1    # Materialist
assert Y002_transform((3, 1)) == 1
assert Y002_transform((2, 4)) == 3    # Postmaterialist
assert Y002_transform((4, 2)) == 3
assert Y002_transform((1, 2)) == 2    # Mixed
assert Y002_transform((-1, 2)) == -5  # missing answer

res = Y002_transform((1, 2))
print(res)

## Y003

To map the responses for question Y003 according to the World Values Survey cookbook, the qualities selected in a response are combined into a single score that takes one of the values -3, -2, -1, 0, 1, or 2. On the questionnaire, the options are the items Q7 through Q17:

Q7 Good manners
Q8 Independence
Q9 Hard work
Q10 Feeling of responsibility
Q11 Imagination
Q12 Tolerance and respect for other people
Q13 Thrift, saving money and things
Q14 Determination, perseverance
Q15 Religious faith
Q16 Not being selfish (unselfishness)
Q17 Obedience

Each quality is coded 1 if the model mentions it and 2 if it does not. The score is then computed from these codes according to the following rule:

```
Compute Y003=-5.
if Q15>=0 and Q17>=0 and Q8>=0 and Q14>=0 then
Y003=(Q15 + Q17)-(Q8+Q14).
```

Final Mapping Meaning:

```
-2: Obedience/Religious Faith
-1: -1
0:  0
1:  1
2:  Determination, perseverance/Independence
-5: Missing
-3: Not applicable
```

From our data:

```
[-3. -2. -1. 0. 1. 2. nan] 
[ 0. -1.  2.  1. -2. nan -3.]
```

So we will remove the -5 values to match the given categories in the data.

In [62]:
from typing import List


# Convert into a function
def Y003_transform(ans: List[int]) -> int:
    """Compute the Y003 index from the qualities mentioned in a response.

    Args:
        ans: 1-based positions of the mentioned qualities within the Q7..Q17
            option list (1 = Q7, 2 = Q8, ..., 11 = Q17), e.g. ``[1, 2, 8, 9, 10]``.

    Returns:
        ``(Q15 + Q17) - (Q8 + Q14)`` where each item is coded 1 if mentioned and
        2 if not, i.e. an integer from -2 to 2.

    The rule's -5 "missing" code is never produced here: every item is coded
    1 or 2, so the ``>= 0`` precondition of the rule always holds.
    """

    def coded(position: int) -> int:
        # 1 = quality mentioned, 2 = not mentioned
        return 1 if position in ans else 2

    # Only four items enter the index. Positions within Q7..Q17:
    # Q8 -> 2, Q14 -> 8, Q15 -> 9, Q17 -> 11.
    q8, q14, q15, q17 = coded(2), coded(8), coded(9), coded(11)

    # Y003 = (Q15 + Q17) - (Q8 + Q14)
    return (q15 + q17) - (q8 + q14)
    
# Sanity-check the extremes and the neutral case before applying the transform to real data.
assert Y003_transform([9, 11]) == -2  # only religious faith + obedience mentioned
assert Y003_transform([2, 8]) == 2    # only independence + determination mentioned
assert Y003_transform([]) == 0        # none of the four index items mentioned

ans = [1, 2, 8, 9, 10]
res = Y003_transform(ans)
res

In [63]:
# List the distinct values found in the qn column (nothing is transformed in this cell)
# NOTE(review): the pivot cell further down filters on a "question" column rather than "qn" - confirm both columns exist
df["qn"].unique()

In [64]:
# Count the missing (None/NaN) values in the response column
df["response"].isnull().sum()

In [65]:
# Show the rows whose response is missing (None/NaN)
df[df["response"].isnull()]

In [66]:
# Question codes that make up one synthetic "respondent" row.
iv_qns = ["A008", "A165", "E018", "E025", "F063", "F118", "F120", "G006", "Y002", "Y003"]

# Build one row per synthetic respondent: the k-th row of an LLM holds that LLM's
# k-th recorded response to every question in iv_qns.
# NOTE(review): an earlier cell inspects a "qn" column while this one filters on
# "question" - confirm which column holds the question codes.
result = []
for name, group in df.groupby("llm"):
    # Split the group's responses by question once, keeping row order, instead of
    # re-filtering the whole group for every question on every iteration.
    responses_by_qn = {
        question: group.loc[group["question"] == question, "response"].values
        for question in iv_qns
    }
    n_complete = min(len(values) for values in responses_by_qn.values())

    # The extra (+1) iteration emits one trailing incomplete row per LLM, with None
    # for each exhausted question. It is removed by dropna() below; it is kept so
    # that `result` and the index labels of pivot_df stay the same as before.
    for k in range(n_complete + 1):
        row = {"llm": name}
        for question in iv_qns:
            values = responses_by_qn[question]
            row[question] = values[k] if k < len(values) else None
        result.append(row)

pivot_df = pd.DataFrame(result)
# Drop rows with any missing value (the trailing incomplete rows and rows with a null response)
pivot_df = pivot_df.dropna()
pivot_df

In [67]:
# Display the pivoted dataframe
pivot_df

In [68]:
# Replace the raw Y002 / Y003 answers with their index scores.
# Each transform only reads its own column, so a column-wise map is enough (no
# row-wise apply over the whole frame). assign() builds a new frame instead of
# writing into the result of dropna(), which is what the ".loc[row_indexer,
# col_indexer] = value" advice in the pandas warning is about.
# NOTE: not idempotent - re-running this cell would feed the already-transformed
# scores back into the transforms.
pivot_df = pivot_df.assign(
    Y002=pivot_df["Y002"].map(Y002_transform),
    Y003=pivot_df["Y003"].map(Y003_transform),
)

In [69]:
# Display pivot_df again to inspect the transformed Y002 / Y003 columns
pivot_df

In [70]:
# Summary statistics of pivot_df
pivot_df.describe()

In [71]:
# Show the dtype of each column in pivot_df
pivot_df.dtypes

In [72]:
# Cast the two derived index columns, Y002 and Y003, to float64
for column in ("Y002", "Y003"):
    pivot_df[column] = pivot_df[column].astype("float64")


In [73]:
# Display pivot_df after the float64 conversion
pivot_df