In [2]:
########################### Python code for survey reduction ###########################
## By Trevor Coppins ##
## trevorcoppins@gmail.com ##

## see github page: https://github.com/TrevorCoppins/SurveyReductionCode

## DOCUMENTATION for programs used here ##
# scikit-criteria: https://scikit-criteria.quatrope.org/en/latest/index.html
# pandas: https://pandas.pydata.org/
# scipy: https://scipy.org/

# The following two cells install and then import the required packages
# The conda install of scikit-criteria may take a minute or two to complete - this is normal

In [3]:
conda install -c conda-forge scikit-criteria

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Retrieving notices: ...working... done

Note: you may need to restart the kernel to use updated packages.




  current version: 4.14.0
  latest version: 22.11.1

Please update conda by running

    $ conda update -n base -c conda-forge conda




In [4]:
import pandas as pd
import skcriteria as skc

from scipy.stats import skew
from scipy.stats import kurtosis

In [5]:
# Data file #

# Point data_path at one of the following:
# 1) your own datafile
# OR
# 2) the practice dataset of 1000 responses to the IPIP-50 made available at http://openpsychometrics.org/_rawdata/.
# For simplicity, this notebook only uses the 10 conscientiousness items (CSN)

## The original IPIP-50 survey can be found here: https://ipip.ori.org/New_IPIP-50-item-scale.htm ##

data_path = r'InsertFilePathHere.csv'
Data = pd.read_csv(data_path)

In [6]:
# Preview the loaded responses (one row per respondent, one column per item)
Data

Unnamed: 0,CSN1,CSN2,CSN3,CSN4,CSN5,CSN6,CSN7,CSN8,CSN9,CSN10
0,3,4,3,2,2,4,4,2,4,4
1,3,2,5,3,3,1,3,3,5,3
2,4,2,2,2,3,3,4,2,4,2
3,2,4,4,4,1,2,2,3,1,4
4,5,1,5,1,3,1,5,1,5,5
...,...,...,...,...,...,...,...,...,...,...
995,5,5,5,1,3,1,3,1,5,5
996,4,1,3,3,3,2,4,2,5,3
997,4,1,3,1,3,5,1,5,5,5
998,4,2,4,2,3,1,5,2,5,5


In [7]:
## Recoding negatively keyed items ##

# Reverse-scoring map for a 1-5 response scale (5 <-> 1, 4 <-> 2, 3 stays 3)
reverse_key = {5:1, 4:2, 3:3, 2:4, 1:5}

# Apply the same map to each negatively keyed item.
# NOTE: running this cell a second time flips the items back to their original coding.
for negative_item in ('CSN2', 'CSN4', 'CSN6', 'CSN8'):
    Data[negative_item] = Data[negative_item].replace(reverse_key)

In [8]:
## Count missing values per item (computed first, printed second) ##
missing_per_item = Data.isna().sum()

## Check to see our replacements have worked ##
print(Data)

## Check to see if any data is missing ##
print(missing_per_item)

     CSN1  CSN2  CSN3  CSN4  CSN5  CSN6  CSN7  CSN8  CSN9  CSN10
0       3     2     3     4     2     2     4     4     4      4
1       3     4     5     3     3     5     3     3     5      3
2       4     4     2     4     3     3     4     4     4      2
3       2     2     4     2     1     4     2     3     1      4
4       5     5     5     5     3     5     5     5     5      5
..    ...   ...   ...   ...   ...   ...   ...   ...   ...    ...
995     5     1     5     5     3     5     3     5     5      5
996     4     5     3     3     3     4     4     4     5      3
997     4     5     3     5     3     1     1     1     5      5
998     4     4     4     4     3     5     5     4     5      5
999     4     4     4     4     3     5     5     4     4      4

[1000 rows x 10 columns]
CSN1     0
CSN2     0
CSN3     0
CSN4     0
CSN5     0
CSN6     0
CSN7     0
CSN8     0
CSN9     0
CSN10    0
dtype: int64


In [9]:
# First, we compute each item's standard deviation, skewness, and kurtosis.
# Every statistic ends up as a single-row dataframe whose columns are the item names,
# so the three frames can be stacked in the next step.

## Standard Deviation ##
std = Data.std().to_frame().T

## Skewness ##
# bias=False requests the bias-corrected estimate; nan_policy='omit' ignores missing responses
skewdf = pd.DataFrame(skew(Data, axis=0, bias=False, nan_policy='omit')).T
skewdf.columns = Data.columns

## Kurtosis ##
kurtosisdf = pd.DataFrame(kurtosis(Data, axis=0, bias=False, nan_policy='omit')).T
kurtosisdf.columns = Data.columns

# Note: This uses Fisher's definition of Kurtosis where normal kurtosis = 0

In [10]:
## OPTIONAL: Inclusion of Subject Matter Expert (SME) ratings ##

# SME ratings can substantially help our algorithm return items that accurately fit the intended construct we want to measure
# and have desirable item-level properties.
# This is especially important for newly developed measures (e.g., organizational culture or engagement survey)

#SME = pd.read_csv(r'C:\XXX insert own filepath here')
#SME = SME.T
#SME.columns=Data.columns

# Note: depending on your SME rating file, you will need to make modifications
# For the SME dataframe to merge with the other dataframes, it must have the same column names (see below)

In [11]:
# Stack the three single-row statistic frames, then flip the result
# so that each item is a row and each statistic is a column

mergeddata = pd.concat([std, skewdf, kurtosisdf], axis=0).T
mergeddata.columns = ['STD', 'Skew', "Kurtosis"]
mergeddata

Unnamed: 0,STD,Skew,Kurtosis
CSN1,1.19996,-0.432774,-0.482622
CSN2,1.349369,0.029942,-1.172106
CSN3,1.03832,-0.986672,0.731934
CSN4,1.239335,-0.296732,-0.837663
CSN5,1.28072,0.33523,-0.880653
CSN6,1.38098,-0.077621,-1.238216
CSN7,1.128172,-0.711126,0.029046
CSN8,1.144019,-0.387547,-0.322214
CSN9,1.272912,-0.205267,-0.938101
CSN10,0.998184,-0.380623,-0.069857


In [12]:
# Only the size of the departure from 0 matters for skewness and kurtosis,
# so both columns are replaced by their absolute values

shape_columns = ['Skew', 'Kurtosis']
mergeddata[shape_columns] = mergeddata[shape_columns].abs()
mergeddata

Unnamed: 0,STD,Skew,Kurtosis
CSN1,1.19996,0.432774,0.482622
CSN2,1.349369,0.029942,1.172106
CSN3,1.03832,0.986672,0.731934
CSN4,1.239335,0.296732,0.837663
CSN5,1.28072,0.33523,0.880653
CSN6,1.38098,0.077621,1.238216
CSN7,1.128172,0.711126,0.029046
CSN8,1.144019,0.387547,0.322214
CSN9,1.272912,0.205267,0.938101
CSN10,0.998184,0.380623,0.069857


In [13]:
# Create a decision matrix

# One label per row of mergeddata ("it1", "it2", ...). The labels are generated
# from the number of items, so this cell also works when your own dataset
# has more or fewer than 10 items.
item_labels = [f"it{position}" for position in range(1, len(mergeddata) + 1)]

# objectives: maximize SD, minimize (absolute) skewness and kurtosis
dmat = skc.mkdm(
    mergeddata.values, objectives=[max, min, min],
    weights=[.33, .33, .33],
    alternatives=item_labels,
    criteria=["SD", "Skew", "Kurt"])

# Notes:
# 1. As is shown above, you must pass the values of your dataframe
# 2. Weights here are set to be equal across item statistics; however,
# there could be a strong argument to weight SME ratings higher
# than SD, skewness, and kurtosis to ensure items reflect their intended constructs.

dmat

# Compare the DM matrix to your original mergeddata - should have exact same values

Unnamed: 0,SD[▲ 0.33],Skew[▼ 0.33],Kurt[▼ 0.33]
it1,1.19996,0.432774,0.482622
it2,1.349369,0.029942,1.172106
it3,1.03832,0.986672,0.731934
it4,1.239335,0.296732,0.837663
it5,1.28072,0.33523,0.880653
it6,1.38098,0.077621,1.238216
it7,1.128172,0.711126,0.029046
it8,1.144019,0.387547,0.322214
it9,1.272912,0.205267,0.938101
it10,0.998184,0.380623,0.069857


In [14]:
## FILTERS ##

from skcriteria.preprocessing import filters

# For standard deviation, we want some level of standard deviation to differentiate between individuals
# but items with extremely high values are unlikely to truly reflect our constructs.
# Extremely high SD could indicate that participants are fundamentally different on that item
# and it does not accurately reflect the underlying construct (e.g., we assume X construct is normally distributed)

########################### SD FILTER ###########################
# For this, we apply two filters in sequence: keep only items with an SD of
# at least .50 (FilterGE) and lower than 1.50 (FilterLT)
# These ranges will shift based upon your likert scale options (e.g., 1-5, 1-7, 1-100)

## SD lower limit filter
SDLL = filters.FilterGE({"SD": 0.50})
dmatSDLL = SDLL.transform(dmat)

## SD upper limit filter (applied to the output of the lower limit filter)
SDUL = filters.FilterLT({"SD": 1.50})
dmatSDUL = SDUL.transform(dmatSDLL)

## Whenever it is your final filter applied, I suggest changing the name
dmatfinal = dmatSDUL

# Similarly, for SME ratings (if used), we may only want to consider items that have an SME rating
# at or above the midpoint of our scale.
# For example, we may set the filter to only consider items with SME ratings of 3 or higher on a 5-point likert scale

########################### SME FILTER ###########################

# Values are not set to run because we don't have SME ratings
# To utilize this: remove the # from the two lines below.
# The filter refers to a criterion named "SME", so the decision matrix must
# have been built with an "SME" criterion for it to work.

#SMEFILT = filters.FilterGE({"SME": 3.00})
#dmatfinal = SMEFILT.transform(dmatSDUL)

# Display the decision matrix that remains after filtering
dmatfinal

Unnamed: 0,SD[▲ 0.33],Skew[▼ 0.33],Kurt[▼ 0.33]
it1,1.19996,0.432774,0.482622
it2,1.349369,0.029942,1.172106
it3,1.03832,0.986672,0.731934
it4,1.239335,0.296732,0.837663
it5,1.28072,0.33523,0.880653
it6,1.38098,0.077621,1.238216
it7,1.128172,0.711126,0.029046
it8,1.144019,0.387547,0.322214
it9,1.272912,0.205267,0.938101
it10,0.998184,0.380623,0.069857


In [15]:
# skcriteria prefers to deal with maximizing all criteria, and the ranking step
# is easier to read when every criterion is on a common 0 to 1 index.

from skcriteria.preprocessing import invert_objectives, scalers

# Step 1 - inverter: turns the minimized criteria (skewness and kurtosis)
# into maximized ones, so higher values are more desirable
inv = invert_objectives.InvertMinimize()

# Step 2 - scaler: divides every value by the sum of its column
# (target="both" also rescales the weights)
scaler = scalers.SumScaler(target="both")

# Apply the inverter first, then the scaler
# The closer to 1, the more desirable the item statistic
dmatfinal = scaler.transform(inv.transform(dmatfinal))
dmatfinal

# As seen in printed output, each value now reflects itself
# divided by the sum of the column (e.g., it1 had a SD of 1.199. The sum of all SD is 12.031.
# 1.199/12.031 = .099 value of SD for it1)

Unnamed: 0,SD[▲ 0.333333],Skew[▲ 0.333333],Kurt[▲ 0.333333]
it1,0.099731,0.034261,0.034339
it2,0.112149,0.495193,0.014139
it3,0.086297,0.015028,0.022642
it4,0.103004,0.049968,0.019784
it5,0.106443,0.04423,0.018819
it6,0.114776,0.191022,0.013384
it7,0.093765,0.02085,0.570557
it8,0.095082,0.038259,0.051433
it9,0.105794,0.072234,0.017666
it10,0.082961,0.038955,0.237236


In [16]:
## Now we simply rank these items ##

# Build a weighted sum model and evaluate it on the inverted and scaled
# decision matrix; the result is stored in `ranking` and displayed
from skcriteria.madm import simple
decision = simple.WeightedSumModel()
ranking = decision.evaluate(dmatfinal)
ranking

# Note: these ranks are based upon their summed values from our final
# decision-matrix (dmatfinal)

## Save the ranking of these items (it is held in the `ranking` variable) ##

Alternatives,it1,it2,it3,it4,it5,it6,it7,it8,it9,it10
Rank,9,2,10,7,8,4,1,6,5,3


In [17]:
## Save this data for step 2 ##

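# Remove the leading # from the line below after replacing the example path with your own file path and name
# (to_csv also writes the row index as the first column unless index=False is passed)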

#Data.to_csv(r'C:\Users\Trevo\Documents\IPIPScaleReductRandomData.csv')