# Data Wrangling

## Key concepts

 - Merging data frames
 - Filtering data frames
 - Manipulating rows and columns

### 1. Read dataset and examine data frames for duplicates (based on ID)

Importing the modules required

In [110]:
import pandas as pandas
import re
from functools import reduce
import operator

Reading in the personality scores

In [111]:
# The scores file uses ';' as its field separator.
personality_dataframe = pandas.read_csv(
    "../data/personality_scores.csv",
    sep=';',
)

Examine the data frame for duplicates (based on ID), and drop any duplicates that exist


In [112]:
# Keep only the first row seen for each ID, then preview the result.
examined_personality_dataframe = personality_dataframe.drop_duplicates(
    subset=['ID'],
    keep='first',
)
examined_personality_dataframe.head(n=5)

Unnamed: 0,ID,Section 5 of 6 [I am always prepared.],Section 5 of 6 [I am easily disturbed.],Section 5 of 6 [I am exacting (demanding) in my work.],Section 5 of 6 [I am full of ideas.],Section 5 of 6 [I am interested in people.],Section 5 of 6 [I am not interested in abstract ideas.],Section 5 of 6 [I am not interested in other people's problems.],Section 5 of 6 [I am not really interested in others.],Section 5 of 6 [I am quick to understand things.],...,Unnamed: 60,Unnamed: 61,Unnamed: 62,Unnamed: 63,Unnamed: 64,Unnamed: 65,Unnamed: 66,Unnamed: 67,Unnamed: 68,IPIP_HIGH_RISK
0,0,"(3, 5)","(4, 5)","(3, 5)","(5, 5)","(2, 3)","(5, 3)","(2, 3)","(2, 5)","(5, 5)",...,,,,,,,,,,
1,1,"(3, 5)","(4, 5)","(3, 5)","(5, 5)","(2, 5)","(5, 3)","(2, 5)","(2, 5)","(5, 5)",...,,,,,,,,,,
2,2,"(3, 5)","(4, 3)","(3, 3)","(5, 5)","(2, 5)","(5, 5)","(2, 5)","(2, 5)","(5, 5)",...,,,,,,,,,,
3,3,"(3, 5)","(4, 5)","(3, 3)","(5, 5)","(2, 5)","(5, 3)","(2, 3)","(2, 3)","(5, 3)",...,,,,,,,,,,
4,4,"(3, 3)","(4, 5)","(3, 3)","(5, 3)","(2, 3)","(5, 3)","(2, 3)","(2, 3)","(5, 5)",...,,,,,,,,,,


Asserting that the new data frame is the length of the unique entries of the original data frame


In [113]:
# drop_duplicates keeps one row for a missing (NaN) ID, so NaN has to be
# counted as a distinct value here too; with the default dropna=True the
# check would fail spuriously whenever an ID is missing.
assert (
    personality_dataframe['ID'].nunique(dropna=False)
    == len(examined_personality_dataframe['ID'])
)

Deleting unnamed columns and the IPIP_HIGH_RISK column since they do not contain any data

In [114]:
# Select every "Unnamed:" column and the IPIP_HIGH_RISK column, then remove
# them with a single drop() call. This avoids deleting columns from the frame
# while iterating over it, and produces a new frame instead of mutating the
# result of drop_duplicates in place.
examined_personality_dataframe = examined_personality_dataframe.drop(
    columns=[
        trait
        for trait in examined_personality_dataframe.columns
        if "Unnamed:" in trait or "IPIP_HIGH_RISK" in trait
    ]
)

Setting the ID column as the index

In [115]:
# Promote the ID column to the row index.
examined_personality_dataframe = examined_personality_dataframe.set_index(
    keys='ID',
)

Displaying the filtered dataframe

In [116]:
# Preview the first five rows of the cleaned frame.
examined_personality_dataframe.head(n=5)

Unnamed: 0_level_0,Section 5 of 6 [I am always prepared.],Section 5 of 6 [I am easily disturbed.],Section 5 of 6 [I am exacting (demanding) in my work.],Section 5 of 6 [I am full of ideas.],Section 5 of 6 [I am interested in people.],Section 5 of 6 [I am not interested in abstract ideas.],Section 5 of 6 [I am not interested in other people's problems.],Section 5 of 6 [I am not really interested in others.],Section 5 of 6 [I am quick to understand things.],Section 5 of 6 [I am quiet around strangers.],...,Section 5 of 6 [I often forget to put things back in their proper place],Section 5 of 6 [I pay attention to details.],Section 5 of 6 [I seldom feel blue (down).],Section 5 of 6 [I spend time reflecting on things.],Section 5 of 6 [I start conversations.],Section 5 of 6 [I sympathize with others' feelings.],Section 5 of 6 [I take time out for others.],Section 5 of 6 [I talk to a lot of different people at parties.],Section 5 of 6 [I use difficult words.],Section 5 of 6 [I worry about things.]
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,"(3, 5)","(4, 5)","(3, 5)","(5, 5)","(2, 3)","(5, 3)","(2, 3)","(2, 5)","(5, 5)","(1, 3)",...,"(3, 5)","(3, 5)","(4, 3)","(5, 5)","(1, 3)","(2, 5)","(2, 5)","(1, 3)","(5, 1)","(4, 3)"
1,"(3, 5)","(4, 5)","(3, 5)","(5, 5)","(2, 5)","(5, 3)","(2, 5)","(2, 5)","(5, 5)","(1, 3)",...,"(3, 5)","(3, 1)","(4, 1)","(5, 5)","(1, 5)","(2, 5)","(2, 5)","(1, 5)","(5, 3)","(4, 3)"
2,"(3, 5)","(4, 3)","(3, 3)","(5, 5)","(2, 5)","(5, 5)","(2, 5)","(2, 5)","(5, 5)","(1, 1)",...,"(3, 5)","(3, 5)","(4, 1)","(5, 3)","(1, 3)","(2, 5)","(2, 5)","(1, 3)","(5, 1)","(4, 3)"
3,"(3, 5)","(4, 5)","(3, 3)","(5, 5)","(2, 5)","(5, 3)","(2, 3)","(2, 3)","(5, 3)","(1, 3)",...,"(3, 1)","(3, 5)","(4, 1)","(5, 5)","(1, 5)","(2, 5)","(2, 5)","(1, 5)","(5, 1)","(4, 1)"
4,"(3, 3)","(4, 5)","(3, 3)","(5, 3)","(2, 3)","(5, 3)","(2, 3)","(2, 3)","(5, 5)","(1, 1)",...,"(3, 5)","(3, 5)","(4, 5)","(5, 5)","(1, 3)","(2, 3)","(2, 5)","(1, 3)","(5, 1)","(4, 3)"


### 2. Create new columns containing the total score of each of the personality test subscales by writing functions that will calculate the total score for each of the subscales

The scores in the data are stored as strings formatted like tuples, e.g. `(3, 5)`

Extracting the first value from the tuple, which indicates the subscale to which the item belongs (1 = Extraversion, 2 = Agreeableness, 3 = Conscientiousness, 4 = Emotional Stability/Neuroticism, and 5 = Intellect/Imagination / openness to experiences)

In [117]:
def subscale(data_frame):
    """Return the character at position 1 of every string in ``data_frame``.

    For values written like ``"(3, 5)"`` this is the first number of the
    pair. The result holds one-character strings; converting them to
    integers is left to the caller.
    """
    return data_frame.str.get(1)

In [118]:
# Run subscale() over each row, then cast the extracted digits to integers.
subscale_dataframe = (
    examined_personality_dataframe
    .apply(subscale, axis='columns')
    .astype(int)
)
subscale_dataframe.head(n=5)

Unnamed: 0_level_0,Section 5 of 6 [I am always prepared.],Section 5 of 6 [I am easily disturbed.],Section 5 of 6 [I am exacting (demanding) in my work.],Section 5 of 6 [I am full of ideas.],Section 5 of 6 [I am interested in people.],Section 5 of 6 [I am not interested in abstract ideas.],Section 5 of 6 [I am not interested in other people's problems.],Section 5 of 6 [I am not really interested in others.],Section 5 of 6 [I am quick to understand things.],Section 5 of 6 [I am quiet around strangers.],...,Section 5 of 6 [I often forget to put things back in their proper place],Section 5 of 6 [I pay attention to details.],Section 5 of 6 [I seldom feel blue (down).],Section 5 of 6 [I spend time reflecting on things.],Section 5 of 6 [I start conversations.],Section 5 of 6 [I sympathize with others' feelings.],Section 5 of 6 [I take time out for others.],Section 5 of 6 [I talk to a lot of different people at parties.],Section 5 of 6 [I use difficult words.],Section 5 of 6 [I worry about things.]
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3,4,3,5,2,5,2,2,5,1,...,3,3,4,5,1,2,2,1,5,4
1,3,4,3,5,2,5,2,2,5,1,...,3,3,4,5,1,2,2,1,5,4
2,3,4,3,5,2,5,2,2,5,1,...,3,3,4,5,1,2,2,1,5,4
3,3,4,3,5,2,5,2,2,5,1,...,3,3,4,5,1,2,2,1,5,4
4,3,4,3,5,2,5,2,2,5,1,...,3,3,4,5,1,2,2,1,5,4


Extracting the second value from the tuple, which indicates the individual's scored response to that question (i.e., 1=Disagree, 3=Neutral and 5=Agree)

In [119]:
def scored_response(dataframe):
    """Return the character at position 4 of every string in ``dataframe``.

    For values written like ``"(3, 5)"`` this is the second number of the
    pair. The result holds one-character strings; converting them to
    integers is left to the caller.
    """
    return dataframe.str.get(4)

In [120]:

# Run scored_response() over each row, then cast the extracted digits to
# integers.
scored_response_dataframe = (
    examined_personality_dataframe
    .apply(scored_response, axis='columns')
    .astype(int)
)
scored_response_dataframe.head(n=5)

Unnamed: 0_level_0,Section 5 of 6 [I am always prepared.],Section 5 of 6 [I am easily disturbed.],Section 5 of 6 [I am exacting (demanding) in my work.],Section 5 of 6 [I am full of ideas.],Section 5 of 6 [I am interested in people.],Section 5 of 6 [I am not interested in abstract ideas.],Section 5 of 6 [I am not interested in other people's problems.],Section 5 of 6 [I am not really interested in others.],Section 5 of 6 [I am quick to understand things.],Section 5 of 6 [I am quiet around strangers.],...,Section 5 of 6 [I often forget to put things back in their proper place],Section 5 of 6 [I pay attention to details.],Section 5 of 6 [I seldom feel blue (down).],Section 5 of 6 [I spend time reflecting on things.],Section 5 of 6 [I start conversations.],Section 5 of 6 [I sympathize with others' feelings.],Section 5 of 6 [I take time out for others.],Section 5 of 6 [I talk to a lot of different people at parties.],Section 5 of 6 [I use difficult words.],Section 5 of 6 [I worry about things.]
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,5,5,5,5,3,3,3,5,5,3,...,5,5,3,5,3,5,5,3,1,3
1,5,5,5,5,5,3,5,5,5,3,...,5,1,1,5,5,5,5,5,3,3
2,5,3,3,5,5,5,5,5,5,1,...,5,5,1,3,3,5,5,3,1,3
3,5,5,3,5,5,3,3,3,3,3,...,1,5,1,5,5,5,5,5,1,1
4,3,5,3,3,3,3,3,3,5,1,...,5,5,5,5,3,3,5,3,1,3


Calculating the total score for each of the subscales

In [121]:
# Labels of the five personality subscales, plus an empty data frame.
personality_subscales = [
    "Extraversion",
    "Agreeableness",
    "Conscientiousness",
    "Neuroticism",
    "Openness",
]
subscales = pandas.DataFrame(data=None)

In [122]:
def subscale_sum(subscale, value):
    """Return an item's scored response if it belongs to the named subscale.

    Args:
        subscale: One item written like ``"(3, 5)"``. The character at
            position 1 is read as the subscale code and the character at
            position 4 as the scored response, so both must be single digits.
        value: Subscale label. ``'Neuroticism'`` and ``'Emotional_Stability'``
            are both accepted for code 4, so the label used in the notebook's
            subscale list is no longer silently scored as 0.

    Returns:
        The scored response as an int when the item's subscale code matches
        ``value``; otherwise 0 (including for an unknown label).

    Raises:
        ValueError: If either inspected character is not a digit.
        IndexError: If ``subscale`` is shorter than five characters.
    """
    subscale_codes = {
        'Extraversion': 1,
        'Agreeableness': 2,
        'Conscientiousness': 3,
        'Neuroticism': 4,
        'Emotional_Stability': 4,
        'Openness': 5,
    }
    # Both fields are parsed up front so malformed items fail loudly whatever
    # label was requested.
    item_code = int(subscale[1])
    item_score = int(subscale[4])
    if subscale_codes.get(value) == item_code:
        return item_score
    return 0

In [123]:
def subscale_total(dataframe, subscale):
    """Add up the scored responses of the items belonging to one subscale.

    Args:
        dataframe: Iterable of items; each element is passed to
            ``subscale_sum`` together with ``subscale``.
        subscale: Subscale label forwarded to ``subscale_sum``.

    Returns:
        The sum of the values ``subscale_sum`` returns for every item, or 0
        when ``dataframe`` is empty.
    """
    # sum() starts from 0, so an empty input yields 0 instead of the
    # TypeError that reduce() raises without an initial value.
    return sum(subscale_sum(item, subscale) for item in dataframe)

In [124]:
# Build one column of per-row totals for each subscale. subscale_total needs
# the subscale *name* (not a row number), and the same name labels the
# resulting column so successive subscales do not overwrite one another.
for j in personality_subscales:
    subscales[j] = [
        subscale_total(row, j)
        for row in examined_personality_dataframe.values
    ]