# DATA WRANGLING PROJECT

## Question 1

Import the dataset personality_scores.csv. Examine the data frame for duplicates (based on ID), and drop any duplicates that exist. Use an assert statement to check that the new data frame is the length of the unique entries of the original data frame.

In [1]:
# load module
import pandas as pd
import re

In [2]:
# import data set
data = pd.read_csv("personality_scores.csv", sep=';')


In [3]:
# checking for missing values
data.isnull().sum()

ID                                                           0
Section 5 of 6 [I am always prepared.]                       0
Section 5 of 6 [I am easily disturbed.]                      0
Section 5 of 6 [I am exacting (demanding) in my work.]       0
Section 5 of 6 [I am full of ideas.]                         0
                                                          ... 
Unnamed: 65                                               1555
Unnamed: 66                                               1555
Unnamed: 67                                               1555
Unnamed: 68                                               1555
IPIP_HIGH_RISK                                            1555
Length: 70, dtype: int64

It looks like there is no missing values in the dataset

In [4]:
# creating data frame
df = pd.DataFrame(data)

In [5]:
# inspecting duplicate values
data.nunique()

ID                                                        1555
Section 5 of 6 [I am always prepared.]                       3
Section 5 of 6 [I am easily disturbed.]                      3
Section 5 of 6 [I am exacting (demanding) in my work.]       3
Section 5 of 6 [I am full of ideas.]                         3
                                                          ... 
Unnamed: 65                                                  0
Unnamed: 66                                                  0
Unnamed: 67                                                  0
Unnamed: 68                                                  0
IPIP_HIGH_RISK                                               0
Length: 70, dtype: int64

In [6]:
# drop duplicate rows and null values
 
df1 = data.drop_duplicates(subset="ID", keep="first").dropna(axis=1)
df1

Unnamed: 0,ID,Section 5 of 6 [I am always prepared.],Section 5 of 6 [I am easily disturbed.],Section 5 of 6 [I am exacting (demanding) in my work.],Section 5 of 6 [I am full of ideas.],Section 5 of 6 [I am interested in people.],Section 5 of 6 [I am not interested in abstract ideas.],Section 5 of 6 [I am not interested in other people's problems.],Section 5 of 6 [I am not really interested in others.],Section 5 of 6 [I am quick to understand things.],...,Section 5 of 6 [I often forget to put things back in their proper place],Section 5 of 6 [I pay attention to details.],Section 5 of 6 [I seldom feel blue (down).],Section 5 of 6 [I spend time reflecting on things.],Section 5 of 6 [I start conversations.],Section 5 of 6 [I sympathize with others' feelings.],Section 5 of 6 [I take time out for others.],Section 5 of 6 [I talk to a lot of different people at parties.],Section 5 of 6 [I use difficult words.],Section 5 of 6 [I worry about things.]
0,0,"(3, 5)","(4, 5)","(3, 5)","(5, 5)","(2, 3)","(5, 3)","(2, 3)","(2, 5)","(5, 5)",...,"(3, 5)","(3, 5)","(4, 3)","(5, 5)","(1, 3)","(2, 5)","(2, 5)","(1, 3)","(5, 1)","(4, 3)"
1,1,"(3, 5)","(4, 5)","(3, 5)","(5, 5)","(2, 5)","(5, 3)","(2, 5)","(2, 5)","(5, 5)",...,"(3, 5)","(3, 1)","(4, 1)","(5, 5)","(1, 5)","(2, 5)","(2, 5)","(1, 5)","(5, 3)","(4, 3)"
2,2,"(3, 5)","(4, 3)","(3, 3)","(5, 5)","(2, 5)","(5, 5)","(2, 5)","(2, 5)","(5, 5)",...,"(3, 5)","(3, 5)","(4, 1)","(5, 3)","(1, 3)","(2, 5)","(2, 5)","(1, 3)","(5, 1)","(4, 3)"
3,3,"(3, 5)","(4, 5)","(3, 3)","(5, 5)","(2, 5)","(5, 3)","(2, 3)","(2, 3)","(5, 3)",...,"(3, 1)","(3, 5)","(4, 1)","(5, 5)","(1, 5)","(2, 5)","(2, 5)","(1, 5)","(5, 1)","(4, 1)"
4,4,"(3, 3)","(4, 5)","(3, 3)","(5, 3)","(2, 3)","(5, 3)","(2, 3)","(2, 3)","(5, 5)",...,"(3, 5)","(3, 5)","(4, 5)","(5, 5)","(1, 3)","(2, 3)","(2, 5)","(1, 3)","(5, 1)","(4, 3)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1550,1550,"(3, 5)","(4, 5)","(3, 1)","(5, 5)","(2, 5)","(5, 5)","(2, 5)","(2, 5)","(5, 5)",...,"(3, 1)","(3, 5)","(4, 1)","(5, 3)","(1, 5)","(2, 5)","(2, 3)","(1, 1)","(5, 1)","(4, 5)"
1551,1551,"(3, 3)","(4, 5)","(3, 5)","(5, 3)","(2, 5)","(5, 3)","(2, 3)","(2, 5)","(5, 5)",...,"(3, 3)","(3, 3)","(4, 1)","(5, 3)","(1, 3)","(2, 5)","(2, 5)","(1, 5)","(5, 1)","(4, 3)"
1552,1552,"(3, 5)","(4, 3)","(3, 5)","(5, 5)","(2, 5)","(5, 5)","(2, 3)","(2, 3)","(5, 5)",...,"(3, 3)","(3, 5)","(4, 5)","(5, 5)","(1, 5)","(2, 5)","(2, 5)","(1, 5)","(5, 3)","(4, 3)"
1553,1553,"(3, 5)","(4, 5)","(3, 5)","(5, 5)","(2, 5)","(5, 5)","(2, 5)","(2, 5)","(5, 5)",...,"(3, 5)","(3, 5)","(4, 1)","(5, 5)","(1, 5)","(2, 3)","(2, 3)","(1, 1)","(5, 1)","(4, 3)"


In [7]:
df1 = df1.dropna(axis=1, how='all').set_index('ID')
df1.head()

Unnamed: 0_level_0,Section 5 of 6 [I am always prepared.],Section 5 of 6 [I am easily disturbed.],Section 5 of 6 [I am exacting (demanding) in my work.],Section 5 of 6 [I am full of ideas.],Section 5 of 6 [I am interested in people.],Section 5 of 6 [I am not interested in abstract ideas.],Section 5 of 6 [I am not interested in other people's problems.],Section 5 of 6 [I am not really interested in others.],Section 5 of 6 [I am quick to understand things.],Section 5 of 6 [I am quiet around strangers.],...,Section 5 of 6 [I often forget to put things back in their proper place],Section 5 of 6 [I pay attention to details.],Section 5 of 6 [I seldom feel blue (down).],Section 5 of 6 [I spend time reflecting on things.],Section 5 of 6 [I start conversations.],Section 5 of 6 [I sympathize with others' feelings.],Section 5 of 6 [I take time out for others.],Section 5 of 6 [I talk to a lot of different people at parties.],Section 5 of 6 [I use difficult words.],Section 5 of 6 [I worry about things.]
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,"(3, 5)","(4, 5)","(3, 5)","(5, 5)","(2, 3)","(5, 3)","(2, 3)","(2, 5)","(5, 5)","(1, 3)",...,"(3, 5)","(3, 5)","(4, 3)","(5, 5)","(1, 3)","(2, 5)","(2, 5)","(1, 3)","(5, 1)","(4, 3)"
1,"(3, 5)","(4, 5)","(3, 5)","(5, 5)","(2, 5)","(5, 3)","(2, 5)","(2, 5)","(5, 5)","(1, 3)",...,"(3, 5)","(3, 1)","(4, 1)","(5, 5)","(1, 5)","(2, 5)","(2, 5)","(1, 5)","(5, 3)","(4, 3)"
2,"(3, 5)","(4, 3)","(3, 3)","(5, 5)","(2, 5)","(5, 5)","(2, 5)","(2, 5)","(5, 5)","(1, 1)",...,"(3, 5)","(3, 5)","(4, 1)","(5, 3)","(1, 3)","(2, 5)","(2, 5)","(1, 3)","(5, 1)","(4, 3)"
3,"(3, 5)","(4, 5)","(3, 3)","(5, 5)","(2, 5)","(5, 3)","(2, 3)","(2, 3)","(5, 3)","(1, 3)",...,"(3, 1)","(3, 5)","(4, 1)","(5, 5)","(1, 5)","(2, 5)","(2, 5)","(1, 5)","(5, 1)","(4, 1)"
4,"(3, 3)","(4, 5)","(3, 3)","(5, 3)","(2, 3)","(5, 3)","(2, 3)","(2, 3)","(5, 5)","(1, 1)",...,"(3, 5)","(3, 5)","(4, 5)","(5, 5)","(1, 3)","(2, 3)","(2, 5)","(1, 3)","(5, 1)","(4, 3)"


In [8]:
def score(entry, trait):    
    '''Find  the string of 2 values in an entry and regarding
    them as 2 columns. The score function does this by
    finding the digits in the string 
    and splitting them then regard them
    as columns._r finds the digits in the
    entry of this subcolumn'''
    
    numbers=0
    _r= re.compile('\d+')
    for col in entry:
        num= _r.findall(col)
        if str(trait)==num[0]:
            numbers+= int(num[1])
        else:
            numbers= numbers
    return numbers
factor= dict({
    1 : 'extraversion', 
    2 : 'agreeableness',
    3 : 'conscientiousness', 
    4 : 'emotional stability',
    5 : 'intellect'})
    
for i in factor.keys():
    df1[f'{factor[i]}']= df1.loc[:,'Section 5 of 6 [I am always prepared.]':'Section 5 of 6 [I worry about things.]'].apply(score,args=(i,), axis=1)

In [9]:
df1.head()

Unnamed: 0_level_0,Section 5 of 6 [I am always prepared.],Section 5 of 6 [I am easily disturbed.],Section 5 of 6 [I am exacting (demanding) in my work.],Section 5 of 6 [I am full of ideas.],Section 5 of 6 [I am interested in people.],Section 5 of 6 [I am not interested in abstract ideas.],Section 5 of 6 [I am not interested in other people's problems.],Section 5 of 6 [I am not really interested in others.],Section 5 of 6 [I am quick to understand things.],Section 5 of 6 [I am quiet around strangers.],...,Section 5 of 6 [I sympathize with others' feelings.],Section 5 of 6 [I take time out for others.],Section 5 of 6 [I talk to a lot of different people at parties.],Section 5 of 6 [I use difficult words.],Section 5 of 6 [I worry about things.],extraversion,agreeableness,conscientiousness,emotional stability,intellect
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,"(3, 5)","(4, 5)","(3, 5)","(5, 5)","(2, 3)","(5, 3)","(2, 3)","(2, 5)","(5, 5)","(1, 3)",...,"(2, 5)","(2, 5)","(1, 3)","(5, 1)","(4, 3)",30,40,48,36,42
1,"(3, 5)","(4, 5)","(3, 5)","(5, 5)","(2, 5)","(5, 3)","(2, 5)","(2, 5)","(5, 5)","(1, 3)",...,"(2, 5)","(2, 5)","(1, 5)","(5, 3)","(4, 3)",42,46,46,40,42
2,"(3, 5)","(4, 3)","(3, 3)","(5, 5)","(2, 5)","(5, 5)","(2, 5)","(2, 5)","(5, 5)","(1, 1)",...,"(2, 5)","(2, 5)","(1, 3)","(5, 1)","(4, 3)",28,40,40,38,42
3,"(3, 5)","(4, 5)","(3, 3)","(5, 5)","(2, 5)","(5, 3)","(2, 3)","(2, 3)","(5, 3)","(1, 3)",...,"(2, 5)","(2, 5)","(1, 5)","(5, 1)","(4, 1)",30,38,38,40,38
4,"(3, 3)","(4, 5)","(3, 3)","(5, 3)","(2, 3)","(5, 3)","(2, 3)","(2, 3)","(5, 5)","(1, 1)",...,"(2, 3)","(2, 5)","(1, 3)","(5, 1)","(4, 3)",28,34,46,38,36


In [10]:
#s = df1['Section 5 of 6 [I am always prepared.]'].str[1]
#df3 = df1.join(pd.DataFrame(columns=['Category', 'Score', 'Extraversion', 'Agreeableness', 'Conscientiousness', 'Emotional_Stability', 'Intellect']))
#df3

In [12]:
def score(entry, trait):    
    '''Find  the string of 2 values in an entry and regarding
    them as 2 columns. The score function does this by
    finding the digits in the string 
    and splitting them then regard them
    as columns._r finds the digits in the
    entry of this subcolumn'''
    
    numbers_extra=0
    #numbers_agree=0
    #numbers_consc=0
    #numbers_emo=0
    #numbers_intel=0
    
    _r= re.compile('\d+')
    for col in entry:
        num= _r.findall(col)
        if num == 1:
            numbers_extra += int(num[1])
        #elif num == 2:
            #numbers_agree += int(num[1])
        #elif num == 3:
            #numbers_consc += int(num[1])
        #elif num == 4:
            #numbers_emo += int(num[1])
        #elif num == 5:
            #numbers_intel += int(num[1])
        else:
            pass
    return numbers_extra
    #return numbers_agree
    #return numbers_consc
    #return numbers_emo
    #return numbers_intel
    
for i in df1['extraversion']:
    df1['extraversion']=df1.fillna(df1['extraversion'].apply(score, args=(i,), axis=1))

TypeError: score() got an unexpected keyword argument 'axis'

In [None]:
    
for i in factor.keys():
    df1[f'{factor[i]}']= df1.loc[:,'Section 5 of 6 [I am always prepared.]':'Section 5 of 6 [I worry about things.]'].apply(score,args=(i,), axis=1)