In [2]:
import pandas as pd

In [6]:
####@author:gargla

from bs4 import BeautifulSoup  
from collections import Counter
from nltk.corpus import stopwords
from os import listdir
from os.path import isfile, join

def cleanup_text(text):
    step1Cleaned = BeautifulSoup(text,"lxml")
    import re
    letters_only = re.sub("[^a-zA-Z]",           # The pattern to search for
                      " ",                   # The pattern to replace it with
                      step1Cleaned.get_text() )  # The text to search
    lower_case = letters_only.lower()        
    words = lower_case.split()
    words = [w for w in words if not w in stopwords.words("english")]
    return words

##get LIWC features for given word tokens
def get_LIWC_features(words,categories):
    feature_vector={}
    for key,val in categories.items(): 
        itemcounter = {item:count for item, count in [(item, words.count(item)) for item in set(val)]}
        feature_vector[key]=sum(itemcounter.values())
    return feature_vector


##read category files   
def read_LIWC_categories():
    ##TODO:move path to func param
    path="english_dictionary"
    csvs = [f for f in listdir(path) if f.endswith('.csv')]
    categories={}
    for csv in csvs:
        CatList=[]
        with open(path+"/"+csv, 'r') as f:
            for line in f.readlines():
                l= line.strip().split(',')
                CatList.extend(l)
        categories[csv]=CatList
    return categories

def get_data_feature_vector(ResponseDataset):
    columns = ResponseDataset.columns.values
    categories = read_LIWC_categories()
    #print columns
    count = 0
    feature_vector = {}
    column_ignore_list=["id","batch"]
    ##TODO:add exception handling insted
    user_ignore_list=["usmt12","usmt34","nan","usmt37","ukbr1",
                      "ukbr7","ukbr25","ukbr41","ukbr73","us12","us15",
                      "us39","p3","p9","p18","p35","p41","p46","p53","p62",
                      "p69","p72","p76","p91","j6","j9","j10","j17","j25",
                      "j26","j30","j36","j42","j43","j45","j75","j90","j111",
                     "j113","j115","j117","j118","j120","j124","j126","j127","j128","j130"]
    for row in range(ResponseDataset.shape[0]):
    #iterate over each row to get categories for every user response and followups
        userfeatureDict = {}
        user = str(ResponseDataset["id"][row])
        if user in user_ignore_list:
            continue
        print "getting features for id:",user
        for column in columns.tolist():
            if column in column_ignore_list:
                continue
            text = ResponseDataset[column][row]
            words = cleanup_text(text)
            #print words
            features = get_LIWC_features(words,categories)
            #print column,":",features
            userfeatureDict[str(column)]=features
        feature_vector[user] = userfeatureDict
    return feature_vector

#pd.DataFrame(featureDict.items(), columns=["text","featurecount"])
    
def create_feature_df(feature_matrix):
    user_ids = []
    frames = []
    for user_id, d in feature_matrix.iteritems():
        user_ids.append(user_id)
        frames.append(pd.DataFrame.from_dict(d, orient='index'))
    
    feature_df = pd.concat(frames, keys=user_ids)
    return feature_df

def processResponses(responsesFilePath):
    ResponseDf = pd.ExcelFile(responsesFilePath)
    ResponseDataset = ResponseDf.parse(0)
    feature_vector = get_data_feature_vector(ResponseDataset)
    feature_vector_df = create_feature_df(feature_vector)
    return feature_vector_df

In [7]:
def encodeBinary(x, threshold=5):
    
    """
    encodes value into a binary variable i.e. if value > threshold: return 1, else return 0
    Using this function to encode the entire Personality Dataframe
    
    Arguments: value, threshold(optional)<default=5>
    
    Output: binary value- 0 or 1
    
    """
    if(x>threshold):
        return 1
    else:
        return 0

In [8]:
def encodeDF(personalityFilePath):
    """
    encodes the personality score data into binary labels.  
    
    Arguments: path for the Personality Score csv file
    
    Output: Data frame with binary labels against each big 5 personality trait for each user
    
    """
    persData=pd.read_csv(personalityFilePath, header=0)
    persData=persData.iloc[:,[0, 131,132,133,134,135]]
    persData=persData[persData.Openness!="#NULL!"] #Removing rows with no Personality form responses
    persData[persData.columns[1:]]=persData[persData.columns[1:]].apply(pd.to_numeric)
    for i in persData.columns[1:]:
        persData[i]=persData[i].map(encodeBinary)
    return persData

In [9]:
responsesFilePath="data-qualitative responses across cultures_fordan.xlsx"
personalityFilePath="persData.csv"

In [10]:
liwcData=processResponses(responsesFilePath)
persData=encodeDF(personalityFilePath)

getting features for id: usmt1
getting features for id: usmt2




getting features for id: usmt3
getting features for id: usmt4
getting features for id: usmt5
getting features for id: usmt6
getting features for id: usmt7
getting features for id: usmt8
getting features for id: usmt9
getting features for id: usmt10
getting features for id: usmt11
getting features for id: usmt23
getting features for id: usmt24
getting features for id: usmt25
getting features for id: usmt28
getting features for id: usmt29
getting features for id: usmt30
getting features for id: usmt31
getting features for id: usmt32
getting features for id: usmt33
getting features for id: usmt35
getting features for id: usmt36
getting features for id: usmt38
getting features for id: usmt39
getting features for id: usmt40
getting features for id: usmt41
getting features for id: usmt42
getting features for id: usmt50
getting features for id: usmt51
getting features for id: usmt52
getting features for id: usmt53
getting features for id: usmt54
getting features for id: indiap1
getting featur

In [19]:
print persData.head(10)
print liwcData.head(0)

    id  Openness  Extraversion  Agreeableness  Conscientiousness  \
0  c10         1             1              0                  0   
1  c11         1             1              0                  1   
2  c12         1             0              0                  1   
3  c13         1             0              1                  1   
4  c14         1             0              0                  0   
5  c15         1             0              0                  0   
6  c16         1             1              0                  0   
7   c2         1             1              0                  0   
8   c3         1             0              0                  0   
9   c4         1             1              0                  1   

   EmotionalStability  
0                   1  
1                   0  
2                   0  
3                   1  
4                   0  
5                   0  
6                   0  
7                   0  
8                   1  
9          

In [13]:
# Merging LIWC and Personality data to create final dataset

finalData=pd.merge(liwcData,persData, left_on='id', right_on='id', how='left')

KeyError: 'id'

In [None]:
print finalData.head()