# Data Cleaning
- This notebook builds a text-cleaning pipeline for Natural Language Processing, using several different methods.

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import databricks.koalas as ks

# Always make it pretty.
plt.style.use('ggplot')

In [2]:
# Load the raw dataset; the preview in the next cell shows a `type` label
# column and a `posts` text column.
raw = pd.read_csv('data/mbti_1 2.csv')

In [3]:
raw.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [4]:
# Corpus-wide token count for the raw posts: split each post on single spaces
# and total the pieces (runs of spaces therefore add empty-string tokens too).
raw['posts'].map(lambda post: len(post.split(' '))).sum()

10953740

In [5]:
from personality import clean_df

In [6]:
# words = pipeline(raw)

In [7]:
# Reload the previously saved frame of per-post token lists instead of
# rebuilding it.
# NOTE(review): the commented-out save below writes 'words_df.pkl' to the
# working directory, while the load reads 'data/words_df.pkl' -- confirm the
# file was moved there. Only unpickle files produced by this project.
# words.to_pickle('words_df.pkl')
words= pd.read_pickle('data/words_df.pkl')
words.head()

Unnamed: 0,type,posts
0,INFJ,"[life-chang, experi, life, may, perc, experi, ..."
1,ENTP,"['m, find, lack, post, alarm, sex, bore, 's, p..."
2,INTP,"[cours, say, know, 's, bless, curs, absolut, p..."
3,INTJ,"[dear, intp, enjoy, convers, day, esoter, gab,..."
4,ENTJ,"[you, re, fire, 's, anoth, silli, misconcept, ..."


In [8]:
# sentences = pipeline(raw)
# sentences.posts = sentences.posts.apply(lambda x: ' '.join(x))
# sentences.to_pickle('sentences.pkl')

In [9]:
# Reload the previously saved frame whose `posts` column holds each post as a
# single space-joined string.
# NOTE(review): the commented-out save in the cell above writes
# 'sentences.pkl' to the working directory, while this reads
# 'data/sentences.pkl' -- confirm the file was moved there. Only unpickle
# files produced by this project.
sentences = pd.read_pickle('data/sentences.pkl')
sentences.head()

Unnamed: 0,type,posts
0,INFJ,what has been the most life-chang experi in yo...
1,ENTP,i 'm find the lack of me in these post veri al...
2,INTP,of cours to which i say i know that 's my bles...
3,INTJ,dear intp i enjoy our convers the other day es...
4,ENTJ,you re fire that 's anoth silli misconcept tha...


In [10]:
# Same single-space token count as for the raw posts, now over the pickled
# `sentences` frame.
sentences['posts'].map(lambda post: len(post.split(' '))).sum()

11465801

In [11]:
# sentences.to_pickle('sentences.pkl')

In [12]:
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer


def text_emotion(df, column, lexicon_path=None, tokenizer=None, stemmer=None):
    '''
    Scores every row of text against the NRC Emotion Lexicon and returns a new
    DataFrame with one extra column per emotion found in the lexicon, each
    holding the summed association values of the row's matching words.

    Each document is tokenized, every token is stemmed, and the stem is looked
    up among the lexicon's words. Lexicon entries themselves are not stemmed,
    so only stems spelled exactly like a lexicon word contribute.

    INPUT:
        df           - DataFrame holding the text; it is not modified
        column       - string, name of the text column in df
        lexicon_path - optional path to the tab-separated, header-less
                       word / emotion / association file; defaults to the
                       NRC v0.92 word-level lexicon under 'Data/'
        tokenizer    - optional callable, text -> iterable of tokens;
                       defaults to nltk's word_tokenize
        stemmer      - optional callable, token -> stem; defaults to the
                       English SnowballStemmer's stem method
    OUTPUT: a new DataFrame with df's columns followed by one column per
            emotion
    '''
    if lexicon_path is None:
        lexicon_path = ('Data/'
                        'NRC-Sentiment-Emotion-Lexicons/'
                        'NRC-Sentiment-Emotion-Lexicons/'
                        'NRC-Emotion-Lexicon-v0.92/'
                        'NRC-Emotion-Lexicon-Wordlevel-v0.92.txt')
    if tokenizer is None:
        tokenizer = word_tokenize
    if stemmer is None:
        stemmer = SnowballStemmer("english").stem

    emolex_df = pd.read_csv(lexicon_path,
                            names=["word", "emotion", "association"],
                            sep='\t')
    # Reshape to one row per word and one column per emotion.
    emolex_words = emolex_df.pivot(index='word',
                                   columns='emotion',
                                   values='association')
    emotions = emolex_words.columns
    score_matrix = emolex_words.to_numpy()
    # A dict lookup per token replaces a boolean-mask scan of the whole
    # lexicon, and yields a plain row of numbers rather than a one-row frame.
    word_scores = dict(zip(emolex_words.index, score_matrix))

    # Accumulate by row position so the totals line up with df regardless of
    # what its index labels look like.
    totals = np.zeros((len(df), len(emotions)), dtype=score_matrix.dtype)
    for position, text in enumerate(df[column]):
        for token in tokenizer(text):
            scores = word_scores.get(stemmer(token))
            if scores is not None:
                totals[position] += scores

    emo_df = pd.DataFrame(totals, index=df.index, columns=emotions)
    return pd.concat([df, emo_df], axis=1)

In [13]:
# emolex_df.head()
# emolex_words.head()
# emotions = emolex_words.columns.drop('word')
# emo_df = pd.DataFrame(0, index=sentences.index, columns=emotions)

In [14]:
# for i,row in sentences.iterrows():
#     print(i)
#     document = word_tokenize(sentences.loc[i]['posts'])
#     for word in document:
#             word = stemmer.stem(word)
#             emo_score = emolex_words[emolex_words.word == word]
#             if not emo_score.empty:
#                 for emotion in list(emotions):
#                     emo_df.at[i, emotion] += emo_score[emotion]

In [15]:
# emo_df.to_pickle('emo_df.pkl')

In [18]:
# Rebuild `sentences` from the raw frame with the project's clean_df helper
# (imported from `personality` above); this replaces the frame that was loaded
# from the pickle earlier in the notebook.
sentences = clean_df(raw)

In [22]:
# Single-space token count over the frame returned by clean_df, for comparison
# with the earlier totals.
sentences['posts'].map(lambda post: len(post.split(' '))).sum()

5459715

In [21]:
# sentences.to_pickle('sent2.pkl')