In [1]:
# importing packages
import numpy as np
import pandas as pd
import os
import re
import plotly.express as px


# For pos tagging
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# sentiment scoring
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from sklearn.preprocessing import MinMaxScaler

In [2]:
df=pd.read_csv("D:\\data\\clean_data_1.csv")

In [3]:
# Check top 5 rows
df.head()

Unnamed: 0,type,is_Extrovert,is_Sensing,is_Thinking,is_Judging,posts,clean_posts
0,INFJ,0,0,0,1,'When asked of the things you wish you did ear...,asked thing wish earlier find answer...
1,INFJ,0,0,0,1,'I love both and they are equally important to...,love equally important music window so...
2,INFJ,0,0,0,1,Really? You think implying that everyone who i...,really think implying everyone entrepreneur...
3,ENFJ,1,0,0,1,'Love is a crazy thing. Se is our best form ...,love crazy thing best form communicat...
4,INTP,0,0,1,0,'I am a physics undergrad with a computation e...,physic undergrad computation emphasis ...


In [4]:
# check info of data set
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8588 entries, 0 to 8587
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          8588 non-null   object
 1   is_Extrovert  8588 non-null   int64 
 2   is_Sensing    8588 non-null   int64 
 3   is_Thinking   8588 non-null   int64 
 4   is_Judging    8588 non-null   int64 
 5   posts         8588 non-null   object
 6   clean_posts   8588 non-null   object
dtypes: int64(4), object(3)
memory usage: 469.8+ KB


In [5]:
# checking the number of rows and columns
df.shape

(8588, 7)

In [6]:
# check null values
df.isna().sum()

type            0
is_Extrovert    0
is_Sensing      0
is_Thinking     0
is_Judging      0
posts           0
clean_posts     0
dtype: int64

In [7]:
df.describe()

Unnamed: 0,is_Extrovert,is_Sensing,is_Thinking,is_Judging
count,8588.0,8588.0,8588.0,8588.0
mean,0.230438,0.137983,0.459013,0.395901
std,0.421138,0.344902,0.498346,0.489072
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.0,0.0,1.0,1.0
max,1.0,1.0,1.0,1.0


In [8]:
# See number of columns with numerical features
df.select_dtypes(include=np.number).columns.tolist()

['is_Extrovert', 'is_Sensing', 'is_Thinking', 'is_Judging']

In [9]:
# See number of columns with categorical features
df.select_dtypes(include=['object']).columns.tolist()

['type', 'posts', 'clean_posts']

## Sentiment Score

In [10]:
# polarity_scores method of SentimentIntensityAnalyzer object gives a sentiment dictionary which contains pos, neg, neu, and compound scores.
analyzer = SentimentIntensityAnalyzer()

nlp_sentiment_score = []

for post in df["clean_posts"]:
    score = analyzer.polarity_scores(post)
    nlp_sentiment_score.append(score)

In [11]:
print(nlp_sentiment_score)

[{'neg': 0.086, 'neu': 0.599, 'pos': 0.314, 'compound': 0.9996}, {'neg': 0.086, 'neu': 0.465, 'pos': 0.45, 'compound': 0.9999}, {'neg': 0.19, 'neu': 0.546, 'pos': 0.265, 'compound': 0.991}, {'neg': 0.168, 'neu': 0.505, 'pos': 0.327, 'compound': 0.9994}, {'neg': 0.11, 'neu': 0.572, 'pos': 0.318, 'compound': 0.9994}, {'neg': 0.146, 'neu': 0.587, 'pos': 0.267, 'compound': 0.9985}, {'neg': 0.163, 'neu': 0.576, 'pos': 0.261, 'compound': 0.9976}, {'neg': 0.099, 'neu': 0.555, 'pos': 0.346, 'compound': 0.9996}, {'neg': 0.181, 'neu': 0.52, 'pos': 0.299, 'compound': 0.9987}, {'neg': 0.126, 'neu': 0.651, 'pos': 0.223, 'compound': 0.9961}, {'neg': 0.152, 'neu': 0.576, 'pos': 0.272, 'compound': 0.9971}, {'neg': 0.095, 'neu': 0.674, 'pos': 0.231, 'compound': 0.9895}, {'neg': 0.042, 'neu': 0.659, 'pos': 0.299, 'compound': 0.9995}, {'neg': 0.132, 'neu': 0.625, 'pos': 0.243, 'compound': 0.9981}, {'neg': 0.118, 'neu': 0.658, 'pos': 0.224, 'compound': 0.9979}, {'neg': 0.153, 'neu': 0.542, 'pos': 0.305, '

In [12]:
# segregating the indiviual sentiment scores - compound, positive, negative and neutral
df['compound_sentiment']=[score['compound'] for score in nlp_sentiment_score]
df['pos_sentiment']=[score['pos'] for score in nlp_sentiment_score]
df['neg_sentiment']=[score['neg'] for score in nlp_sentiment_score]
df['neutral_sentiment']=[score['neu'] for score in nlp_sentiment_score]

In [13]:
# Convert negative values into positive values

scaler = MinMaxScaler()

df['pos_sentiment'] = scaler.fit_trsfanorm(np.array(df['pos_sentiment']).reshape(-1,1))
df['compound_sentiment']=scaler.fit_transform(np.array(df['compound_sentiment']).reshape(-1,1))
df['neg_sentiment']=scaler.fit_transform(np.array(df['neg_sentiment']).reshape(-1,1))
df['neutral_sentiment']=scaler.fit_transform(np.array(df['neutral_sentiment']).reshape(-1,1))

In [14]:
df.head()

Unnamed: 0,type,is_Extrovert,is_Sensing,is_Thinking,is_Judging,posts,clean_posts,compound_sentiment,pos_sentiment,neg_sentiment,neutral_sentiment
0,INFJ,0,0,0,1,'When asked of the things you wish you did ear...,asked thing wish earlier find answer...,0.9998,0.419786,0.134585,0.652505
1,INFJ,0,0,0,1,'I love both and they are equally important to...,love equally important music window so...,0.99995,0.601604,0.134585,0.506536
2,INFJ,0,0,0,1,Really? You think implying that everyone who i...,really think implying everyone entrepreneur...,0.995499,0.354278,0.29734,0.594771
3,ENFJ,1,0,0,1,'Love is a crazy thing. Se is our best form ...,love crazy thing best form communicat...,0.9997,0.437166,0.262911,0.550109
4,INTP,0,0,1,0,'I am a physics undergrad with a computation e...,physic undergrad computation emphasis ...,0.9997,0.425134,0.172144,0.623094


## POS TAGGING

**list of the tags, what they mean, and some examples:**
1. CC coordinating conjunction 
2. CD cardinal digit 
3. DT determiner 
4. EX existential there (like: “there is” … think of it like “there exists”) 
5. FW foreign word 
6. IN preposition/subordinating conjunction 
7. JJ adjective – ‘big’ 
8. JJR adjective, comparative – ‘bigger’ 
9. JJS adjective, superlative – ‘biggest’ 
10. LS list marker 1) 
11. MD modal – could, will 
12. NN noun, singular ‘- desk’ 
13. NNS noun plural – ‘desks’ 
14. NNP proper noun, singular – ‘Harrison’ 
15. NNPS proper noun, plural – ‘Americans’ 
16. PDT predeterminer – ‘all the kids’ 
17. POS possessive ending parent’s 
18. PRP personal pronoun –  I, he, she 
19. PRP possessive pronoun – my, his, hers 
20. RB adverb – very, silently, 
21. RBR adverb, comparative – better 
22. RBS adverb, superlative – best 
23. RP particle – give up 
24. TO – to go ‘to’ the store. 
25. UH interjection – errrrrrrrm 
26. VB verb, base form – take 
27. VBD verb, past tense – took 
28. VBG verb, gerund/present participle – taking 
29. VBN verb, past participle – taken 
30. VBP verb, sing. present, non-3d – take 
31. VBZ verb, 3rd person sing. present – takes 
32. WDT wh-determiner – which 
33. WP wh-pronoun – who, what 
34. WP$ possessive wh-pronoun, eg- whose 
35. WRB wh-adverb, eg- where, when

In [15]:
# creating tag_posts column that will have each post as a separate list in a row. tag_posts will be a list of 50 lists.

# replacing urls with domain name
df["tag_posts"] = df["posts"].str.replace(
    re.compile(r"https?:\/\/(www)?.?([A-Za-z_0-9-]+)([\S])*"),
    lambda match: match.group(2),
)

# replacing ||| with space
df["tag_posts"] = [
    post for post in df["tag_posts"].str.split("\|\|\|")
]


In [16]:
# Word tokenizers is used to find the words and punctuation in a string
# Using a Tagger. Which is part-of-speech tagger or POS-tagger.


nltk.download('averaged_perceptron_tagger')
  
df["tagged_words"] = df["tag_posts"].apply(
    lambda x: [nltk.pos_tag(word_tokenize(line)) for line in x]
)


[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [WinError 10060] A connection attempt failed because
[nltk_data]     the connected party did not properly respond after a
[nltk_data]     period of time, or established connection failed
[nltk_data]     because connected host has failed to respond>


In [17]:

# creating list of unique POS tags
tag_set = set()

for i, data in df["tagged_words"].iteritems():
    for tup in data[0]:
        tag_set.add(tup[1])

tag_list = list(tag_set)

In [18]:
tag_set

{'#',
 '$',
 "''",
 '(',
 ')',
 ',',
 '.',
 ':',
 'CC',
 'CD',
 'DT',
 'EX',
 'FW',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SYM',
 'TO',
 'UH',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 'WP$',
 'WRB',
 '``'}

In [19]:
# applying std and mean

def pos_cat(x, tag):
    return [len([y for y in line if y[1] == tag]) for line in x]


for col in tag_list:
    df["POS_" + col + "_mean"] = df["tagged_words"].apply(
        lambda x: np.mean(pos_cat(x, col))
    )
    df["POS_" + col + "_std"] = df["tagged_words"].apply(
        lambda x: np.std(pos_cat(x, col))
    )

In [20]:

# grouping pos tags based on stanford list
tags_dict = {
    "ADJ": ["JJ", "JJR", "JJS"],
    "ADP": ["EX", "TO"],
    "ADV": ["RB", "RBR", "RBS", "WRB"],
    "CONJ": ["CC", "IN"],
    "DET": ["DT", "PDT", "WDT"],
    "NOUN": ["NN", "NNS", "NNP", "NNPS"],
    "NUM": ["CD"],
    "PRT": ["RP"],
    "PRON": ["PRP", "PRP$", "WP", "WP$"],
    "VERB": ["MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"],
    ".": ["#", "$", "''", "(", ")", ",", ".", ":"],
    "X": ["FW", "LS", "UH"],
}

In [21]:
# Stanford POS tag stats

def stanford_tag(x, tag):
    tags_list = [len([y for y in line if y[1] in tags_dict[col]]) for line in x]
    return tags_list


for col in tags_dict.keys():
    df[col + "_avg"] = df["tagged_words"].apply(
        lambda x: np.median(stanford_tag(x, col))
    )


  df[col + "_avg"] = df["tagged_words"].apply(
  df[col + "_avg"] = df["tagged_words"].apply(
  df[col + "_avg"] = df["tagged_words"].apply(
  df[col + "_avg"] = df["tagged_words"].apply(
  df[col + "_avg"] = df["tagged_words"].apply(
  df[col + "_avg"] = df["tagged_words"].apply(
  df[col + "_avg"] = df["tagged_words"].apply(
  df[col + "_avg"] = df["tagged_words"].apply(
  df[col + "_avg"] = df["tagged_words"].apply(
  df[col + "_avg"] = df["tagged_words"].apply(


In [22]:
df.head()

Unnamed: 0,type,is_Extrovert,is_Sensing,is_Thinking,is_Judging,posts,clean_posts,compound_sentiment,pos_sentiment,neg_sentiment,...,ADV_avg,CONJ_avg,DET_avg,NOUN_avg,NUM_avg,PRT_avg,PRON_avg,VERB_avg,._avg,X_avg
0,INFJ,0,0,0,1,'When asked of the things you wish you did ear...,asked thing wish earlier find answer...,0.9998,0.419786,0.134585,...,4.0,5.0,3.0,6.0,0.0,0.0,4.0,8.0,5.0,0.0
1,INFJ,0,0,0,1,'I love both and they are equally important to...,love equally important music window so...,0.99995,0.601604,0.134585,...,3.0,5.0,2.0,5.0,0.0,0.0,5.0,8.0,3.0,0.0
2,INFJ,0,0,0,1,Really? You think implying that everyone who i...,really think implying everyone entrepreneur...,0.995499,0.354278,0.29734,...,2.0,3.0,2.0,5.0,0.0,0.0,4.0,7.0,4.0,0.0
3,ENFJ,1,0,0,1,'Love is a crazy thing. Se is our best form ...,love crazy thing best form communicat...,0.9997,0.437166,0.262911,...,3.0,4.0,2.5,6.0,0.0,0.0,5.0,7.5,5.0,0.0
4,INTP,0,0,1,0,'I am a physics undergrad with a computation e...,physic undergrad computation emphasis ...,0.9997,0.425134,0.172144,...,2.0,4.0,2.0,6.0,0.0,0.0,3.0,5.0,4.0,0.0


In [23]:
df.to_csv("D:\\data\\clean_data_2.csv",index=False)