In [1]:
import nltk


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [26]:
df = pd.read_csv("./dataset/Adulting1012016-2021.csv")
df1=df[['selftext','num_comments','score','title']]
df1=df1.dropna(subset=['selftext'])
#drop meaningless text
df1=df1[(df1.selftext!='[removed]') & (df1.selftext!='[deleted]')]
#drop duplicate
duplicate = df1.duplicated()
print('There are', duplicate.sum(), 'duplicated values.')
df1=df1.drop_duplicates()

df1

There are 3 duplicated values.


Unnamed: 0,selftext,num_comments,score,title
0,Is it weird for an adult to wear a high school...,2,1,state championship ring
2,You’re moving out. Great! Whether it’s from ho...,0,11,[101] Leases and Roommates 101
3,Hey! You just moved into your first apartment...,3,13,"[101]Kitchen 101: What You Really, Really Need..."
4,This is the first part of a series of posts I’...,3,11,[101]What to look at before renting your first...
6,This is a place to learn all of the simple thi...,0,9,Welcome to /r/adulting101!
...,...,...,...,...
279,"Hi all, I’m 19 and just scored my first time j...",6,1,Just for my first full time job! Is it worth c...
283,Hi all so this is my first post here and I’m a...,3,1,I (19M) just got my first full time job at 40+...
284,How does one go about getting their first apar...,4,1,Lacking in skills and need advice asap!
285,so just before the pandemic kicked into high g...,5,1,Moved in with BF’s family. Not sure what to do...


In [27]:
import re
def remove_other(x):
    x = re.sub("\$"," ", x) #remove $
    x = re.sub("https*\S+", " ", x) #remove url
    x = re.sub("\n",'', x)#remove newlines
    x = re.sub("\*",'',x)
    return x

In [28]:
df1['selftext']=df1['selftext'].apply(lambda x: remove_other(x))
df1['selftext'].iloc[0]

'Is it weird for an adult to wear a high school championship ring?'

In [29]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

#create a list of dictionaries
sia = SIA()
results = []

for line in df1['selftext']:
    pol_score = sia.polarity_scores(line)
    pol_score['selftext'] = line
    results.append(pol_score)

results[:3]

[{'neg': 0.116,
  'neu': 0.685,
  'pos': 0.199,
  'compound': 0.296,
  'selftext': 'Is it weird for an adult to wear a high school championship ring?'},
 {'neg': 0.088,
  'neu': 0.814,
  'pos': 0.098,
  'compound': 0.9594,
  'selftext': "You’re moving out. Great! Whether it’s from home or from the dorms into off campus housing there are few things you should know about renting an apartment or house. Basics The monthly cost to live not just the rental price. That is the baseline. The true cost is is rent + internet/cable + electricity + water + heat +...+…+… Your monthly budget should account for all of these expenses. Some properties include some or all utilities, others do not. Factor that in when comparing rental costs. You can ask the utility what the average monthly bill at your address was in previous years when you sign up to get an idea of the cost. Most landlords insist that all tenants sign the same lease. This makes you and your friends/frenemies/randos you find through Craig

In [30]:
#create a df to write in the results of sentiment analysis
sent_selftext = pd.DataFrame(results)
sent_selftext

Unnamed: 0,neg,neu,pos,compound,selftext
0,0.116,0.685,0.199,0.2960,Is it weird for an adult to wear a high school...
1,0.088,0.814,0.098,0.9594,You’re moving out. Great! Whether it’s from ho...
2,0.022,0.853,0.125,0.9983,Hey! You just moved into your first apartment...
3,0.026,0.884,0.089,0.9966,This is the first part of a series of posts I’...
4,0.057,0.841,0.102,0.9118,This is a place to learn all of the simple thi...
...,...,...,...,...,...
193,0.047,0.867,0.086,0.9495,"Hi all, I’m 19 and just scored my first time j..."
194,0.036,0.863,0.101,0.9691,Hi all so this is my first post here and I’m a...
195,0.068,0.684,0.248,0.7219,How does one go about getting their first apar...
196,0.054,0.883,0.063,0.7249,so just before the pandemic kicked into high g...


In [31]:
results = []

for line in df1['title']:
    pol_score = sia.polarity_scores(line)
    pol_score['title'] = line
    results.append(pol_score)

results[:3]

[{'neg': 0.0,
  'neu': 0.408,
  'pos': 0.592,
  'compound': 0.4404,
  'title': 'state championship ring'},
 {'neg': 0.0,
  'neu': 1.0,
  'pos': 0.0,
  'compound': 0.0,
  'title': '[101] Leases and Roommates 101'},
 {'neg': 0.0,
  'neu': 1.0,
  'pos': 0.0,
  'compound': 0.0,
  'title': '[101]Kitchen 101: What You Really, Really Need for Your First Apartment'}]

In [32]:
sent_title = pd.DataFrame(results)
sent_title

Unnamed: 0,neg,neu,pos,compound,title
0,0.000,0.408,0.592,0.4404,state championship ring
1,0.000,1.000,0.000,0.0000,[101] Leases and Roommates 101
2,0.000,1.000,0.000,0.0000,"[101]Kitchen 101: What You Really, Really Need..."
3,0.000,1.000,0.000,0.0000,[101]What to look at before renting your first...
4,0.000,0.378,0.622,0.5093,Welcome to /r/adulting101!
...,...,...,...,...,...
193,0.000,0.919,0.081,0.2942,Just for my first full time job! Is it worth c...
194,0.000,1.000,0.000,0.0000,I (19M) just got my first full time job at 40+...
195,0.000,1.000,0.000,0.0000,Lacking in skills and need advice asap!
196,0.131,0.869,0.000,-0.2411,Moved in with BF’s family. Not sure what to do...


In [33]:
#labeling the sentiment of selftext
sent_selftext['label']=0
sent_selftext['label'].loc[sent_selftext['compound']>0.2]=1
sent_selftext['label'].loc[sent_selftext['compound']< -0.2]=-1
sent_selftext

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,neg,neu,pos,compound,selftext,label
0,0.116,0.685,0.199,0.2960,Is it weird for an adult to wear a high school...,1
1,0.088,0.814,0.098,0.9594,You’re moving out. Great! Whether it’s from ho...,1
2,0.022,0.853,0.125,0.9983,Hey! You just moved into your first apartment...,1
3,0.026,0.884,0.089,0.9966,This is the first part of a series of posts I’...,1
4,0.057,0.841,0.102,0.9118,This is a place to learn all of the simple thi...,1
...,...,...,...,...,...,...
193,0.047,0.867,0.086,0.9495,"Hi all, I’m 19 and just scored my first time j...",1
194,0.036,0.863,0.101,0.9691,Hi all so this is my first post here and I’m a...,1
195,0.068,0.684,0.248,0.7219,How does one go about getting their first apar...,1
196,0.054,0.883,0.063,0.7249,so just before the pandemic kicked into high g...,1


In [34]:
sent_selftext.to_csv('sent_selftext_Adulting101.csv', mode='a', encoding='utf-8', index=False)

In [35]:
sent_title['label']=0
sent_title['label'].loc[sent_title['compound']>0.2]=1
sent_title['label'].loc[sent_title['compound']< -0.2]=-1
sent_title

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,neg,neu,pos,compound,title,label
0,0.000,0.408,0.592,0.4404,state championship ring,1
1,0.000,1.000,0.000,0.0000,[101] Leases and Roommates 101,0
2,0.000,1.000,0.000,0.0000,"[101]Kitchen 101: What You Really, Really Need...",0
3,0.000,1.000,0.000,0.0000,[101]What to look at before renting your first...,0
4,0.000,0.378,0.622,0.5093,Welcome to /r/adulting101!,1
...,...,...,...,...,...,...
193,0.000,0.919,0.081,0.2942,Just for my first full time job! Is it worth c...,1
194,0.000,1.000,0.000,0.0000,I (19M) just got my first full time job at 40+...,0
195,0.000,1.000,0.000,0.0000,Lacking in skills and need advice asap!,0
196,0.131,0.869,0.000,-0.2411,Moved in with BF’s family. Not sure what to do...,-1


In [36]:
sent_title.columns=['t_neg','t_neu','t_pos','t_compound','title','t_label']
sent_title

Unnamed: 0,t_neg,t_neu,t_pos,t_compound,title,t_label
0,0.000,0.408,0.592,0.4404,state championship ring,1
1,0.000,1.000,0.000,0.0000,[101] Leases and Roommates 101,0
2,0.000,1.000,0.000,0.0000,"[101]Kitchen 101: What You Really, Really Need...",0
3,0.000,1.000,0.000,0.0000,[101]What to look at before renting your first...,0
4,0.000,0.378,0.622,0.5093,Welcome to /r/adulting101!,1
...,...,...,...,...,...,...
193,0.000,0.919,0.081,0.2942,Just for my first full time job! Is it worth c...,1
194,0.000,1.000,0.000,0.0000,I (19M) just got my first full time job at 40+...,0
195,0.000,1.000,0.000,0.0000,Lacking in skills and need advice asap!,0
196,0.131,0.869,0.000,-0.2411,Moved in with BF’s family. Not sure what to do...,-1


In [37]:
sent_title.to_csv('sent_title_Adulting101.csv', mode='a', encoding='utf-8', index=False)

In [38]:
sent=sent_selftext.join(sent_title)
sent

Unnamed: 0,neg,neu,pos,compound,selftext,label,t_neg,t_neu,t_pos,t_compound,title,t_label
0,0.116,0.685,0.199,0.2960,Is it weird for an adult to wear a high school...,1,0.000,0.408,0.592,0.4404,state championship ring,1
1,0.088,0.814,0.098,0.9594,You’re moving out. Great! Whether it’s from ho...,1,0.000,1.000,0.000,0.0000,[101] Leases and Roommates 101,0
2,0.022,0.853,0.125,0.9983,Hey! You just moved into your first apartment...,1,0.000,1.000,0.000,0.0000,"[101]Kitchen 101: What You Really, Really Need...",0
3,0.026,0.884,0.089,0.9966,This is the first part of a series of posts I’...,1,0.000,1.000,0.000,0.0000,[101]What to look at before renting your first...,0
4,0.057,0.841,0.102,0.9118,This is a place to learn all of the simple thi...,1,0.000,0.378,0.622,0.5093,Welcome to /r/adulting101!,1
...,...,...,...,...,...,...,...,...,...,...,...,...
193,0.047,0.867,0.086,0.9495,"Hi all, I’m 19 and just scored my first time j...",1,0.000,0.919,0.081,0.2942,Just for my first full time job! Is it worth c...,1
194,0.036,0.863,0.101,0.9691,Hi all so this is my first post here and I’m a...,1,0.000,1.000,0.000,0.0000,I (19M) just got my first full time job at 40+...,0
195,0.068,0.684,0.248,0.7219,How does one go about getting their first apar...,1,0.000,1.000,0.000,0.0000,Lacking in skills and need advice asap!,0
196,0.054,0.883,0.063,0.7249,so just before the pandemic kicked into high g...,1,0.131,0.869,0.000,-0.2411,Moved in with BF’s family. Not sure what to do...,-1


In [39]:
sent[(sent['label']==1)&(sent['t_label']==-1)]

Unnamed: 0,neg,neu,pos,compound,selftext,label,t_neg,t_neu,t_pos,t_compound,title,t_label
32,0.125,0.732,0.143,0.8576,"If there is one thing you must know about me, ...",1,0.623,0.377,0.0,-0.5106,Failure at Adulting,-1
48,0.054,0.866,0.08,0.7804,"Copying and pasting from /r/advice.Hi, I'm liv...",1,0.161,0.839,0.0,-0.3182,18 and lost. Need to move out by the end of th...,-1
59,0.0,0.842,0.158,0.9657,I'm looking to start online college courses (w...,1,0.28,0.72,0.0,-0.5423,Would it be a bad idea to buy a laptop on paym...,-1
61,0.089,0.76,0.151,0.9366,I am 27 years old and I am officially on my ow...,1,0.25,0.75,0.0,-0.25,Living alone for the first time ever 😅,-1
107,0.087,0.731,0.182,0.9777,"Hello,The last 12 years I've loved a guy who's...",1,0.231,0.769,0.0,-0.2023,How does one fill an empty void?,-1
131,0.036,0.808,0.157,0.7574,I am a counseling psychology doctoral student ...,1,0.247,0.753,0.0,-0.3182,'Adulting' When the 'Struggle is Real' / disse...,-1
140,0.047,0.885,0.068,0.7636,So I (F19) have just been told by my mother th...,1,0.089,0.911,0.0,-0.3491,Just got yelled at for forgetting that the tag...,-1
142,0.069,0.823,0.108,0.2851,How much does it cost you to live alone in an ...,1,0.304,0.696,0.0,-0.5423,Moving out (of course after all this tragic st...,-1
196,0.054,0.883,0.063,0.7249,so just before the pandemic kicked into high g...,1,0.131,0.869,0.0,-0.2411,Moved in with BF’s family. Not sure what to do...,-1


In [40]:
sent[(sent['label']==-1)&(sent['t_label']==1)]

Unnamed: 0,neg,neu,pos,compound,selftext,label,t_neg,t_neu,t_pos,t_compound,title,t_label
23,0.149,0.762,0.089,-0.9991,Skim if you want but this includes info about ...,-1,0.0,0.69,0.31,0.4019,"I need help when I'm eighteen (""running away"")",1
38,0.077,0.857,0.066,-0.7387,There is a lot of confusion by both landlords ...,-1,0.0,0.676,0.324,0.34,[101] Renting - Security Deposits and Repairs,1
74,0.202,0.764,0.034,-0.9605,"I'm 27F and never had a credit card, but now I...",-1,0.0,0.0,1.0,0.4836,Credit???,1
77,0.132,0.782,0.086,-0.7705,Thank you for your time (in advance)Background...,-1,0.0,0.733,0.267,0.5081,18f Starting College...I Would Really Apprecia...,1
126,0.141,0.787,0.072,-0.8395,Hey! My husbands wisdom teeth are all coming i...,-1,0.0,0.588,0.412,0.743,How do I know what is good dental insurance? W...,1
143,0.147,0.782,0.071,-0.9081,"Hi, I'm 28, but I'm basically a grown up child...",-1,0.0,0.838,0.162,0.4019,[REQUEST] Looking for a Mentor who will help b...,1
145,0.197,0.803,0.0,-0.8338,One observation. But at least we will be able ...,-1,0.102,0.636,0.261,0.5436,CATCH 22: Once a vaccine is made it'll be grea...,1


In [41]:
print("Positive headlines:\n")
sent[sent['label'] == 1].index


Positive headlines:



Int64Index([  0,   1,   2,   3,   4,   5,   6,   8,  13,  14,
            ...
            184, 185, 186, 187, 188, 192, 193, 194, 195, 196],
           dtype='int64', length=123)

In [43]:
sent['selftext'].iloc[196]

'so just before the pandemic kicked into high gear, I went across to the other side of my state (3 hour drive from point A to point B) to visit my boyfriend and his parents for a bit.ended up being stuck here when the shelter in place order first dropped, and by the time travel was allowed again we had all come to the conclusion that, now that we’re basically in the wedding planning stage for (hopefully) next year, and with the virus making travel and work much more difficult for all of us for the foreseeable future, it makes more sense for me to stay with him and his parents permanently on this side of the state until then because even under normal circumstances it was a pain to make time for either of us to visit the other. anyway, so I’ve basically moved in with them. but before this I was living with my parents, and I’ve never lived on my own or moved in with anyone else. apart from packing up and bringing my most essential things with me (which we were able to do last month) I don

In [44]:
print(sent.label.value_counts())

print(sent.label.value_counts(normalize=True) * 100)

 1    123
-1     53
 0     22
Name: label, dtype: int64
 1    62.121212
-1    26.767677
 0    11.111111
Name: label, dtype: float64


In [45]:
print(sent.t_label.value_counts())

print(sent.t_label.value_counts(normalize=True) * 100)

 0    128
 1     46
-1     24
Name: t_label, dtype: int64
 0    64.646465
 1    23.232323
-1    12.121212
Name: t_label, dtype: float64


In [67]:
sent['selftext']=sent['selftext'].str.lower()
def remove_emoji(text):
    text = text.encode('ascii', 'ignore').decode()
    return text

sent['selftext']=sent['selftext'].apply(lambda x: remove_emoji(x))

import re
def remove_other(x):
    x = re.sub("\$"," ", x) #remove $
    x = re.sub("https*\S+", " ", x) #remove url
    x = re.sub("\'\w+", '', x) #remove i'm,we're,let's after the '
    x = re.sub("[0-9]+", '', x) #remove numbers
    x = re.sub("\n",'', x)#remove newlines
    return x

sent['selftext']=sent['selftext'].apply(lambda x: remove_other(x))

from nltk.corpus import stopwords
#nltk.download("stopwords") #uncomment it when run it for the first time
stop_words = set(stopwords.words("english"))

def remove_stopwords(text):
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

sent['selftext']=sent['selftext'].apply(lambda x: remove_stopwords(x))

spec_chars = ["!",'“','"',"#","%","&","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–"]
for char in spec_chars:
    sent['selftext'] = sent['selftext'].str.replace(char, ' ')
    sent['selftext'] = sent['selftext'].str.split().str.join(" ")
    
from nltk.stem import WordNetLemmatizer

def lemmatize(words):
    lemmatizer = WordNetLemmatizer()
    nwords=[]
    for word in words:
        word=lemmatizer.lemmatize(word)
        nwords.append(word)
    return ' '.join(nwords)

sent['selftext']= sent['selftext'].str.split().apply(lambda x: lemmatize(x))

stop_words.update({'would','k','im','could','also',
                   'amp','much','one','like','get',
                   'since','etc','got','always',
                   'know','thing','really','dont',
                   'find','even','go','time','need','want'
                  })
sent['selftext']=sent['selftext'].apply(lambda x: remove_stopwords(x))




In [72]:
import itertools
import collections
def count_words(text):
    all_words = list(itertools.chain(*text.str.split()))
    counts = collections.Counter(all_words)
    counts_df = pd.DataFrame(counts.most_common(100),
                            columns=['words', 'count'])

    return counts_df

#positive words distributions
count_words(sent['selftext'][sent['label']==1])

Unnamed: 0,words,count
0,car,91
1,job,78
2,make,75
3,money,67
4,place,64
...,...,...
95,around,19
96,experience,19
97,two,19
98,pretty,19


In [74]:
count_words(sent['selftext'][sent['label']==-1])

Unnamed: 0,words,count
0,pay,38
1,year,27
2,tenant,27
3,going,26
4,car,26
...,...,...
95,start,8
96,everything,8
97,away,8
98,looking,8


In [77]:
from nltk.util import ngrams
def count_2gram(text):
    all_words = list(itertools.chain(*text.str.split()))
    es2grams = ngrams(all_words, 2)
    counts = collections.Counter(es2grams)
    count_df = pd.DataFrame(counts.most_common(100),
                            columns=['words', 'count'])
    dictionary2 = [' '.join(tup) for tup in count_df.words]
    count_df.words=dictionary2

    return count_df

count_df2=count_2gram(sent['selftext'][sent['label']==1])
count_df3=count_2gram(sent['selftext'][sent['label']==-1])

#positive word distribution
count_df2

Unnamed: 0,words,count
0,credit card,17
1,make sure,14
2,year old,12
3,youre going,10
4,slow cooker,10
...,...,...
95,post guide,3
96,carfax report,3
97,driver license,3
98,take care,3


In [76]:
#Negative word distribution
count_df3

Unnamed: 0,words,count
0,credit card,17
1,make sure,14
2,year old,12
3,youre going,10
4,slow cooker,10
...,...,...
95,post guide,3
96,carfax report,3
97,driver license,3
98,take care,3


In [78]:
def count_3gram(text):
    all_words = list(itertools.chain(*text.str.split()))
    es3grams = ngrams(all_words, 3)
    counts = collections.Counter(es3grams)
    count_df = pd.DataFrame(counts.most_common(100),
                            columns=['words', 'count'])
    dictionary3 = [' '.join(tup) for tup in count_df.words]
    count_df.words=dictionary3

    return count_df


count_df4=count_3gram(sent['selftext'][sent['label']==1])
count_df5=count_3gram(sent['selftext'][sent['label']==-1])

#positive word distribution
count_df4

Unnamed: 0,words,count
0,basic life skill,4
1,please feel free,4
2,buy car insurance,4
3,psychology doctoral student,4
4,new york city,4
...,...,...
95,level advice tip,2
96,advice tip folk,2
97,tip folk starting,2
98,folk starting weve,2


In [79]:
count_df5

Unnamed: 0,words,count
0,normal wear tear,4
1,uniform officer handle,3
2,started trip er,2
3,trip er two,2
4,er two follow,2
...,...,...
95,fixing broke boyfriend,1
96,broke boyfriend recently,1
97,boyfriend recently gonna,1
98,recently gonna move,1


In [89]:
index=sent[sent['selftext'].str.contains("car")].index

In [95]:
#example of calculating average sentiment score of a given word in positive posts
sent.iloc[index][sent['label']==1]['compound'].mean()

  """Entry point for launching an IPython kernel.


0.8337500000000001

In [96]:
#example of calculating average sentiment score of a given word in positive posts
index=sent[sent['selftext'].str.contains("pay")].index
sent.iloc[index][sent['label']==-1]['compound'].mean()

  This is separate from the ipykernel package so we can avoid doing imports until


-0.7845850000000001

In [99]:
index=sent[sent['selftext'].str.contains("credit card")].index
sent.iloc[index][sent['label']==1]['compound'].mean()

  


0.8642571428571427

In [100]:
index=sent[sent['selftext'].str.contains("credit card")].index
sent.iloc[index][sent['label']==-1]['compound'].mean()

  


-0.9605