In [1]:
import pandas as pd
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

# initialize nlp
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')
nlp.select_pipes(disable=['tok2vec','tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner'])

json_files = ['buprenorphine.json', 'codeine.json', 'heroin.json', 'hydrocodone.json', 'hydromorphone.json', 
               'methadone.json', 'morphine.json', 'opium-poppy.json', 'oxycodone.json', 'oxymorphone.json', 'tramadol.json']

df = pd.DataFrame()
dfs = []
for file in json_files:
    dfs.append(pd.read_json(file))

df = pd.concat(dfs)

# remove commas from rep points values
df['rep_points'] = df['rep_points'].str.replace(',', '').astype(int)

df.head()

Unnamed: 0,username,post_content,rank,rep_points,messages,join_date,country_of_origin,date,post_title,post_type
0,DanDone,Hi. I have a small codeine phosphate habit - I...,Newbie,10,8,"Mar 16, 2016",from U.K.,2023-10-18,Buprenorphine patch for codeine withdrawals,Question
1,TheBigBadWolf,Hallo @Dan.\nTrying to get yourself straight a...,Gold Member,16967,9563,"Apr 11, 2010",from Germany,2023-10-18,Buprenorphine patch for codeine withdrawals,Question
2,DanDone,Thank you. I do have some pregabalin so will c...,Newbie,10,8,"Mar 16, 2016",from U.K.,2023-10-18,Buprenorphine patch for codeine withdrawals,Question
3,TheBigBadWolf,That's a good decision. \nAs I Had Said bupren...,Gold Member,16967,9563,"Apr 11, 2010",from Germany,2023-10-18,Buprenorphine patch for codeine withdrawals,Question
4,Archangel Zadkiel,Thanks for the shout out @TheBigBadWolf long t...,Palladium Member,3445,1936,"Apr 3, 2012","44\ny/o\nfrom Cincinnati, Ohio",2023-10-19,Buprenorphine patch for codeine withdrawals,Question


Check to see if users with different ranks tend to have different sentiment in the way they speak. Lower 25th percentile is from 0 to 10, 50th percentile is from 11 to 80, 75th percentile is from 81 to 848, max is 18340. Looking at sentiment for the 25th percentile, most polarity is around 0. 

In [30]:
# percentile25 = df[(df['rep_points'] >= 0) & (df['rep_points'] <= 10)]

# Testing NLP
text = 'I think we should not do that. It is not a good idea in my opinion'
doc = nlp(text)
doc._.blob.sentiment

percentile_25 = df[(df['rep_points'] >= 0) & (df['rep_points'] <= 10)]['post_content']
# loop through and find average sentiment
print('Length of 25th percentile: ', len(percentile_25))
p25_avg_polarity = 0
for msg in percentile_25[30000:31000]:
    pol = float(nlp(msg)._.blob.sentiment.polarity)
    p25_avg_polarity += pol
    print('Polarity: ', pol)

p25_avg_polarity = p25_avg_polarity / len(percentile_25)
print(p25_avg_polarity)
print('\n\n\n')


#percentile_50 = df[(df['rep_points'] > 10) & (df['rep_points'] <= 80)]['post_content'].apply(lambda text: nlp(text)._.blob.sentiment)
#percentile_50.describe()
percentile_50 = df[(df['rep_points'] >= 11) & (df['rep_points'] <= 80)]['post_content']
print('Length of 50th percentile: ', len(percentile_50))
p50_avg_polarity = 0
for msg in percentile_50:
    pol = float(nlp(msg)._.blob.sentiment.polarity)
    p50_avg_polarity += pol
    print('Polarity: ', pol)

p50_avg_polarity = p50_avg_polarity / len(percentile_50)
print(p50_avg_polarity)

#percentile_75 = df[(df['rep_points'] > 80) & (df['rep_points'] <= 848)]['post_content'].apply(lambda text: nlp(text)._.blob.sentiment)
#percentile_75.describe()

#percentile_100 = df[(df['rep_points'] > 848)]['post_content'].str.astype(str).apply(lambda text: nlp(text)._.blob.sentiment)
#percentile_100.describe()

Length of 50th percentile:  29869
Polarity:  0.1252323107926556
Polarity:  0.008233333333333344
Polarity:  -0.17857142857142858
Polarity:  0.55
Polarity:  0.11645833333333334
Polarity:  0.15244791666666668
Polarity:  0.0898396164021164
Polarity:  -0.12199074074074077
Polarity:  0.11786075036075036
Polarity:  -0.16764705882352943
Polarity:  -0.18409090909090908
Polarity:  0.2
Polarity:  0.06419753086419752
Polarity:  0.0
Polarity:  0.1
Polarity:  -0.05449735449735451
Polarity:  0.1478787878787879
Polarity:  0.07500000000000001
Polarity:  -0.06666666666666667
Polarity:  0.2232142857142857
Polarity:  0.24833333333333332
Polarity:  0.19857142857142862
Polarity:  -0.27499999999999997
Polarity:  0.6
Polarity:  0.24944444444444444
Polarity:  0.09183673469387754
Polarity:  -0.13333333333333333
Polarity:  -0.2
Polarity:  0.225
Polarity:  0.13592592592592592
Polarity:  0.3625
Polarity:  0.1875
Polarity:  0.32291666666666663
Polarity:  0.25527777777777777
Polarity:  0.11960393407761831
Polarity: 

KeyboardInterrupt: 

In [6]:
percentile_50 = df[(df['rep_points'] >= 11) & (df['rep_points'] <= 80)]['post_content']
print('Length of 50th percentile: ', len(percentile_50))
p50_avg_polarity = 0
for msg in percentile_50[25000:27000]:
    pol = float(nlp(msg)._.blob.sentiment.polarity)
    p50_avg_polarity += pol
    print('Polarity: ', pol)

p50_avg_polarity = p50_avg_polarity / len(percentile_50)
print(p50_avg_polarity)

Length of 50th percentile:  29869
Polarity:  0.14740740740740743
Polarity:  0.04875514403292182
Polarity:  0.13898809523809522
Polarity:  0.04360243055555558
Polarity:  -0.0019480519480519368
Polarity:  0.2
Polarity:  0.1730739421868454
Polarity:  0.17666666666666667
Polarity:  0.20490196078431375
Polarity:  0.0757563025210084
Polarity:  0.05204783950617289
Polarity:  0.25
Polarity:  0.23035714285714284
Polarity:  0.08333333333333333
Polarity:  0.0269607843137255
Polarity:  0.20181818181818184
Polarity:  -0.125
Polarity:  0.15
Polarity:  0.21666666666666665
Polarity:  0.2
Polarity:  0.13636363636363635
Polarity:  -0.5
Polarity:  -0.08384920634920637
Polarity:  0.18
Polarity:  0.1714844990079365
Polarity:  0.0
Polarity:  0.25069444444444444
Polarity:  0.5
Polarity:  -0.045454545454545456
Polarity:  -0.03587454212454212
Polarity:  0.07942708333333336
Polarity:  0.0
Polarity:  -0.02642857142857144
Polarity:  0.23863636363636365
Polarity:  0.006250000000000026
Polarity:  0.2903703703703704

In [None]:
percentile_75 = df[(df['rep_points'] >= 81) & (df['rep_points'] <= 848)]['post_content']
print('Length of 75th percentile: ', len(percentile_75))
p75_avg_polarity = 0
for msg in percentile_75[25000:27000]:
    pol = float(nlp(msg)._.blob.sentiment.polarity)
    p75_avg_polarity += pol
    print('Polarity: ', pol)

p75_avg_polarity = p75_avg_polarity / len(percentile_75)
print(p75_avg_polarity)

Analyze Relationship between user sentiment and timeframe:

In [5]:
import datetime
import matplotlib.pyplot as plt

df['date']
for i, row in df['date'].items():
    row = str(row)
    year = row[:4]
    month = row[5:7]
    day = row[8:10]
    date = datetime.datetime(int(year), int(month), int(day))

dates = df['date'].apply(lambda text: datetime.datetime(int(str(text)[:4]), int(str(text)[5:7]), int(str(text)[8:10])))

dates = dates.sort_values()

dates_count = dates.value_counts()
print(type(dates_count))
for i in dates_count:
    print(i)


<class 'pandas.core.series.Series'>
112
102
99
96
92
92
92
91
90
90
89
88
88
87
87
84
83
83
83
82
82
80
80
79
78
78
78
78
77
77
77
77
77
76
76
76
75
75
75
73
73
73
73
73
73
73
73
72
72
72
71
71
71
71
71
71
70
70
70
70
70
70
69
69
69
69
69
68
68
68
68
68
68
68
67
67
67
67
67
67
66
66
66
66
66
66
65
65
65
65
65
65
65
65
64
64
64
64
64
64
64
64
64
64
64
64
64
64
63
63
63
63
63
63
63
63
63
63
63
63
63
62
62
62
62
62
62
62
62
62
62
62
62
62
62
62
62
62
62
61
61
61
61
61
61
61
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
59
59
59
59
59
59
59
59
59
59
59
59
59
59
59
59
59
59
58
58
58
58
58
58
58
58
58
58
58
58
58
58
58
58
58
58
58
58
57
57
57
57
57
57
57
57
57
57
57
57
57
57
57
56
56
56
56
56
56
56
56
56
56
56
56
56
56
56
56
56
56
56
56
56
56
56
56
56
56
56
55
55
55
55
55
55
55
55
55
55
55
55
55
55
55
55
55
55
55
55
55
55
55
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53