# Lexicon refining

In [1]:
import json, math
import pandas as pd
from nltk.stem.porter import *

## 1 Read in the lexicon

In [3]:
lexicon_path = '../../from Github/WebSci2019/lexicon.json'

In [4]:
# Read the Farrell's keyword file
with open(lexicon_path) as f:
    lexicon = json.load(f)

## 2 Define two functions

In [5]:
def lexicon_size(lexicon):
    size = 0
    for key in lexicon:
        size = size + len(lexicon[key])
    return size

def delete_elements(lexicon,to_be_deleted):
    for key in lexicon:
        for ele in lexicon[key]:
            if ele in to_be_deleted:
                lexicon[key].remove(ele)
    return lexicon

In [6]:
lexicon_size(lexicon)

1300

## 3 Delete the 'Racism' category

In [7]:
del lexicon['Racism']

In [8]:
lexicon_size(lexicon)

630

## 4 Stemming

In [9]:
ps = PorterStemmer()

lexicon_stemmed = dict()
for key in lexicon:
    ele_list = []
    for ele in lexicon[key]:
        ele_list.append(ps.stem(ele))
    ele_set = set(ele_list)
    lexicon_stemmed[key] = list(ele_set)

In [10]:
lexicon_size(lexicon_stemmed)

549

## 5 The first round of weights: Based on tweets on Feb 26, 2021

In [2]:
weights_1 = pd.read_csv('weights_1.csv')
weights_2 = pd.read_csv('weights_2.csv')
weights = pd.concat([weights_1,weights_2], ignore_index=True)

In [3]:
weights

Unnamed: 0.1,Unnamed: 0,n,mis_n
0,assault,1141,
1,asshole,2243,14.0
2,attack,5804,4.0
3,balls,7707,7.0
4,bang,2386,2.0
...,...,...,...
113,shake,1673,12.0
114,shoot,3212,25.0
115,slap,2157,13.0
116,smash,1675,10.0


In [4]:
is_na_list = []
for i in weights['mis_n']:
    if math.isnan(i):
        is_na_list.append(1)
    else:
        is_na_list.append(0)
weights['is_na'] = is_na_list
dropped_words_r1 = weights['Unnamed: 0'][weights['is_na'] == 1]

weights_kept = weights[weights['is_na'] == 0]
weights_kept['weight'] = weights_kept['mis_n'] / weights_kept['n']
weights_kept.sort_values(by='weight',ascending=False)[:15]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weights_kept['weight'] = weights_kept['mis_n'] / weights_kept['n']


Unnamed: 0.1,Unnamed: 0,n,mis_n,is_na,weight
9,bitch,16028,446.0,0,0.027826
10,bitches,15393,414.0,0,0.026895
47,pussies,2892,54.0,0,0.018672
48,pussy,3061,53.0,0,0.017315
49,pussys,2791,46.0,0,0.016482
86,fucker,1016,14.0,0,0.01378
34,hoe,2602,33.0,0,0.012683
35,hoes,2655,29.0,0,0.010923
73,boob,1308,14.0,0,0.010703
63,bang,2386,25.0,0,0.010478


In [5]:
len(weights_kept)

105

### 5.1 Also delete words with a weight of less than 0.02

In [17]:
a = weights_kept['Unnamed: 0'][weights_kept['weight'] < 0.02]
dropped_words_r1 = list(dropped_words_r1) + list(a)
len(dropped_words_r1)

201

In [18]:
lexicon_1 = delete_elements(lexicon_stemmed,dropped_words_r1)
lexicon_size(lexicon_1)

505

In [19]:
included_words_r1 = ['bitch','pussy','fucker','hoe','boob','bang','ram']

In [20]:
lexicon_1_for_scrapping = delete_elements(lexicon_1,included_words_r1)

In [21]:
with open('lexicon_1.json', 'w') as outfile:
    json.dump(lexicon_1, outfile)
with open('lexicon_1_for_scrapping.json', 'w') as outfile:
    json.dump(lexicon_1_for_scrapping, outfile)

## 6. weights_3 and Final lexicon and weights!

In [4]:
weights_1 = pd.read_csv('weights_1.csv')
weights_2 = pd.read_csv('weights_2.csv')
weights_12 = pd.concat([weights_1,weights_2], ignore_index=True)

In [5]:
is_na_list = []
for i in weights_12['mis_n']:
    if math.isnan(i):
        is_na_list.append(1)
    else:
        is_na_list.append(0)
weights_12['is_na'] = is_na_list

weights_kept_12 = weights_12[weights_12['is_na'] == 0]
weights_kept_12['weight'] = weights_kept_12['mis_n'] / weights_kept_12['n']
weights_kept_12.sort_values(by='weight',ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weights_kept_12['weight'] = weights_kept_12['mis_n'] / weights_kept_12['n']


Unnamed: 0.1,Unnamed: 0,n,mis_n,is_na,weight
9,bitch,16028,446.0,0,0.027826
10,bitches,15393,414.0,0,0.026895
47,pussies,2892,54.0,0,0.018672
48,pussy,3061,53.0,0,0.017315
49,pussys,2791,46.0,0,0.016482
...,...,...,...,...,...
26,force,4940,2.0,0,0.000405
18,crush,2672,1.0,0,0.000374
32,hit,18822,7.0,0,0.000372
44,murder,3051,1.0,0,0.000328


In [25]:
weights_kept_12_1 = weights_kept_12.sort_values(by='weight',ascending=False)[:10]

In [26]:
weights_kept_12_1 = weights_kept_12_1.drop([10,47,49,35])

In [27]:
weights_kept_12_1 = weights_kept_12_1.set_index('Unnamed: 0')

In [28]:
weights_kept_12_2 = weights_kept_12_1['weight']

In [29]:
normalized_weights_12=(weights_kept_12_2-weights_kept_12_2.min())/(weights_kept_12_2.max()-weights_kept_12_2.min())

In [30]:
normalized_weights_12

Unnamed: 0
bitch     1.000000
pussy     0.394086
fucker    0.190318
hoe       0.127087
boob      0.013003
bang      0.000000
Name: weight, dtype: float64

In [33]:
weights_3 = pd.read_csv('weights_3.csv')

In [34]:
is_na_list = []
for i in weights_3['mis_n']:
    if math.isnan(i):
        is_na_list.append(1)
    else:
        is_na_list.append(0)
weights_3['is_na'] = is_na_list

weights_kept_3 = weights_3[weights_3['is_na'] == 0]
weights_kept_3['weight'] = weights_kept_3['mis_n'] / weights_kept_3['n']
weights_kept_3.sort_values(by='weight',ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weights_kept_3['weight'] = weights_kept_3['mis_n'] / weights_kept_3['n']


Unnamed: 0.1,Unnamed: 0,n,mis_n,is_na,weight
258,skank fuck,4,1.0,0,0.25
91,cunt rag,5,1.0,0,0.2
36,bitch ass,5677,160.0,0,0.028184
89,cumslut,308,3.0,0,0.00974
210,mothafucka,144,1.0,0,0.006944
10,assfuck,293,2.0,0,0.006826
266,slut,5381,32.0,0,0.005947
317,whore,8075,32.0,0,0.003963
115,dumb fuck,1617,5.0,0,0.003092
90,cunt,8140,20.0,0,0.002457


In [39]:
weights_kept_3_1 = weights_kept_3.sort_values(by='weight',ascending=False)[:12]
weights_kept_3_1 = weights_kept_3_1.set_index('Unnamed: 0')
weights_kept_3_2 = weights_kept_3_1['weight']

# Change the weights of the highest two to be 0.03
weights_kept_3_2 = weights_kept_3_2.replace({0.25: 0.03, 0.2: 0.03})


In [40]:
normalized_weights_3=(weights_kept_3_2-weights_kept_3_2.min())/(weights_kept_3_2.max()-weights_kept_3_2.min())
normalized_weights_3

Unnamed: 0
skank fuck    1.000000
cunt rag      1.000000
bitch ass     0.935779
cumslut       0.283572
mothafucka    0.184706
assfuck       0.180516
slut          0.149429
whore         0.079271
dumb fuck     0.048481
cunt          0.026021
cock suck     0.009719
whitey        0.000000
Name: weight, dtype: float64

In [7]:
final_lexicon = ['bitch','pussy','fucker','hoe','boob','skank fuck','cunt rag','bitch ass','cumslut','mothafucka','assfuck','slut','whore','dumb fuck','cunt','cock suck']

In [43]:
final_weights = pd.concat([normalized_weights_12,normalized_weights_3])
final_weights.to_csv('final_weights.csv')

## 7 Standardize seperately and construct the final weights

In [8]:
weights_kept_12 = weights_kept_12.set_index('Unnamed: 0')

In [1]:
weights_kept_12_1 = weights_kept_12['weight']
weights_kept_12_1

NameError: name 'weights_kept_12' is not defined

In [18]:
for k,v in weights_kept_12_1.items():
    if k not in final_lexicon:
        del weights_kept_12_1[k]
weights_kept_12_1

RuntimeError: dictionary changed size during iteration

In [13]:
normalized_weights_12=(weights_kept_12_1-weights_kept_12_1.mean())/weights_kept_12_1.std()
normalized_weights_12

Unnamed: 0
asshole    0.044353
attack    -1.068058
balls     -1.024166
bang      -1.038198
beat      -0.981109
             ...   
shake      0.230896
shoot      0.353220
slap       0.001329
smash     -0.010039
strike     0.277909
Name: weight, Length: 105, dtype: float64