# Inspecting the text data

In [2]:
import pandas as pd

# Path to the preprocessed Reddit comments CSV, relative to this notebook.
reddit_csv_path = '../data/reddit/preprocessed_reddit_abusive_large.csv'

# The first line (index 0) supplies the column names; the line at 0-based
# index 1 is skipped on load (reason not documented here -- TODO confirm).
df = pd.read_csv(
    reddit_csv_path,
    sep=',',
    header=0,
    skiprows=[1],
)
df.head()

Unnamed: 0,subreddit,text,labels
0,holocaust,LOL,IMP
1,holocaust,"And Soccer "" A Lot of People Say Jews are Curs...",IMP
2,holocaust,Every racial group in America is free to creat...,EXP
3,holocaust,I'm about to cry .,IMP
4,holocaust,Maybe you would cry if the Jewish organization...,EXP


In [3]:
# Column labels of the loaded dataframe
df.columns

Index(['subreddit', 'text', 'labels'], dtype='object')

In [4]:
# Dataframe shape as a (n_rows, n_columns) tuple
df.shape

(2803506, 3)

In [21]:
# How many messages carry each label
df['labels'].value_counts()

IMP    2152541
EXP     650965
Name: labels, dtype: int64

In [5]:
# Number of whitespace-separated tokens per message.
# astype(str) converts non-string values to their string form before splitting,
# so every row yields a token list.
# .str.len() measures each list with the vectorised string accessor instead of
# a per-row Python apply(len); the resulting counts are the same.
df['token_count'] = df['text'].astype(str).str.split().str.len()
df.head()

Unnamed: 0,subreddit,text,labels,token_count
0,holocaust,LOL,IMP,1
1,holocaust,"And Soccer "" A Lot of People Say Jews are Curs...",IMP,63
2,holocaust,Every racial group in America is free to creat...,EXP,88
3,holocaust,I'm about to cry .,IMP,5
4,holocaust,Maybe you would cry if the Jewish organization...,EXP,12


In [6]:
# Number of characters per message, spaces included.
# astype(str) converts non-string values to their string form first;
# .str.len() replaces the per-row Python apply(len) with the vectorised accessor.
df['char_count'] = df['text'].astype(str).str.len()
df.head()

Unnamed: 0,subreddit,text,labels,token_count,char_count
0,holocaust,LOL,IMP,1,3
1,holocaust,"And Soccer "" A Lot of People Say Jews are Curs...",IMP,63,300
2,holocaust,Every racial group in America is free to creat...,EXP,88,478
3,holocaust,I'm about to cry .,IMP,5,18
4,holocaust,Maybe you would cry if the Jewish organization...,EXP,12,64


In [7]:
# Number of characters per message with space characters removed.
# Only the literal ' ' is stripped (tabs/newlines, if any, still count);
# regex=False states that explicitly, because the default for `regex`
# differs between pandas versions.
# .str.len() replaces the per-row Python apply(len) with the vectorised accessor.
df['char_count_no_spacing'] = (
    df['text']
    .astype(str)
    .str.replace(" ", "", regex=False)
    .str.len()
)
df.head()

Unnamed: 0,subreddit,text,labels,token_count,char_count,char_count_no_spacing
0,holocaust,LOL,IMP,1,3,3
1,holocaust,"And Soccer "" A Lot of People Say Jews are Curs...",IMP,63,300,238
2,holocaust,Every racial group in America is free to creat...,EXP,88,478,391
3,holocaust,I'm about to cry .,IMP,5,18,14
4,holocaust,Maybe you would cry if the Jewish organization...,EXP,12,64,53


In [8]:
# Render floats with two decimals from here on (this is a global display option)
pd.set_option('display.float_format', '{:.2f}'.format)

# Summary statistics of the three per-message length columns
length_columns = ['token_count', 'char_count', 'char_count_no_spacing']
df[length_columns].describe()

Unnamed: 0,token_count,char_count,char_count_no_spacing
count,2803506.0,2803506.0,2803506.0
mean,30.07,149.41,120.34
std,50.57,261.05,212.14
min,1.0,1.0,1.0
25%,8.0,39.0,31.0
50%,17.0,81.0,65.0
75%,34.0,168.0,135.0
max,9357.0,85727.0,80531.0


In [9]:
# Total number of tokens across all messages
tokens_sum = df['token_count'].sum()
print(tokens_sum)

84292510


In [10]:
# Corpus-wide character totals: first with spaces included, then with spaces removed
char_sum, char_sum_no_spacing = (
    df[column].sum() for column in ('char_count', 'char_count_no_spacing')
)
print(char_sum, char_sum_no_spacing)

418865005 337375862


In [11]:
# Mean number of tokens per message (the token_count column)
df['token_count'].mean()

30.06681990336386

In [12]:
# Mean number of characters per message (spaces included)
df['char_count'].mean()

149.40756502750486

# Collect Data from subreddits

In [15]:
# Select the subreddit column as its own Series (nothing new is read from disk)
df_subreddits = df['subreddit']
df_subreddits.head()

0    holocaust
1    holocaust
2    holocaust
3    holocaust
4    holocaust
Name: subreddit, dtype: object

In [22]:
# Message counts per subreddit, most frequent first.
# NOTE: this lists every subreddit, not only the top 20.
df_subreddits.value_counts()

fatpeoplehate           1465531
uncensorednews           577465
milliondollarextreme     547998
sjwhate                  165435
europeannationalism       20132
holocaust                 10242
nazi                       7010
pol                        5861
misogyny                   1035
hitler                      958
niggerspics                 449
niglets                     357
niggervideos                311
polacks                     209
niggas                      175
niggersstories               75
chimpmusic                   35
funnyniggers                 29
niggerhistorymonth           28
gibsmedat                    24
teenapers                    23
chicongo                     22
didntdonuffins               22
blackpeoplehate              16
muhdick                      15
whitesarecriminals           15
far_right                    11
niggerrebooted                6
klukluxklan                   6
apewrangling                  5
beatingfaggots                3
killthej

In [18]:
# Distinct subreddit names, followed by how many there are
unique_subreddits = df_subreddits.unique()
unique_subreddits.size

34

In [20]:
# Summary statistics over the per-subreddit message counts
subreddit_sizes = df_subreddits.value_counts()
subreddit_sizes.describe()

count        34.00
mean      82456.06
std      279520.56
min           1.00
25%          15.00
50%          32.00
75%        1015.75
max     1465531.00
Name: subreddit, dtype: float64