# Inspecting the text data

In [1]:
import pandas as pd

# Load the preprocessed non-abusive messages.
# skiprows=1 drops the first line of the file, so header=0 takes the column
# names from the second line; the first column becomes the row index.
# NOTE(review): the label column ends up being named 'NOT', which looks like a
# label value rather than a column name - confirm the file layout.
# keep_default_na=False: the text column is free-form, so a message that is
# literally "NA", "null", "nan", ... (or empty) has to stay a string. With the
# default NA markers it would be parsed as NaN and then turned into the
# literal "nan" by the astype(str) calls used for the length statistics.
df = pd.read_csv(
    'preprocessed_non_abusive.csv',
    header=0,
    skiprows=1,
    index_col=0,
    keep_default_na=False,
)
df.head()

  mask |= (ar1 == a)


Unnamed: 0_level_0,text,NOT
0,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Isn't this against the first amendment ? Doesn...,NOT
2,oil .,NOT
3,Yeah nice ... Except for the fact that everyon...,NOT
4,"Bro , I figuratively just watched that episode .",NOT
5,this is why u can't vote,NOT


In [2]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(5999880, 2)

In [3]:
# Number of whitespace-separated tokens in each message
df['token_count'] = (
    df['text']
    .astype(str)   # make sure every entry is a string before splitting
    .str.split()   # split on runs of whitespace
    .str.len()     # length of each resulting token list
)
df.head()

Unnamed: 0_level_0,text,NOT,token_count
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Isn't this against the first amendment ? Doesn...,NOT,20
2,oil .,NOT,2
3,Yeah nice ... Except for the fact that everyon...,NOT,48
4,"Bro , I figuratively just watched that episode .",NOT,9
5,this is why u can't vote,NOT,6


In [4]:
# Length of each message in characters, spaces included
df['char_count'] = df['text'].astype(str).str.len()
df.head()

Unnamed: 0_level_0,text,NOT,token_count,char_count
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Isn't this against the first amendment ? Doesn...,NOT,20,112
2,oil .,NOT,2,5
3,Yeah nice ... Except for the fact that everyon...,NOT,48,241
4,"Bro , I figuratively just watched that episode .",NOT,9,48
5,this is why u can't vote,NOT,6,24


In [5]:
# Length of each message in characters once every space character is removed
df['char_count_no_spacing'] = (
    df['text']
    .astype(str)
    .str.replace(" ", "", regex=False)  # literal space, not a regex pattern
    .str.len()
)
df.head()

Unnamed: 0_level_0,text,NOT,token_count,char_count,char_count_no_spacing
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Isn't this against the first amendment ? Doesn...,NOT,20,112,93
2,oil .,NOT,2,5,4
3,Yeah nice ... Except for the fact that everyon...,NOT,48,241,194
4,"Bro , I figuratively just watched that episode .",NOT,9,48,40
5,this is why u can't vote,NOT,6,24,19


In [15]:
# Render floats with two decimals in pandas output from this point on
pd.set_option('display.float_format', '{:.2f}'.format)

# Summary statistics for the three per-message length columns
df.loc[:, ['token_count', 'char_count', 'char_count_no_spacing']].describe()

Unnamed: 0,token_count,char_count,char_count_no_spacing
count,5999880.0,5999880.0,5999880.0
mean,33.28,162.09,129.41
std,60.86,301.22,239.71
min,0.0,1.0,0.0
25%,8.0,36.0,29.0
50%,16.0,78.0,62.0
75%,36.0,173.0,138.0
max,11400.0,55099.0,43700.0


In [7]:
# Total number of tokens across all messages
tokens_sum = df['token_count'].sum()
print(tokens_sum)

199664336


In [17]:
# Total number of characters across all messages: with spaces, then without
char_sum, char_sum_no_spacing = (
    df['char_count'].sum(),
    df['char_count_no_spacing'].sum(),
)
print(char_sum, char_sum_no_spacing)

972545245 776434008


In [9]:
# Average number of tokens per message
df['token_count'].mean()

33.27805489443122

In [10]:
# Average message length in characters, spaces included
df['char_count'].mean()

162.09411604898764

# Collect Data from subreddits

In [11]:
# Read the subreddit table, using the first column of the file as the row index
df_subreddits = pd.read_csv(
    'subreddits.csv',
    index_col=0,
)
df_subreddits.head()

Unnamed: 0,subreddit
0,AskReddit
1,gaming
2,politics
3,WTF
4,pics


In [12]:
# The 20 subreddits that occur most often, with their row counts
df_subreddits['subreddit'].value_counts().head(20)

AskReddit          276204
funny               87432
pics                65515
CFB                 52476
WTF                 49746
leagueoflegends     48854
gaming              47090
nba                 45911
AdviceAnimals       37435
SquaredCircle       35430
hockey              35147
videos              33239
politics            29668
nfl                 29599
todayilearned       26942
worldnews           22468
trees               22427
teenagers           18756
movies              16888
atheism             16882
Name: subreddit, dtype: int64

In [13]:
# Distinct subreddit names, followed by how many there are
unique_subreddits = df_subreddits['subreddit'].unique()
unique_subreddits.size

28523

In [14]:
# Summary statistics of the number of rows per subreddit
(
    df_subreddits['subreddit']
    .value_counts()
    .describe()
)

count    28523.00
mean       101.39
std       2015.60
min          1.00
25%          1.00
50%          3.00
75%         11.00
max     276204.00
Name: subreddit, dtype: float64