# Internship Task 

Aim: To find the degree of profanity in tweets by several users, measured as the fraction of a tweet's word tokens that appear in a list of racial / ethnic slurs.

In [42]:
import pandas as pd
import os

In [43]:
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize relies on the Punkt sentence-tokenizer data. Older NLTK releases
# load it from the 'punkt' package, while newer releases (3.9 and later) load it
# from 'punkt_tab', so both are fetched to keep the notebook runnable on a fresh
# kernel with either version.
nltk.download('punkt')      # tokenizer data used by older NLTK releases
nltk.download('punkt_tab')  # tokenizer data used by NLTK 3.9+

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [44]:
# Relative locations of the two input CSV files inside the data/ folder
tweets_file, slurs_file = (
  os.path.join('data', csv_name)
  for csv_name in ('comments_to_score.csv', 'profanity_en.csv')
)

In [45]:
# Read both input CSVs into DataFrames:
#   slurs  - the profanity lexicon
#   tweets - the comments that will be scored
slurs = pd.read_csv(slurs_file)
tweets = pd.read_csv(tweets_file)

In [46]:
# Keep only the lexicon rows whose category_1 is 'racial / ethnic slurs'
racial_slurs = slurs.loc[slurs['category_1'].eq('racial / ethnic slurs')]
racial_slurs

Unnamed: 0,text,canonical_form_1,canonical_form_2,canonical_form_3,category_1,category_2,category_3,severity_rating,severity_description
19,abbie,abraham,,,racial / ethnic slurs,,,1.2,Mild
20,abeed,abeed,,,racial / ethnic slurs,,,1.8,Strong
21,aboe,abo,,,racial / ethnic slurs,,,1.0,Mild
140,bean queen,bean queen,,,racial / ethnic slurs,,,2.2,Strong
141,beaner,beaner,,,racial / ethnic slurs,,,2.6,Severe
...,...,...,...,...,...,...,...,...,...
1589,wigger,nigger,,,racial / ethnic slurs,,,2.6,Severe
1594,wop,wop,,,racial / ethnic slurs,,,2.4,Strong
1595,wophead,wop,,,racial / ethnic slurs,,,2.2,Strong
1596,zip in the wire,zipperhead,,,racial / ethnic slurs,,,1.4,Mild


In [47]:
# Pull the 'text' column of the filtered lexicon out into a plain Python list
racial_words = racial_slurs['text'].tolist()
racial_words[:10]  # preview the first ten entries

['abbie',
 'abeed',
 'aboe',
 'bean queen',
 'beaner',
 'beaners',
 'blacky',
 'buttermilk',
 'c00n',
 'c00nies']

In [48]:
def get_degree(sentence):
  """
  Return the degree of profanity of a sentence.

  The degree is the fraction of the sentence's tokens (as produced by
  ``word_tokenize``) that appear in the module-level ``racial_words`` list.
  Tokens and list entries are compared case-insensitively.

  Parameters
  ----------
  sentence : str
      Text to score. Non-string values (for example the float NaN that pandas
      uses for a missing cell) are scored as 0.0.

  Returns
  -------
  float
      A value between 0.0 and 1.0. It is 0.0 when the sentence yields no
      tokens (empty or whitespace-only text).

  Notes
  -----
  Matching is done one token at a time, so entries in ``racial_words`` that
  contain spaces can never match a single token.
  """
  # Missing values arrive as non-strings when this function is applied to a
  # DataFrame column; there is nothing to tokenize, so score them as clean.
  if not isinstance(sentence, str):
    return 0.0

  words = word_tokenize(sentence)
  # No tokens means no profanity; this also avoids dividing by zero below.
  if not words:
    return 0.0

  # Lower-case both sides so capitalised tokens still match the lexicon, and
  # use a set so each membership test is constant-time instead of a list scan.
  slur_set = {slur.lower() for slur in racial_words if isinstance(slur, str)}
  slurs_count = sum(1 for word in words if word.lower() in slur_set)
  return slurs_count / len(words)

In [49]:
# Score every tweet: one degree-of-profanity value per entry of the 'text' column
tweets['degree_of_profanity'] = tweets['text'].map(get_degree)

In [51]:
# Show the 15 rows with the highest degree of profanity
(
  tweets
  .sort_values(by='degree_of_profanity', ascending=False)
  .head(15)
)

Unnamed: 0,comment_id,text,degree_of_profanity
5556,326493334,nigger paedophile nigger paedophile,0.5
6892,439186891,fuck cocksucker titty marijuana nigger whore,0.166667
6378,393112091,your a fag \n\nfuck you nigger,0.166667
6320,386749389,You are such a nigger!,0.166667
4255,235671663,Hitler was a fucking kike.,0.166667
6773,430300744,I am a fucking retarded nigger,0.166667
3694,206412713,i dont give a fuck nigga,0.166667
7088,459378692,eat shit and die bitch ass nigga\n\neat shit a...,0.142857
7089,459379650,you can't keep me down nigger \nyou can't keep...,0.142555
2857,161658787,GWAH!! (fuck you nigger),0.125
