### Creating a label generator class

Purpose of this Notebook :
*  Reuse the labelling technique for every dataset
*  Easy to test for any extensions/changes/improvements

**Labelling Technique Credits : Harshita**

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter
# ensure cleaning.py is in the same directory
# from cleaning import clean_text
import nltk
nltk.download()
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
STOPWORDS =  stopwords.words('english') + ['twitter','com']
from nltk.util import ngrams  
from heapq import nlargest
import collections
import re

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------


### Labelse generator class. One object of it can be used per dataset to generate labels for it

In [0]:
class label_generator:
    def __init__(self,model,dataset,data_col,no_top_words):
        self.data = dataset
        self.col = data_col
        self.model = model
        self.no_top_words = no_top_words

    def preprocess(self):
        # Drop NaN values and reset index
        self.data = self.data.dropna(subset=['text']).reset_index(drop=True)
        # Clean text
        self.clean_text()
        # Generate features
        self.all_text = ' '.join(str(word) for word in self.data[self.col].values) 
        self.tokens = word_tokenize(self.all_text)
        self.vectorizer = TfidfVectorizer(max_features = 5000, ngram_range=(1,2))
        self.tf = self.vectorizer.fit_transform(self.data[self.col]).toarray()
        self.tf_feature_names = self.vectorizer.get_feature_names()
    
  
    def remove_URL(self,text):
        self.url_pattern = re.compile(r'https?://\S+|www\.\S+|pic\.twitter\S+')
        return self.url_pattern.sub(r'', text)
    
    def lemmatize_text(self,text):
        return " ".join([wn.morphy(word) if wn.morphy(word) != None 
                         else word 
                         for word in text.split()])

    def get_topics(self):
        self.preprocess()
        self.topic_dict = {}
        self.model.fit(self.tf)
        for topic_idx, topic in enumerate(self.model.components_):
            self.topic_dict["Topic %d words" % (topic_idx)] = ['{}'.format(self.tf_feature_names[i]) 
                                                         for i in topic.argsort()[:-self.no_top_words - 1:-1]]
        return pd.DataFrame(self.topic_dict)

    
    def clean_text(self):
        # print("Cleaning text and adding column 'processed_text'")
        self.data['processed_text'] = self.data[self.col]
        # Converting to lower case
        self.data['processed_text'] = self.data['processed_text'].str.lower()
        # Removing /n characters
        self.data['processed_text'] = self.data['processed_text'].apply(lambda x: x.replace('\n', ' '))
        # Removing urls
        self.data['processed_text'] = self.data['processed_text'].apply(lambda text: self.remove_URL(text))
        # Removing the stopwords
        self.data['processed_text'] = self.data['processed_text'].apply(lambda text: " ".join([word for word in str(text).split() 
                                                                                               if word not in STOPWORDS]))
        # Lemmatization of text
        self.data['processed_text'] = self.data['processed_text'].apply(lambda text: self.lemmatize_text(text))

        self.col = 'processed_text'
        self.data.drop_duplicates(subset=self.col,inplace=True)
        self.data.reset_index(drop=True,inplace=True)

                                                                        
    def get_binary_labeled_dataset(self):
        
        self.topics = self.get_topics()
        self.dv_topics = self.topics.columns[0]
        self.other_topics = self.topics.columns[1]
        self.data['label'] = ['']*len(self.data)
        
        # DV Relationships 
        t1list = ['husband','wife','daughter','father','relative','mother','sister','uncle','grandfather','neighbour', \
                  'parent','child','cousin','inlaw','in-law','boyfriend','marital','domestic','partner','family','maid', \
                  'housemaid','gay','ex','liquor']
        
        # DV actions and keywords
        t2list = ['abuse','violence','domestic','sexual','harass','assault','bitch','torture','rape','beat','dowry','porn', \
                  'acidattack','lockdown','molest','metoo','slap','fuck','mental','physical','threat', 'push','pull','murder', \
                  'body','opress','force','bruise','slut','scar', 'threaten','burn','toxic','cheat','verbal','blackmail','sos', \
                  'helpline','help']
        
        # Evident harassment (must be present)
        t3list = ['harass','sexual','physical','rape','shame','abuse','case','help','helpline','sos','save','police']
        
        # Awareness and opinions
        t4list = ['prevent','law','aware','survey','webinar','initiative','pandemic','commission','guide','animal','govt', \
                  'article','responsibility','delete','work','tiktok','bantiktok','content','tik','tok','action','equality', \
                  'politics','employee','company','right','movie','feminism','film','online','party','cyclone']

        for i in range(len(self.data)):
            if any(word in self.data[self.col][i] for word in t4list) \
            and self.data.label[i] == '':
                self.data.label[i] = 'NO_DV'
            
            elif any(word in self.data[self.col][i] for word in t1list) \
            and any(word in self.data[self.col][i] for word in t2list) \
            and any(word in self.data[self.col][i] for word in t3list) \
            and self.data.label[i] == '':
                self.data.label[i] = 'DV'

        return self.data

In [0]:
def get_labelled_df(PATH,col,number_of_topics,no_of_top_words):
  data = pd.read_csv(PATH)
  model = LatentDirichletAllocation(n_components = number_of_topics, random_state = 42)
  labelizer = label_generator(model=model,dataset=data,data_col=col,no_top_words=no_of_top_words)
  
  return labelizer.get_binary_labeled_dataset(),labelizer.get_topics(),labelizer.topics

## To create labels and get the topics used to create labels, call the function 

    get_labelled_df(PATH_TO_FILE,NO_OF_TOPICS,NO_OF_TOP_WORDS)

*  PATH_TO_FILE : To retrieve the dataset
*  NO_OF_TOPICS : Number of classes we need (In this case 2 i.e, DV and NO_DV)
*  NO_OF_TOP_WORDS : Number of words considered to create each label


In [0]:
df,_,topics = get_labelled_df("/content/drive/My Drive/Omdena/twitter/more-tweets-extracted.csv",'text',2,20)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [0]:
df

Unnamed: 0,user_id,screen_name,tweet_id,tweet_url,timestamp,text,hashtags,has_media,img_urls,video_url,user_location,processed_text,label
0,1139435031204548608,zkwnsaari,1262868325803024389,/zkwnsaari/status/1262868325803024389,2020-05-19 22:10:46,"Cornering men with the phrase ""sexual harassme...",[],0.0,[],,"Perak, Malaysia","cornering men with the phrase ""sexual harassme...",
1,1262662724628238338,anuglywoman1,1262856464651243525,/anuglywoman1/status/1262856464651243525,2020-05-19 21:23:38,Sexual harassment..is to make someone fucked u...,[],0.0,[],,Pain,sexual harassment..is to make someone fucked u...,
2,15199808,Vidyut,1262838343404044296,/Vidyut/status/1262838343404044296,2020-05-19 20:11:38,Fellow asking about abuse women face online/st...,[],0.0,[],,India,fellow asking about abuse women face online/st...,NO_DV
3,257394747,PramodChturvedi,1262824245517672449,/PramodChturvedi/status/1262824245517672449,2020-05-19 19:15:36,Case registered against employee of private co...,"['AndhraPradesh', 'Krishna']",0.0,[],,"Hyderabad, New Delhi, Ballia",case registered against employee of private co...,NO_DV
4,2247560024,FeminismInIndia,1262816952164196355,/FeminismInIndia/status/1262816952164196355,2020-05-19 18:46:38,Institutional Failures & The Increasing Relian...,[],0.0,[],,India,institutional failures & the increasing relian...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1690,857761954018058241,satpal13414,1212326306739703808,/satpal13414/status/1212326306739703808,2020-01-01 10:54:50,#स्वर्ण_युग\nGolden Time Is Coming\nThe other ...,['स'],1.0,['https://pbs.twimg.com/media/ENMMVmYU0AAim4b....,,"Rewari, India",#स्वर्ण_युग golden time is coming the other wo...,DV
1691,1156956207640244224,Mohit__solanki,1212256040089075712,/Mohit__solanki/status/1212256040089075712,2020-01-01 06:15:37,Golden Time Is Coming\nThe other woman and gir...,['स'],0.0,[],,"Firozpur, India",golden time is coming the other woman and girl...,DV
1692,838337330322751488,BRAJBHANDAS1234,1212234940441493505,/BRAJBHANDAS1234/status/1212234940441493505,2020-01-01 04:51:46,#स्वर्ण_युग#स्वर्ण_युग\nGolden Time Is Coming\...,"['स', 'स']",1.0,['https://pbs.twimg.com/media/ENK5O_DUwAEnmUY....,,"Gandhidham, India",#स्वर्ण_युग#स्वर्ण_युग golden time is coming t...,DV
1693,972168388490379264,NiteshP82110245,1212212166549331969,/NiteshP82110245/status/1212212166549331969,2020-01-01 03:21:16,#HeavenOnEarth_By_SaintRampalJi\n Time Is Comi...,['HeavenOnEarth_By_SaintRampalJi'],1.0,['https://pbs.twimg.com/media/ENKkdxCVAAAxcxF....,,"मध्य प्रदेश, भारत",#heavenonearth_by_saintrampalji time is comin...,DV


In [0]:
topics

Unnamed: 0,Topic 0 words,Topic 1 words
0,pic,the
1,bantiktokinindia,of
2,bantiktok,to
3,abuse,and
4,attack,sexual
5,acid attack,in
6,of,is
7,sexual,for
8,promoting,harassment
9,acid,on


In [0]:
df,_,topics = get_labelled_df("/content/drive/My Drive/Omdena/reddit_domestic_violence.csv",'title',2,20)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [0]:
df

Unnamed: 0.1,Unnamed: 0,title,score,id,url,comms_num,created,body,year,month,day,time,violence_type,age,processed_text,label
0,0,There is NO ROOM for abuse.Period. Digital Pai...,3114,gfoc94,https://i.redd.it/xfvcce15rhx41.jpg,329,1.588951e+09,,2020.0,5.0,8.0,15:12:12,abuse,,there is no room for abuse.period. digital pai...,
1,1,"Behind closed doors, the biggest viruses are m...",1128,fzzzji,https://i.redd.it/dc1v110v0fs41.jpg,213,1.586738e+09,,2020.0,4.0,13.0,00:40:34,abuse,,"behind closed doors, the biggest viruses are m...",DV
2,2,"Manipur girl racially attacked, abused by loca...",365,gici3u,https://newsd.in/manipur-girl-racially-attacke...,62,1.589323e+09,,2020.0,5.0,12.0,22:43:00,abuse,,"manipur girl racially attacked, abused by loca...",
3,3,Chennai man caught on camera hurling casteist ...,402,g30ho5,https://www.thenewsminute.com/article/chennai-...,114,1.587153e+09,,2020.0,4.0,17.0,19:49:59,abuse,,chennai man caught on camera hurling casteist ...,NO_DV
4,5,Parental Abuse: What to do?,31,ggfu6u,https://www.reddit.com/r/india/comments/ggfu6u...,49,1.589061e+09,I am writing this with such a heavy heart. But...,2020.0,5.0,9.0,21:56:50,abuse,20.0,parental abuse: what to do?,DV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,532,Do the parties signing a legal contract have c...,1,gbeobj,https://www.reddit.com/r/legaladvice/comments/...,0,1.588352e+09,I am a 16 yo developer and I have an idea abou...,2020.0,5.0,1.0,16:59:43,,16.0,do the parties signing a legal contract have c...,
438,533,Is infanticide heavily biased against girls?,8,fdhja3,https://www.reddit.com/r/MensRights/comments/f...,12,1.583376e+09,You may know how female infanticide is notorio...,2020.0,3.0,5.0,02:41:15,,,is infanticide heavily biased against girls?,
439,534,Domestic Abuse Case in GA of USA,1,g9xwxd,https://www.reddit.com/r/legaladvice/comments/...,0,1.588144e+09,Hello. I want to help my mother fight against...,2020.0,4.0,29.0,07:10:06,,15.0,domestic abuse case in ga of usa,DV
440,535,Notarization confusion,1,fslkkw,https://www.reddit.com/r/legaladvice/comments/...,6,1.585718e+09,So I'm an international student from India and...,2020.0,4.0,1.0,05:18:40,,10.0,notarization confusion,


In [0]:
df.label.value_counts()

         343
DV        66
NO_DV     33
Name: label, dtype: int64