# Import libraries

In [77]:
import pandas as pd
import numpy as np
import preprocessor as p
import emojis
import re

# Loading data

In [57]:
def load_data(twitter_data):
    
    #load data 
    data = pd.read_csv(twitter_data)
    
    #print the dataset shape
    print('The number of tweets:\n{}\n\n'.format(data.shape[0]))
    
    #the feature of the data set are
    print('The features or columns in our dataset are {}'.format(list(data.columns)))
    
    return data

# Data analysis

In [66]:
def analysis(data):    
    
    #Print the number of tweets with hate language
    print('The number of hate tweets:\n{}\n\n'.format(data[data['label'] == 1].shape[0]))
    
    #the percentage of tweets with hate language
    print('The percentage of hate tweets:\n{}\n\n'.format((data[data['label']==1].shape[0]/data.shape[0])*100))
    
    #print the number of tweets without hate language
    print('The number of tweets not classified as hate tweets:\n{}\n\n'.format(data[data['label']==0].shape[0]))
    
    #the percentage of tweets without language
    print('The percentage of tweets without hate language:\n{}\n\n'.format((data[data['label']==0].shape[0]/data.shape[0])*100))
    

# Data cleaning

In [67]:
#function to remove alphanumeric and lower
def text_preprocessing(text,index,column,data):
    
    if type(text) is not int:
        strng = ""
        for words in text.split():
            
            #removing special characters 
            word = ("".join(i for i in words if i.isalnum()))
            
            #lowering the words
            word = word.lower()
            
            strng += word + " "
            
        data[column][index] = strng

In [68]:
def convert_emojis(text):
    text = emojis.decode(text)
    text = text.replace(":"," ")
    text  = ' '.join(text.split())
    
    return text

In [69]:
SMILEYS = {":‑(":"sad", ":‑(":"sad", ":(":"sad",":‑c":"sad",":c":"sad",":‑<":"sad",":<":"sad",":‑[":"sad",":[":"sad",":-||":"sad",">:[":"sad",":{":"sad",":@":"sad",":(	":"sad",";( ":"sad",":‑)":"happy", ":‑D" : "laughing", "8D":"laughing" , "x‑D": "laughing", "xD": "laughing","X‑D": "laughing",
"XD": "laughing","=D": "laughing", "=3": "laughing", "B^D": "laughing" , "c:" : "laughing", ":-]":"happy",":]": "happy",":-3": "happy", ":3":"happy", ":->": "happy",":>": "happy", "8-)": "happy","8)": "happy",":-}": "happy",":}": "happy",":o)": "happy",":c": "happy" , ":^)": "happy","=]": "happy", "=)": "happy",
":‑###..":"being sick",":###..":"being sick","',:-|":"disbelief","',:-l":"disbelief",">:‑)":"Evil","}:‑)":"Evil","}:)":"Evil","3:‑)":"Evil","3:)":"Evil",">;)":"Evil",";3":"Evil","D‑':":"horror"}

def convert_emoticons(text):
    words = text.split()
    reformed = [SMILEYS[word] if word in SMILEYS else word for word in words]
    text = " ".join(reformed)
    return text 

In [73]:
def data_cleaning(data):
    
    #removing null labels
    data = data[~data['label'].isnull()]
    print('The number of data point remaining after removing all null labels:\n{}\n\n'.format(data.shape[0]))
    
    #removing duplicate tweets
    data = data[~data['tweet'].duplicated()]
    print(' the number of data point remaning after removing all duplicate tweets:\n{}\n\n'.format(data.shape[0]))
    
    #cleaning the data according to our needa
    for i in data.index:    
        #removing the urls and 
        p.set_options(p.OPT.URL)
        tweet = p.clean(data['tweet'].loc[i])

        #removing punctuations
        tweet = ' '.join(re.sub("[\.\,\!\?\:\;\-\=]", " ", tweet).split())

        #conversion of emojis
        tweet = convert_emojis(tweet)

        #conversion of emoticons
        tweet = convert_emoticons(tweet)

        #removing the remaining emoticons and numbers data in text
        p.set_options(p.OPT.EMOJI,p.OPT.NUMBER)
        tweet = p.clean(tweet)

        #removing the alphanumeric variables in text and lowering texts 
        text_preprocessing(tweet,i,'tweet',data)
        
    #printing the processed tweet
    print('The tweet text data is processed{}'.format(data['tweet']))   

In [74]:
def main(Twitter_data):
    
        print('LOADING DATA...{}\n\n '.format(Twitter_data))
        data = load_data(Twitter_data)

        print('DATA ANALYSIS...\n\n')
        analysis(data)
        
        print('CLEANING DATA...\n\n')
        data = data_cleaning(data)
        
        return data

In [75]:
data = main('train.csv')

LOADING DATA...train.csv

 
The number of tweets:
31962


The features or columns in our dataset are ['id', 'label', 'tweet']
DATA ANALYSIS...


The number of hate tweets:
2242


The percentage of hate tweets:
7.014579813528565


The number of tweets not classified as hate tweets:
29720


The percentage of tweets without hate language:
92.98542018647143


CLEANING DATA...


The number of data point remaining after removing all null labels:
31962


 the number of data point remaning after removing all duplicate tweets:
29530




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column][index] = strng


The tweet text data is processed0        user when a father is dysfunctional and is so ...
1        user user thanks for lyft credit i cant use ca...
2                                     bihday your majesty 
3           model i love u take with u all the time in ur 
4                       factsguide society now motivation 
                               ...                        
31956    off fishing tomorrow user carnt wait first tim...
31957                             ate user isz that youuu 
31958    to see nina turner on the airwaves trying to w...
31959    listening to sad songs on a monday morning otw...
31961                       thank you user for you follow 
Name: tweet, Length: 29530, dtype: object
