<a href="https://colab.research.google.com/github/Shehab-7/NLP/blob/main/Tweets%20Classification/Tweets_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Dataset**
Labeled dataset of tweets collected from Twitter.

**Objective**
Classify each tweet as containing hate speech or not.
0 -> no hate speech
1 -> contains hate speech



### Import Libraries

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter
import random
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.model_selection import train_test_split
import re
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn import metrics


### Load Dataset

In [None]:
# Read the labelled tweets from the CSV file in the Colab working directory.
data = pd.read_csv(filepath_or_buffer='/content/dataset.csv')

In [None]:
# Show the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


### EDA

- check NaNs

In [None]:
# Per column: does it contain at least one missing value?
data.isna().any(axis=0)


id       False
label    False
tweet    False
dtype: bool

- check duplicates

In [None]:
# Count rows whose tweet text repeats an earlier row (first occurrence not counted).
data.duplicated(subset='tweet').sum()

2432

- show samples of data texts to find out required preprocessing steps

In [None]:
# First 50 raw tweets, used to decide which cleaning steps are needed.
data['tweet'].iloc[:50]

0      @user when a father is dysfunctional and is s...
1     @user @user thanks for #lyft credit i can't us...
2                                   bihday your majesty
3     #model   i love u take with u all the time in ...
4                factsguide: society now    #motivation
5     [2/2] huge fan fare and big talking before the...
6      @user camping tomorrow @user @user @user @use...
7     the next school year is the year for exams.ð...
8     we won!!! love the land!!! #allin #cavs #champ...
9      @user @user welcome here !  i'm   it's so #gr...
10     â #ireland consumer price index (mom) climb...
11    we are so selfish. #orlando #standwithorlando ...
12    i get to see my daddy today!!   #80days #getti...
13    @user #cnn calls #michigan middle school 'buil...
14    no comment!  in #australia   #opkillingbay #se...
15    ouch...junior is angryð#got7 #junior #yugyo...
16    i am thankful for having a paner. #thankful #p...
17                               retweet if you 

- check dataset balancing

In [None]:
# Number of tweets per class label.
data.label.value_counts()

0    29720
1     2242
Name: label, dtype: int64

- Cleaning and Preprocessing are:
    - 1- Drop emojis
    - 2- Drop @user
    - 3- Drop Hashtags
    - 4- Drop Duplicates

### Cleaning and Preprocessing

In [None]:
# Lookup table: abbreviation / slang token -> expanded text.
# Keys must be lower-case because word_abbrev() looks tokens up via
# word.lower(); a key containing upper-case letters can never match.
abbreviations = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk",
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart",
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "ig" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet",
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously",
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespoonful",
    "tbsp" : "tablespoonful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}

In [None]:
# Built once at definition time rather than on every call: remove_emoji is
# applied once per tweet, so the pattern object is reused across all rows.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing through misc symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    # NOTE: this range is very wide. It covers most code points from U+24C2
    # upwards, including CJK ideographs and Hangul, so non-emoji text in those
    # scripts is stripped as well.
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every supplementary-plane code point
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "]+",
    re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with every run of characters matched by ``_EMOJI_PATTERN`` deleted.

    Args:
        text: The string to clean.

    Returns:
        A new string; characters outside the listed ranges are left untouched,
        and no whitespace is inserted where a run was removed.
    """
    return _EMOJI_PATTERN.sub('', text)

def word_abbrev(word):
    """Expand a single token using the ``abbreviations`` table.

    The lookup is case-insensitive (the token is lower-cased first). Tokens
    with no entry are returned exactly as given.
    """
    return abbreviations.get(word.lower(), word)

def replace_abbrev(text):
    """Expand every whitespace-separated token of ``text`` via ``word_abbrev``.

    Tokens are re-joined with single spaces and each token, including the
    last one, is followed by a space, so a non-empty result always ends with
    a trailing space. Whitespace-only or empty input yields ``""``.
    """
    return "".join(word_abbrev(token) + " " for token in text.split())

def remove_mention(text):
    """Replace each ``@`` followed by one or more non-space characters with ``USER``."""
    return re.sub(r'@\S+', 'USER', text)

# A standalone number: optional sign, then either digits optionally grouped
# by '.', ',' or ':' (e.g. 3, 1,000.50, 12:30) or a bare fraction such as .5.
# The lookarounds require that the number is not glued to a word character,
# so tokens such as "ps4" or "80days" are left alone and sentence punctuation
# after a number ("2016.") is not consumed.
_NUMBER_PATTERN = re.compile(r'(?<![\w.])[-+]?(?:\d+(?:[.,:]\d+)*|\.\d+)(?!\w)')


def remove_number(text):
    """Replace every standalone number in ``text`` with the token ``NUMBER``.

    Args:
        text: The string to clean.

    Returns:
        A new string in which each number (of any length, including single
        digits) that is not part of a larger alphanumeric token has been
        replaced by ``NUMBER``.
    """
    return _NUMBER_PATTERN.sub('NUMBER', text)

def remove_punct(text):
    """Delete every character that is neither a word character nor whitespace.

    Letters, digits and underscores (the regex ``\\w`` class) are kept.
    """
    return re.sub(r'[^\w\s]', '', text)


def clean_text(text):
    """Run the full cleaning chain on one tweet and return the cleaned string.

    Steps, in order: strip emoji, expand abbreviations, replace @-mentions
    with ``USER``, replace numbers with ``NUMBER``, then strip punctuation.
    Punctuation is removed last so that the earlier steps still see the
    ``@`` and the digit separators they rely on.
    """
    text = remove_emoji(text)
    text = replace_abbrev(text)
    text = remove_mention(text)
    text = remove_number(text)
    text = remove_punct(text)
    return text

def remove_hashtag(string):
    """Delete every ``#`` character; the tag text itself is kept."""
    return string.replace("#", "")

def drop_users(string):
    """Delete each ``@`` followed by one or more non-space characters."""
    mention = re.compile(r"@\S+")
    return mention.sub("", string)

In [None]:
# Clean the tweet column in place, one pass per step, in this order.
# - word_abbrev works on a single token, so it is reached through
#   replace_abbrev rather than being applied to whole tweets.
# - remove_mention rewrites every @-mention to USER, so a separate
#   drop_users pass would find nothing left to remove.
# - each step runs exactly once.
for step in (remove_emoji, replace_abbrev, remove_mention, remove_number, remove_hashtag):
    data['tweet'] = data['tweet'].apply(step)

In [None]:
# Fixed seed so the displayed sample is the same on every run.
data['tweet'].sample(50, random_state=42)

18766    choosing my thoughts i choose to be blessed gr...
28195    USER i stopped after 3 but there are enough tw...
9558       !!! no-balls to 'own' his ... 'coward' !!! /// 
21771    USER you should probably read this account. th...
16060    ðpre-bihday celebration. ð junebaby bihd...
30794    weekend fun with my girls sunday blondie smile...
15680    my new furbaby...familylife awesome cool cooln...
29450       i'm happy to make myself happy ð¸ ps4 gamer 
17610    bihday ããããçææ°ç´ãã ã¸ã£ã...
12678    when a season ends with a cliffhanger..fcku ha...
13269    "a flower does not think of competing with the...
21643    USER a fantastic read. surely something needs ...
12811    schadenfreude: had a job all my life. these wa...
23257    to learn more about hungerinhouston from USER ...
4018     USER hey when are you guys gonna release the g...
28430    USER european formulaone grand prix USER ð ...
3856     USER USER USER USER i have never ever been cal.

In [None]:
# Persist the cleaned frame. The row index is written as the first (unnamed)
# CSV column, which shows up as 'Unnamed: 0' when the file is read back.
data.to_csv(path_or_buf='preprocessed_data.csv', index=True)

**If it takes 60 Mins till here, you are doing Great** <br>
**If not! You also are doing Great**

### Modelling

In [None]:
# Hold out 30% of the rows for testing.
# - random_state makes the split (and every score below) reproducible.
# - stratify keeps the share of the rare hate-speech class equal in both
#   parts; the label counts above show it is only ~7% of the data.
train, test = train_test_split(
    data,
    test_size=0.3,
    random_state=42,
    stratify=data['label'],
)

In [None]:
# Baseline: unigram bag-of-words counts fed to logistic regression.
vec = CountVectorizer()
# The solver stops at its default iteration limit on these unscaled count
# features and warns that it did not converge, so raise the cap.
clf = LogisticRegression(max_iter=1000)
pipe = make_pipeline(vec, clf)
pipe.fit(train.tweet, train.label);

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


#### Evaluation

In [None]:
def print_report(pipe, x_test, y_test):
    """Print a classification report and the overall accuracy for a fitted pipeline.

    Args:
        pipe: Fitted estimator exposing ``predict``.
        x_test: Inputs passed to ``pipe.predict``.
        y_test: True labels the predictions are compared against.
    """
    predictions = pipe.predict(x_test)
    print(metrics.classification_report(y_test, predictions))
    accuracy = metrics.accuracy_score(y_test, predictions)
    print(f"accuracy: {accuracy:0.3f}")

print_report(pipe, test.tweet, test.label)

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      8908
           1       0.84      0.49      0.62       681

    accuracy                           0.96      9589
   macro avg       0.90      0.74      0.80      9589
weighted avg       0.95      0.96      0.95      9589

accuracy: 0.957


### Enhancement

- Using different N-grams
- Using different text representation technique

In [None]:
# Variant 2: word uni-, bi- and tri-gram counts.
vec_2 = CountVectorizer(ngram_range=(1, 3))
# The default iteration limit is hit before convergence on this feature
# set as well, so raise the cap.
clf_2 = LogisticRegression(max_iter=1000)
pipe_2 = make_pipeline(vec_2, clf_2)
pipe_2.fit(train.tweet, train.label);

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
# Score the 1-3 gram model on the held-out tweets.
print_report(pipe=pipe_2, x_test=test['tweet'], y_test=test['label'])

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      8908
           1       0.90      0.43      0.58       681

    accuracy                           0.96      9589
   macro avg       0.93      0.71      0.78      9589
weighted avg       0.95      0.96      0.95      9589

accuracy: 0.956


In [None]:
# Variant 3: word n-gram counts for n = 1..5, same classifier settings.
vec_3 = CountVectorizer(ngram_range=(1, 5))
clf_3 = LogisticRegression()
# fit() returns the pipeline itself, so pipe_3 is the fitted pipeline.
pipe_3 = make_pipeline(vec_3, clf_3).fit(train['tweet'], train['label'])

In [None]:
# Score the 1-5 gram model on the held-out tweets.
print_report(pipe=pipe_3, x_test=test['tweet'], y_test=test['label'])

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      8908
           1       0.91      0.40      0.56       681

    accuracy                           0.95      9589
   macro avg       0.93      0.70      0.77      9589
weighted avg       0.95      0.95      0.95      9589

accuracy: 0.955


In [None]:
# Variant 4: TF-IDF over character 3- to 5-grams taken inside word
# boundaries, keeping only n-grams present in 1% to 30% of the documents.
vec_4 = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 5),
    min_df=.01,
    max_df=.3,
)
clf_4 = LogisticRegression()
# fit() returns the pipeline itself, so pipe_4 is the fitted pipeline.
pipe_4 = make_pipeline(vec_4, clf_4).fit(train['tweet'], train['label'])

In [None]:
# Score the character n-gram TF-IDF model on the held-out tweets.
print_report(pipe=pipe_4, x_test=test['tweet'], y_test=test['label'])

              precision    recall  f1-score   support

           0       0.96      0.99      0.97      8908
           1       0.84      0.40      0.54       681

    accuracy                           0.95      9589
   macro avg       0.90      0.70      0.76      9589
weighted avg       0.95      0.95      0.94      9589

accuracy: 0.952


In [None]:
# Spot check: print five cleaned tweets and the baseline model's labels for them.
# NOTE(review): these rows come from the full frame, so some of them may
# have been part of the training split.
spot_check_tweets = data.tweet[10:15]
print(spot_check_tweets)
print(pipe.predict(spot_check_tweets))

10    â ireland consumer price index (mom) climbed...
11    we are so selfish. orlando standwithorlando pu...
12     i get to see my daddy today!! 80days gettingfed 
13    USER cnn calls michigan middle school 'build t...
14    no comment! in australia opkillingbay seasheph...
Name: tweet, dtype: object
[0 0 0 1 1]


### After saving the preprocessed data, we'll try applying more advanced techniques to it

In [None]:
# Reload the cleaned tweets that were saved to disk earlier in the notebook.
preprocessed_data = pd.read_csv(filepath_or_buffer="/content/preprocessed_data.csv")
preprocessed_data

Unnamed: 0.1,Unnamed: 0,id,label,tweet
0,0,1,0,USER when a father is dysfunctional and is so ...
1,1,2,0,USER USER thanks for lyft credit i can't use c...
2,2,3,0,bihday your majesty
3,3,4,0,model i love you take with you all the time in...
4,4,5,0,factsguide: society now motivation
...,...,...,...,...
31957,31957,31958,0,ate USER isz that youuu?ðððððð...
31958,31958,31959,0,to see nina turner on the airwaves trying to w...
31959,31959,31960,0,listening to sad songs on a monday morning otw...
31960,31960,31961,1,"USER sikh temple vandalised in in calgary, wso..."


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary cap handed to the tokenizer (num_words) and reused later as the
# embedding layer's input dimension.
max_features = 1000

# Target frame: everything that remains once the text and id columns are dropped.
y = preprocessed_data.drop(columns=['tweet', 'id', 'Unnamed: 0'])

tweet_texts = preprocessed_data['tweet'].values
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(tweet_texts)

# Turn each tweet into a sequence of integer word ids, then pad all
# sequences to a common length.
X = pad_sequences(tokenizer.texts_to_sequences(tweet_texts))

In [None]:
# First 25 target rows.
y.iloc[:25]

Unnamed: 0,label
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [None]:
# Stage 1: set aside 40% of all rows as the test set.
Xtrain, X_test, ytrain, y_test = train_test_split(
    X, y,
    test_size=0.4,
    random_state=41,
)

# Stage 2: split the remaining 60% again, 60/40, into training and
# validation data (36% / 24% of the full data set).
X_train, X_val, y_train, y_val = train_test_split(
    Xtrain, ytrain,
    test_size=0.4,
    random_state=41,
)

In [None]:
# Import everything through tensorflow.keras, the same namespace the
# Tokenizer and pad_sequences come from. Mixing it with the standalone
# `keras` package, or importing GRU from the private module path
# `keras.layers.recurrent`, is not portable across Keras releases.
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.layers import GRU

# Width of the word embeddings; also used as the number of GRU units.
embed_dim = 32

# Embedding -> single GRU layer -> one sigmoid unit for the binary label.
model_g = models.Sequential()
# Vocabulary size matches the tokenizer's num_words; the input length is the
# padded sequence length of X.
model_g.add(layers.Embedding(max_features, embed_dim, input_length=X.shape[1]))
model_g.add(GRU(embed_dim, dropout=0.4, recurrent_dropout=0.5))

model_g.add(layers.Dense(1, activation='sigmoid'))
model_g.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train for five epochs, reporting validation metrics after each epoch.
his3 = model_g.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=5,
    verbose=1,
    validation_data=(X_val, y_val),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Evaluate on the held-out test split. Index 1 of both the result and
# metrics_names is the first metric after the loss.
score = model_g.evaluate(X_test, y_test, verbose=0)
metric_name = model_g.metrics_names[1]
print(f"{metric_name}: {score[1] * 100:.2f}%")

accuracy: 94.20%


#### Done!