In [None]:
from collections import defaultdict

import nltk
import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn import model_selection, naive_bayes, svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [None]:
# Pin the seed of NumPy's global random state so NumPy-driven randomness
# later in the notebook repeats from run to run.
np.random.seed(seed=500)

In [None]:
# Read the training split, keeping only the text and label columns.
train_columns = ['Text', 'Label']
Corpus = pd.read_csv('Hope_ENG_train.csv', usecols=train_columns)

# Render the loaded frame as this cell's output.
Corpus

Unnamed: 0,Text,Label
0,these tiktoks radiate gay chaotic energy and i...,Non_hope_speech
1,@Champions Again He got killed for using false...,Non_hope_speech
2,It's not that all lives don't matter,Non_hope_speech
3,Is it really that difficult to understand? Bla...,Non_hope_speech
4,Whenever we say black isn't that racists? Why...,Non_hope_speech
...,...,...
22735,It's a load of bollocks every life matters sim...,Non_hope_speech
22736,no say it because all lives matter! deku would...,Non_hope_speech
22737,God says her life matters,Non_hope_speech
22738,This video is just shit. A bunch of whiny ass ...,Non_hope_speech


In [None]:
# Fetch the NLTK data packages used by the preprocessing cells: WordNet (lemmatizer),
# Punkt (word_tokenize), the averaged-perceptron tagger (pos_tag) and the stop-word lists.
# `nltk` itself is imported with the other dependencies in the first cell.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Step - a : Remove rows whose text is missing.
# dropna() has to be applied to the frame: calling it on the 'Text' column alone does not
# remove the rows from Corpus, and a NaN entry would then crash on .lower() below.
# The index is rebuilt so it stays a contiguous 0..n-1 range after rows are removed.
Corpus = Corpus.dropna(subset=['Text']).reset_index(drop=True)
# Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
Corpus['Text'] = [entry.lower() for entry in Corpus['Text']]
# Step - c : Tokenization : each entry in the corpus is broken into a list of words
Corpus['Text'] = [word_tokenize(entry) for entry in Corpus['Text']]
# Step - d : Remove stop words and non-alphabetic tokens, then lemmatize what is left.
# WordNetLemmatizer needs a POS tag to know whether a word is a noun, verb, adjective, etc.
# The first letter of the tagger's tag selects the WordNet POS; anything else falls back to noun.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
# Loop-invariant helpers: load the stop-word list once (as a set, for O(1) membership tests)
# and create a single lemmatizer instead of one per row.
english_stopwords = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()
processed_texts = []
for entry in Corpus['Text']:
    # pos_tag yields (word, tag) pairs for the tokens of this entry.
    Final_words = [
        word_Lemmatized.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        if word.isalpha() and word not in english_stopwords
    ]
    # The processed token list is stored as its string representation, one per row.
    processed_texts.append(str(Final_words))
# Assign the whole column at once (positional), rather than writing row by row via .loc.
Corpus['text_final'] = processed_texts

In [None]:
# Read the dev split and relabel its columns to the names used for the training frame.
dev_columns = ['Text', 'Label']
Corpus1 = pd.read_csv('Hope_ENG_dev.csv')
Corpus1.columns = dev_columns
# Render the loaded frame as this cell's output.
Corpus1

Unnamed: 0,Text,Label
0,@Generation X Counting money that she been giv...,Non_hope_speech
1,@Paola Hernandez i never said to be intolerant...,Non_hope_speech
2,@Firstlast300 Wow An opinion is that I don't l...,Non_hope_speech
3,WOW!!!!!!!That was so so inspiring and incredi...,Hope_speech
4,@FALC0n Yea sorry I know Asian is an ethnicit...,Non_hope_speech
...,...,...
2835,Such fake sentiment. .,Non_hope_speech
2836,@A G black lives arent undervalued compared to...,Non_hope_speech
2837,People who pulled it down can and will be arre...,Non_hope_speech
2838,@Aaron Castellanos It will be a two hour movie...,Non_hope_speech


In [None]:
# Step - a : Remove rows whose text is missing.
# dropna() has to be applied to the frame: calling it on the 'Text' column alone does not
# remove the rows from Corpus1, and a NaN entry would then crash on .lower() below.
# The index is rebuilt so it stays a contiguous 0..n-1 range after rows are removed.
Corpus1 = Corpus1.dropna(subset=['Text']).reset_index(drop=True)
# Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
Corpus1['Text'] = [entry.lower() for entry in Corpus1['Text']]
# Step - c : Tokenization : each entry in the corpus is broken into a list of words
Corpus1['Text'] = [word_tokenize(entry) for entry in Corpus1['Text']]
# Step - d : Remove stop words and non-alphabetic tokens, then lemmatize what is left.
# WordNetLemmatizer needs a POS tag to know whether a word is a noun, verb, adjective, etc.
# The first letter of the tagger's tag selects the WordNet POS; anything else falls back to noun.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
# Loop-invariant helpers: load the stop-word list once (as a set, for O(1) membership tests)
# and create a single lemmatizer instead of one per row.
english_stopwords = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()
processed_texts = []
for entry in Corpus1['Text']:
    # pos_tag yields (word, tag) pairs for the tokens of this entry.
    Final_words = [
        word_Lemmatized.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        if word.isalpha() and word not in english_stopwords
    ]
    # The processed token list is stored as its string representation, one per row.
    processed_texts.append(str(Final_words))
# Assign the whole column at once (positional), rather than writing row by row via .loc.
Corpus1['text_final'] = processed_texts

In [None]:
# Inspect the dev frame after preprocessing: 'Text' now holds token lists and
# 'text_final' the stringified list of lemmatized tokens.
Corpus1

Unnamed: 0,Text,Label,text_final
0,"[@, generation, x, counting, money, that, she,...",Non_hope_speech,"['generation', 'x', 'count', 'money', 'give', ..."
1,"[@, paola, hernandez, i, never, said, to, be, ...",Non_hope_speech,"['paola', 'hernandez', 'never', 'say', 'intole..."
2,"[@, firstlast300, wow, an, opinion, is, that, ...",Non_hope_speech,"['wow', 'opinion', 'like', 'color']"
3,"[wow, !, !, !, !, !, !, !, that, was, so, so, ...",Hope_speech,"['wow', 'inspiring', 'incredible', 'speech', '..."
4,"[@, falc0n, yea, sorry, i, know, asian, is, an...",Non_hope_speech,"['yea', 'sorry', 'know', 'asian', 'ethnicity',..."
...,...,...,...
2835,"[such, fake, sentiment, ., .]",Non_hope_speech,"['fake', 'sentiment']"
2836,"[@, a, g, black, lives, arent, undervalued, co...",Non_hope_speech,"['g', 'black', 'live', 'arent', 'undervalue', ..."
2837,"[people, who, pulled, it, down, can, and, will...",Non_hope_speech,"['people', 'pull', 'arrest', 'cause', 'damage'..."
2838,"[@, aaron, castellanos, it, will, be, a, two, ...",Non_hope_speech,"['aaron', 'castellanos', 'two', 'hour', 'movie..."


In [None]:
# Randomly hold out 30% of the training corpus as a test split.
# NOTE(review): the cell that follows reassigns all four names (full training corpus vs.
# the separate dev corpus), so this split looks unused — confirm whether it can be dropped.
holdout_parts = model_selection.train_test_split(
    Corpus['text_final'],
    Corpus['Label'],
    test_size=0.3,
)
Train_X, Test_X, Train_Y, Test_Y = holdout_parts

In [None]:
# Fit on the whole training corpus and evaluate on the separate dev corpus.
Train_X, Train_Y = Corpus['text_final'], Corpus['Label']
Test_X, Test_Y = Corpus1['text_final'], Corpus1['Label']

In [None]:
# Learn the label -> integer mapping from the training labels only, then reuse that same
# mapping for the dev labels. Re-fitting the encoder on the dev labels could silently assign
# different integers if the two splits did not contain the same set of classes; `transform`
# instead raises on a label that was not seen during fitting.
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.transform(Test_Y)

In [None]:
# Build a TF-IDF vocabulary (capped at 5000 terms) from the processed training text,
# then project both splits into that feature space.
Tfidf_vect = TfidfVectorizer(max_features=5000).fit(Corpus['text_final'])
Train_X_Tfidf, Test_X_Tfidf = (
    Tfidf_vect.transform(split_text) for split_text in (Train_X, Test_X)
)

In [None]:
# Print the fitted term -> feature-index mapping learned by the vectorizer.
print(Tfidf_vect.vocabulary_)



In [None]:
# Print the sparse training matrix; each printed entry is (row, feature index) followed by its TF-IDF weight.
print(Train_X_Tfidf)

  (0, 3632)	0.5181064004772656
  (0, 2685)	0.2450330630311686
  (0, 1820)	0.3713592380160023
  (0, 1479)	0.4711754958361254
  (0, 742)	0.5582208561759495
  (1, 4715)	0.41510241358128747
  (1, 2921)	0.4949240357738261
  (1, 2499)	0.39878769509795586
  (1, 1848)	0.3029858948196717
  (1, 1607)	0.5761146646555014
  (2, 2785)	0.6903990542010492
  (2, 2612)	0.7234287428339414
  (3, 4855)	0.09305618972422791
  (3, 4759)	0.16059089958458167
  (3, 4669)	0.12448210284901183
  (3, 4613)	0.1299017044376574
  (3, 4591)	0.13996879015886968
  (3, 4248)	0.2918440310865732
  (3, 4070)	0.15857348303659208
  (3, 3691)	0.17520221474603093
  (3, 3674)	0.10982229661417542
  (3, 3657)	0.18111366017415242
  (3, 3441)	0.11451997465588003
  (3, 3340)	0.14849156826296547
  (3, 2987)	0.1258043545809999
  :	:
  (22735, 2612)	0.2593806024991741
  (22735, 1545)	0.41872612203322823
  (22736, 4930)	0.34360148521983164
  (22736, 4810)	0.3847009365851902
  (22736, 3894)	0.530556932757717
  (22736, 2785)	0.24683960449692

In [None]:
# Classifier - Algorithm - SVM
# Linear-kernel SVC fitted on the TF-IDF training features. `degree` and `gamma` are not
# passed: they only affect the poly / rbf / sigmoid kernels and are ignored for kernel='linear'.
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X_Tfidf, Train_Y)
# Predict the encoded labels of the evaluation split
predictions_SVM = SVM.predict(Test_X_Tfidf)
# accuracy_score is documented as (y_true, y_pred); the arguments are passed in that order,
# matching the classification_report call further down. The score itself is symmetric.
# NOTE(review): the recorded report shows a heavily imbalanced dev set (272 vs 2568), so
# accuracy alone says little about the minority class.
print("SVM Accuracy Score -> ",accuracy_score(Test_Y, predictions_SVM)*100)

SVM Accuracy Score ->  91.1619718309859


In [None]:
# Inspect the predicted (integer-encoded) labels for the evaluation split.
predictions_SVM

(2840,)

In [None]:
# Inspect the true (integer-encoded) labels of the evaluation split.
Test_Y

(2840,)

In [None]:
# Per-class precision / recall / F1 on the evaluation split.
# `classification_report` is imported with the other dependencies in the first cell.
# The 0 / 1 rows are LabelEncoder ids; the encoder sorts class names, so 0 should be
# 'Hope_speech' and 1 'Non_hope_speech' — check Encoder.classes_ to confirm.
print(classification_report(Test_Y, predictions_SVM, labels=[0, 1]))

              precision    recall  f1-score   support

           0       0.66      0.16      0.26       272
           1       0.92      0.99      0.95      2568

    accuracy                           0.91      2840
   macro avg       0.79      0.58      0.61      2840
weighted avg       0.89      0.91      0.89      2840

