## Cleaning and Put to CSV file

In [1]:
import keras
import numpy as np
import pandas as pd
import os
import glob
import math
import re
df = tweets = pd.read_excel("./healthcareoutput.xlsx")
df = df[['TweetText','Polarity']]
def processRow(row):
    #Lower case
    tweet = row.lower()
    
    #delete any url
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','',tweet)
    #delete any @Username
    tweet = re.sub('@[^\s]+','',tweet) #
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    tweet = re.sub('[\n]+', ' ', tweet)
    #Remove not alphanumeric symbols white spaces
    tweet = re.sub(r'[^\w]', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #Remove Digits
    tweet = re.sub(" \d+", '', tweet)
    tweet = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", tweet)
    #Remove :( or :)
    tweet = tweet.replace(':)','')
    tweet = tweet.replace(':(','')
    #trim
    tweet = tweet.strip('\'"')
    row = tweet
    return row

testing = np.array(list(df['TweetText'][:15000]))  
for x in range(0,testing.shape[0]):
    testing[x] = processRow(testing[x])
    
df_clean = df.copy()
df_clean.TweetText = testing
#df_clean.to_csv("clean_healthcaretweet.csv", index = False)

Using TensorFlow backend.


In [2]:
df

Unnamed: 0,TweetText,Polarity
0,"@plambrechtsen Fyi, you can do the same in ""Un...",0.0
1,China should be paying for the healthcare of a...,-1.0
2,@thetenderforlaw @megaholt There’s no lying ab...,1.0
3,"@ChrisMurphyCT Chris \nThe incompetence, fake ...",-1.0
4,THANK YOU TO OUR NURSES!!! THE HEALTHCARE WORK...,0.0
5,CDC analysis shows coronavirus poses serious r...,-1.0
6,Hackers Promise 'No More Healthcare Cyber Atta...,1.0
7,SA’s healthcare system has only around 3 000 c...,-1.0
8,We've got a two tiered healthcare system: one ...,1.0
9,"@republic Idiot, he told government,hospitals...",-1.0


## Naive Bayes and SVM

In [3]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

#Set Random seed
np.random.seed(500)

# Add the Data using pandas
Corpus = pd.read_csv(r"clean_healthcaretweet.csv",encoding='latin-1')
Corpus['Polarity'] = Corpus['Polarity'].apply(str) #converts the float string into string/obj for processing
#print(Corpus.dtypes)

# Step - 1a : Tokenization : In this each entry in the corpus will be broken into set of words
Corpus['TweetText']= [word_tokenize(str(entry)) for entry in Corpus['TweetText']]
# Step - 1b: Perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

[nltk_data] Downloading package punkt to /home/hoang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/hoang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/hoang/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
for index,entry in enumerate(Corpus['TweetText']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    Corpus.loc[index,'text_final'] = str(Final_words)

print(Corpus['text_final'].head())

0         ['fyi', 'univeral', 'healthcare', 'germany']
1    ['china', 'pay', 'healthcare', 'get', 'disease...
2    ['lie', 'healthcare', 'work', 'healthcare', 's...
3    ['chris', 'incompetence', 'fake', 'empathy', '...
4    ['thank', 'nurse', 'healthcare', 'worker', 'th...
Name: text_final, dtype: object


In [5]:
# Step - 2: Split the model into Train and Test Data set
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['Polarity'],test_size=0.2)

# Step - 3: Label encode the target variable  - This is done to transform Categorical data of string type in the data set into numerical values
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)
# Step - 4: Vectorize the words by using TF-IDF Vectorizer - This is done to find how important a word in document is in comaprison to the corpus
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)


In [6]:
# Step - 5: Now we can run different algorithms to classify out data check for accuracy

# Classifier - Algorithm - Naive Bayes
# fit the training dataset on the classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)


# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

Naive Bayes Accuracy Score ->  82.39999999999999
SVM Accuracy Score ->  82.43333333333334


In [17]:
set(Test_Y)

{0, 1, 2, 3}

In [18]:
set(predictions_SVM)

{0, 3}

In [8]:
result_dict = {'id': Test_X.id,
               'Text': df['TweetText'][df.index == Test_X.id],
               'target': SVM.predict(Test_X_Tfidf)}
submission = pd.DataFrame.from_dict(result_dict)
submission
# submission.to_csv('lr_submission.csv', index=False)

AttributeError: 'Series' object has no attribute 'id'