# Covid-19 : To vaccinate or not to vaccinate

#### Analysing social media sentiment towards vaccines

![alt text](https://images.financialexpress.com/2020/06/1-126.jpg)

In [237]:
import numpy as np 
import pandas as pd
import seaborn as sns
import re
import nltk 
import math
nltk.download('stopwords')

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error, confusion_matrix


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [238]:
train = pd.read_csv("Train (2).csv")
test = pd.read_csv("Test (2).csv")

In [239]:
train.head()

Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0


In [240]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweet_id   10001 non-null  object 
 1   safe_text  10001 non-null  object 
 2   label      10000 non-null  float64
 3   agreement  9999 non-null   float64
dtypes: float64(2), object(2)
memory usage: 312.7+ KB


In [241]:
test.head()

Unnamed: 0,tweet_id,safe_text
0,00BHHHP1,<user> <user> ... &amp; 4 a vaccine given 2 he...
1,00UNMD0E,Students starting school without whooping coug...
2,01AXPTJF,"I'm kinda over every ep of <user> being ""rippe..."
3,01HOEQJW,How many innocent children die for lack of vac...
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though..."


In [242]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5177 entries, 0 to 5176
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   5177 non-null   object
 1   safe_text  5176 non-null   object
dtypes: object(2)
memory usage: 81.0+ KB


In [243]:
# drop rows with null values 
train = train.dropna()

# drop tweetID and agreement columns
train = train.drop(['tweet_id', 'agreement'], axis=1)

In [244]:
# split train data into features and lables 
X = train['safe_text'].values
y = train['label'].values

In [245]:
def tweet_cleaner(tweets):

    """
    This function uses regular expressions to remove special characters, 
    punctuation, numbers and any extra white space from tweets 
    then converts everything to lowercase letters.

    Input:
    tweets: original tweet
           datatype: 'str'

    Output:
    clean_tweets: modified tweet
           datatype: 'str'
    """

    clean_tweets = []

    for sentence in range(0, len(tweets)):
      
        # Remove all the special characters
        processed_tweets = re.sub(r'\W', ' ', str(tweets[sentence]))

        # remove all single characters
        processed_tweets = re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_tweets)

        # Remove single characters from the start
        processed_tweets = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_tweets) 

        # Substituting multiple spaces with single space
        processed_tweets = re.sub(r'\s+', ' ', processed_tweets, flags=re.I)

        # Converting to Lowercase
        processed_tweets = processed_tweets.lower()

        # Remove numbers
        processed_tweets = re.sub(r'\d+', '', processed_tweets)  

        # Remove <user> tags
        processed_tweets = re.sub(r'<.*?>', '', processed_tweets)

        # Replace &amp; with 'and'
        processed_tweets = re.sub(r"&amp;", "and", processed_tweets)  

        clean_tweets.append(processed_tweets)

    return clean_tweets

X = tweet_cleaner(X)
test_tweets = tweet_cleaner(test['safe_text'])

In [246]:
# Convert text to numbers

vectorizer = TfidfVectorizer(min_df=3,  
                             max_features=None, 
                             strip_accents='unicode', 
                             analyzer='word',token_pattern=r'\w{1,}',
                             ngram_range=(1, 3), 
                             use_idf=1,
                             smooth_idf=1,
                             sublinear_tf=1,
                             stop_words = 'english')

X = vectorizer.fit_transform(X).toarray()  

In [247]:
# Create test and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [248]:
lr = LogisticRegression(class_weight='balanced', max_iter=500).fit(X_train, y_train) 
val_predictions_proba = lr.predict_proba(X_test)

In [249]:
def probs(predictions):

  """
  This function converts the multiclass probability array into a single 
  value between -1 and 1 for each class (-1, 0, 1)
  This value is the probability of a tweet belonging to the most likely class

  input: preds
         datatype: array
  
  output: final_predictions
          datatype: array
  """

  final_predictions = []

  for pred in predictions:
    argmax = pred.argmax()

    if argmax == 0:
      final_predictions.append(-1*pred[0])

    elif argmax == 1:
      final_predictions.append(0)

    else:
      final_predictions.append(pred[2])

  return final_predictions

y_pred = probs(val_predictions_proba)

In [250]:
# Model evaluation
print(math.sqrt(mean_squared_error(y_test, y_pred)))

0.6011345497965763


In [252]:
# Submission

# Prepare test tweets for the model and predict labels
X_test = vectorizer.transform(test_tweets).toarray() 
y_pred = lr.predict_proba(X_test)
y_preds = probs(y_pred)

# Create submission file
submission = pd.DataFrame({"tweet_id":test['tweet_id'], "label":y_preds})
submission.to_csv("submission.csv", index=False)
submission.head()

Unnamed: 0,tweet_id,label
0,00BHHHP1,-0.574569
1,00UNMD0E,0.706157
2,01AXPTJF,0.0
3,01HOEQJW,0.68603
4,01JUKMAO,-0.407879
