In [1]:
# Import required packages
import numpy as np
import pandas as pd
import logging
import json
import string
import re
from pprint import pprint
import matplotlib.pyplot as plt
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from hatesonar import Sonar

import warnings
warnings.filterwarnings('ignore')

The deprecated function was deprecated in Matplotlib 3.4 and will be removed two minor releases later.
  from .axes3d import Axes3D


In [2]:
!pip install lightgbm


[notice] A new release of pip available: 22.1.2 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip




In [3]:
# Build the stop-word set: NLTK's English list plus corpus-specific noise terms.
# Extend the set literal below to add more terms. "re" is listed because it is a
# leftover fragment of "are" (e.g. "we're" once punctuation has been stripped).
# NOTE(review): "governer" and "lousiana" are kept exactly as originally spelled;
# confirm whether the correctly spelled "louisiana" should be filtered as well.
stop_words = set(stopwords.words('english')) | {
    "governer", "re", "governor", "watch", "live", "facebook", "beshear",
    "hutchinson", "lousiana", "youtube", "mercer", "middlesex", "monmouth",
    "morris", "ocean", "passaic", "salem", "somerset", "sussex", "union",
    "warren", "update", "briefing", "press", "conference", "updates", "et",
    "news",
}

In [4]:
def common_member(a, b):
    """Return True when the iterables ``a`` and ``b`` share at least one element."""
    shared = set(a).intersection(set(b))
    return len(shared) > 0

In [6]:
def clean_text(text):
    '''
    Normalise a raw tweet before stop-word removal and vectorisation.

    Steps, applied in this order:
      1. make the text lowercase,
      2. remove a leading retweet marker ("rt" as a whole word at the start),
      3. remove text in square brackets,
      4. remove links,
      5. remove HTML-like tags such as <u> </u>,
      6. remove user mentions (@name),
      7. remove ASCII punctuation,
      8. remove newlines,
      9. remove numbers and words containing numbers.

    Every removed span is replaced by a single space, so the result can contain
    leading, trailing or repeated spaces.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned, lowercased text.
    '''

    re_url = r'(?:http|ftp|https)://(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
    text = text.lower()
    # \b keeps the match to the standalone "rt" marker; without it the first two
    # letters of any tweet starting with e.g. "rtx" would be stripped.
    text = re.sub(r'^rt\b', ' ', text)
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub(re_url, ' ', text)
    # Removes <u> </u> and any similar HTML tag
    text = re.sub(r'<.*?>+', ' ', text)
    text = re.sub(r'@\w+', ' ', text)
    # string.punctuation is ASCII only, so characters such as the typographic
    # apostrophe are left in place.
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', ' ', text)

    return text

In [7]:
# Load the raw tweets from the Excel export
df = pd.read_excel("Raw Tweets.xlsx")

# Flag retweets: 'Yes' when the raw text contains the "RT @" marker, otherwise 'No'
df['Retweet'] = df['Tweets'].map(lambda tweet: 'Yes' if "RT @" in tweet else 'No')

# Clean each tweet, then drop stop words and re-join the remaining tokens with single spaces
df['Tweets_cleaned'] = (
    df['Tweets']
    .apply(clean_text)
    .apply(lambda cleaned: ' '.join(token for token in cleaned.split() if token not in stop_words))
)

df.head(5)

Unnamed: 0,State,Role,Name,Tweets,num_date,Party,Status,Retweet,Tweets_cleaned
0,Minnesota,Senator,Amy Klobuchar,Seriously? Maybe the President should focus o...,2020-03-14,Democrat,4,No,seriously maybe president focus getting testin...
1,Minnesota,Senator,Amy Klobuchar,There are so many heroes on the front lines as...,2020-03-14,Democrat,4,No,many heroes front lines face pandemic nurses d...
2,Minnesota,Senator,Amy Klobuchar,RT @Yamiche : This is why @JudyWoodruff is ama...,2020-03-14,Democrat,4,Yes,amazing reminded viewers tonight think others ...
3,Minnesota,Senator,Amy Klobuchar,With the House passing the #FamiliesFirstBill ...,2020-03-14,Democrat,4,No,house passing familiesfirstbill last night sen...
4,Minnesota,Senator,Amy Klobuchar,Seniors are more at risk to the coronavirus pa...,2020-03-14,Democrat,4,No,seniors risk coronavirus pandemic cities like ...


In [8]:
# Vectorise the cleaned tweets over the same vocabulary as the previously trained model

# Vocabulary saved from the previous model.
# NOTE(review): joblib.load unpickles the file — only load artefacts from a trusted source.
Model_vocab = joblib.load('vocabulary.pickle')

# TF-IDF over word 1- to 3-grams, restricted to that fixed vocabulary.
# NOTE(review): fit_transform re-estimates the IDF weights on this dataset; only the
# vocabulary comes from the previous model — confirm this matches how the classifier
# was trained.
# NOTE(review): max_features appears to be ignored by scikit-learn when an explicit
# vocabulary is supplied — confirm and drop it if so.
Tf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    lowercase=True,
    use_idf=True,
    max_features=5000000,
    vocabulary=Model_vocab,
)
X = Tf.fit_transform(df['Tweets_cleaned'])
X

<61466x340198 sparse matrix of type '<class 'numpy.float64'>'
	with 1112932 stored elements in Compressed Sparse Row format>

## Religion analysis

In [9]:
# Load the previously trained classifier (described by the author as an LGBM model
# scoring 99% on a public dataset) and apply it to the vectorised tweets in X.
# predict() supplies the class labels and predict_proba() the per-class probabilities.
# NOTE(review): joblib.load unpickles the file — only load artefacts from a trusted source.

Model_ = joblib.load('classifier.pickle')
Religion_predictions = Model_.predict(X)
Religion_percentages = Model_.predict_proba(X)

In [10]:
# Turn the class probabilities into whole-number percentages:
# column 0 goes to `no`, column 1 goes to `yes`.
# The values are scaled to 0-100 *before* rounding. Rounding to two decimals and
# then multiplying by 100 re-introduces floating-point noise
# (e.g. round(0.57, 2) * 100 == 56.99999999999999).
_religion_pct = np.rint(np.asarray(Religion_percentages) * 100)
no = _religion_pct[:, 0].tolist()
yes = _religion_pct[:, 1].tolist()

In [11]:
def relabel(i):
    """Map a predicted class to its label: 'Yes' when it equals 1, otherwise 'No'."""
    return 'Yes' if i == 1 else 'No'
# Attach the religion results to the tweets: a Yes/No label plus both class percentages
df['Religion'] = [relabel(prediction) for prediction in Religion_predictions]
df['Religion_No'] = no
df['Religion_Yes'] = yes
df

Unnamed: 0,State,Role,Name,Tweets,num_date,Party,Status,Retweet,Tweets_cleaned,Religion,Religion_No,Religion_Yes
0,Minnesota,Senator,Amy Klobuchar,Seriously? Maybe the President should focus o...,2020-03-14,Democrat,4,No,seriously maybe president focus getting testin...,No,97.0,3.0
1,Minnesota,Senator,Amy Klobuchar,There are so many heroes on the front lines as...,2020-03-14,Democrat,4,No,many heroes front lines face pandemic nurses d...,No,99.0,1.0
2,Minnesota,Senator,Amy Klobuchar,RT @Yamiche : This is why @JudyWoodruff is ama...,2020-03-14,Democrat,4,Yes,amazing reminded viewers tonight think others ...,No,96.0,4.0
3,Minnesota,Senator,Amy Klobuchar,With the House passing the #FamiliesFirstBill ...,2020-03-14,Democrat,4,No,house passing familiesfirstbill last night sen...,No,100.0,0.0
4,Minnesota,Senator,Amy Klobuchar,Seniors are more at risk to the coronavirus pa...,2020-03-14,Democrat,4,No,seniors risk coronavirus pandemic cities like ...,No,73.0,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...
61461,Texas,Senator,Ted Cruz,Exactly the right move. Last year @SenBillCass...,2020-05-31,Republican,1,No,exactly right move last year amp called design...,No,98.0,2.0
61462,South Carolina,Senator,Tim Scott,RT @postandcourier : It’s 11:00. A county-wide...,2020-05-31,Republican,1,Yes,it’s county wide curfew effect charleston,No,99.0,1.0
61463,South Carolina,Senator,Tim Scott,RT @FoxNewsSunday : EXCLUSIVE: Chris will be j...,2020-05-31,Republican,1,Yes,exclusive chris joined senator tim scott discu...,No,99.0,1.0
61464,South Carolina,Senator,Tim Scott,For those who believe that violence is a way t...,2020-05-31,Republican,1,No,believe violence way react selfishness tolerat...,No,99.0,1.0


## Sentiment analysis

In [12]:
# Import NLTK's VADER sentiment analyzer.
# NOTE(review): this rebinds SentimentIntensityAnalyzer, which the first cell already
# imported from vaderSentiment; from this point the name refers to NLTK's class.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
# Download the VADER lexicon (nltk.download returns True, which is the cell's output)
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\wajih\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [13]:
# Score every cleaned tweet with VADER. For each tweet we keep the label of the
# dominant neg/neu/pos proportion and the three proportions as percentages.
# NOTE(review): VADER's authors recommend thresholding the 'compound' score
# (>= 0.05 positive, <= -0.05 negative) for a single label; the proportion-based
# rule is kept here to stay consistent with the existing labelling — confirm.
analyzer = SentimentIntensityAnalyzer()

# VADER score key -> label written to the 'Sentiments' column
# (built once instead of on every loop iteration)
_SENTIMENT_LABELS = {'neg': "negative", 'neu': "neutral", 'pos': "positive"}


def _dominant_sentiment(scores):
    """Return the label of the largest neg/neu/pos score, or "neutral" on a tie.

    Ties include the all-zero scores VADER gives a tweet that is empty after
    cleaning. Taking the first maximum would label every such tweet "negative",
    because 'neg' is the first key.
    """
    top = max(scores[key] for key in _SENTIMENT_LABELS)
    leaders = [key for key in _SENTIMENT_LABELS if scores[key] == top]
    return _SENTIMENT_LABELS[leaders[0]] if len(leaders) == 1 else "neutral"


sentiments = []
neg = []
pos = []
neu = []

for tw in df['Tweets_cleaned']:
    vs = analyzer.polarity_scores(tw)
    # 'compound' is not part of the comparison; the scores dict is left unmodified
    sentiments.append(_dominant_sentiment(vs))
    # Round after scaling so values such as 0.29 * 100 do not come out as
    # 28.999999999999996
    neg.append(round(vs['neg'] * 100, 1))
    neu.append(round(vs['neu'] * 100, 1))
    pos.append(round(vs['pos'] * 100, 1))

df['Sentiments'] = sentiments
df['Negative'] = neg
df['Positive'] = pos
df['Neutral'] = neu
df

In [None]:
# Write the labelled tweets to an Excel file, with the header row and without the index column.
# NOTE(review): the file name contains a space before ".xlsx" — confirm it is intentional.
df.to_excel(r'Labeled_Tweets_By_Religion .xlsx', index = False, header = True)