In [1]:
import nltk
import random
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from textblob.classifiers import NaiveBayesClassifier
import numpy as np
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
pd.set_option('display.max_columns', 10000)
##Explore this blog: https://jakevdp.github.io/PythonDataScienceHandbook/05.08-random-forests.html for visualisation ideas

In [2]:
#Loads rated tweets into dataframe
df = pd.read_excel("C:/Users/ertur/Documents/Work/Workwork/ARUK/Submission - JMIR Aging/Revisions/Categorised tweets 1500.xlsx", converters={'Tweet':str,'Theme':int})

In [3]:
df = df.rename(columns = {'Tweet':'body_text', 'Theme':'label'})

In [4]:
df.shape

(1500, 2)

In [5]:
#removing cases where rating is missing
df = df.dropna()
df.shape

(1497, 2)

In [6]:
#obtaining sentiment and subjectivity
def sentAnal(df):
    for index, row in df.iterrows():
        temp = TextBlob(row['body_text'])
        df.loc[index,'Sentiment'] = temp.sentiment.polarity
        df.loc[index,'Subjectivity'] = temp.sentiment.subjectivity
    return df

In [7]:
df = sentAnal(df)

In [8]:
#removing tweets rated as uncertain or unknown
themes=[1,2,3,4,5,6]
df = df[df.label.isin(themes)]
df.shape

(1414, 4)

In [9]:
#converting assigned themes into corresponding rating of stigmatising and non-stigmatising
theme_map = {1:0, 2:0, 3:0, 4:1, 5:1, 6:1}
df['stig_label'] = df.label.map(theme_map)
df = df.drop('label', axis = 1)

In [10]:
#literature defined features are generated
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

df['body_len'] = df['body_text'].apply(lambda x: len(x) - x.count(" "))
df['punct%'] = df['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

# Average Word Length. simply take the sum of the length of all the words and divide it by the total length of the tweet as defined in function above
df['avg_word'] = df['body_text'].apply(lambda x: avg_word(x))

# Number of Words in tweet
df['word_count'] = df['body_text'].apply(lambda x: len(str(x).split(" ")))

# Number of characters. Here, we calculate the number of characters in each tweet. This is done by calculating the length of the tweet.
df['char_count'] = df['body_text'].str.len() ## this also includes spaces

# number of special characters like hashtags. we make use of the ‘starts with’ function because hashtags (or mentions) always appear at the beginning of a word.
df['hastags'] = df['body_text'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))

# number of numerics in tweet
df['numerics'] = df['body_text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

# number of UPPERCASE words. Anger or rage is quite often expressed by writing in UPPERCASE words which makes this a necessary operation to identify those words.
df['upper'] = df['body_text'].apply(lambda x: len([x for x in x.split() if x.isupper()]))


In [11]:
#care-partner defined features are generated
#senile
Search_for_These_values = ['senile', 'SENILE'] 
pattern = '|'.join(Search_for_These_values) 
df['senile'] = df['body_text'].str.contains(pattern)
df['senile'] = df['senile'].map({True: 1, False: 0})
#demented
Search_for_These_values = ['demented', 'DEMENTED'] 
pattern = '|'.join(Search_for_These_values) 
df['demented'] = df['body_text'].str.contains(pattern)
df['demented'] = df['demented'].map({True: 1, False: 0})
#donald trump
Search_for_These_values = ['donald', 'trump', 'DONALD', 'TRUMP', '@realDonaldTrump'] 
pattern = '|'.join(Search_for_These_values) 
df['donaldtrump'] = df['body_text'].str.contains(pattern)
df['donaldtrump'] = df['donaldtrump'].map({True: 1, False: 0})
#nancypelosi
Search_for_These_values = ['nancy', 'pelosi', 'NANCY', 'PELOSI'] 
pattern = '|'.join(Search_for_These_values) 
df['nancypelosi'] = df['body_text'].str.contains(pattern)
df['nancypelosi'] = df['nancypelosi'].map({True: 1, False: 0})
#deranged
Search_for_These_values = ['DERANGED', 'deranged'] 
pattern = '|'.join(Search_for_These_values) 
df['DERANGED'] = df['body_text'].str.contains(pattern)
df['DERANGED'] = df['DERANGED'].map({True: 1, False: 0})
#cafe
Search_for_These_values = ['cafe', 'CAFE'] 
pattern = '|'.join(Search_for_These_values) 
df['Cafe'] = df['body_text'].str.contains(pattern)
df['Cafe'] = df['Cafe'].map({True: 1, False: 0})
#insane
Search_for_These_values = ['INSANE', 'insane'] 
pattern = '|'.join(Search_for_These_values) 
df['Insane'] = df['body_text'].str.contains(pattern)
df['Insane'] = df['Insane'].map({True: 1, False: 0})
#memory
Search_for_These_values = ['MEMORY', 'memory'] 
pattern = '|'.join(Search_for_These_values) 
df['Memory'] = df['body_text'].str.contains(pattern)
df['Memory'] = df['Memory'].map({True: 1, False: 0})
#research
Search_for_These_values = ['research', 'RESEARCH'] 
pattern = '|'.join(Search_for_These_values) 
df['Research'] = df['body_text'].str.contains(pattern)
df['Research'] = df['Research'].map({True: 1, False: 0})
#imbecile
Search_for_These_values = ['imbecile', 'IMBECILE'] 
pattern = '|'.join(Search_for_These_values) 
df['Imbecile'] = df['body_text'].str.contains(pattern)
df['Imbecile'] = df['Imbecile'].map({True: 1, False: 0})
#loon
Search_for_These_values = ['loon', 'LOON'] 
pattern = '|'.join(Search_for_These_values) 
df['Loon'] = df['body_text'].str.contains(pattern)
df['Loon'] = df['Loon'].map({True: 1, False: 0})
#crazy
Search_for_These_values = ['crazy', 'CRAZY'] 
pattern = '|'.join(Search_for_These_values) 
df['Crazy'] = df['body_text'].str.contains(pattern)
df['Crazy'] = df['Crazy'].map({True: 1, False: 0})
#looney bin
Search_for_These_values = ['looney', 'bin', 'LOONEY', 'BIN'] 
pattern = '|'.join(Search_for_These_values) 
df['Looney_Bin'] = df['body_text'].str.contains(pattern)
df['Looney_Bin'] = df['Looney_Bin'].map({True: 1, False: 0})
#lunatic
Search_for_These_values = ['lunatic', 'LUNATIC'] 
pattern = '|'.join(Search_for_These_values) 
df['Lunatic'] = df['body_text'].str.contains(pattern)
df['Lunatic'] = df['Lunatic'].map({True: 1, False: 0})
#unhinged
Search_for_These_values = ['unhinged', 'UNHINGED'] 
pattern = '|'.join(Search_for_These_values) 
df['Unhinged'] = df['body_text'].str.contains(pattern)
df['Unhinged'] = df['Unhinged'].map({True: 1, False: 0})
#senility
Search_for_These_values = ['senility', 'SENILITY'] 
pattern = '|'.join(Search_for_These_values) 
df['Senility'] = df['body_text'].str.contains(pattern)
df['Senility'] = df['Senility'].map({True: 1, False: 0})
# URL
Search_for_These_values = ['https'] 
pattern = '|'.join(Search_for_These_values) 
df['Link'] = df['body_text'].str.contains(pattern)
df['Link'] = df['Link'].map({True: 1, False: 0})
#caregiver
Search_for_These_values = ['caregiver', 'CAREGIVER'] 
pattern = '|'.join(Search_for_These_values) 
df['Caregiver'] = df['body_text'].str.contains(pattern)
df['Caregiver'] = df['Caregiver'].map({True: 1, False: 0})

In [12]:
df.shape

(1414, 30)

In [13]:
cols = df[df.columns.difference(["stig_label", "body_text"])].columns

In [17]:

from sklearn.feature_selection import SelectFwe
df_new = SelectFwe(alpha=0.05).fit_transform(df[cols], df['stig_label'])
df_new.shape

(1414, 17)

In [18]:


fixer = {}
for number in range(17):
    fixer[number] = []
for row in df_new:
    place = 0
    for column in row: 
        fixer[place].append(float(column))
        place += 1
        

        
        

In [19]:
featurechecker = df[df.columns.difference(["stig_label", "body_text"])]

In [20]:
kept_features =[]
for column in featurechecker:
    iterable = [float(i) for i in featurechecker[column]]
    samesofar = 0
    for othercolumn in fixer:
        if samesofar == 1414:
            kept_features.append(column)
            break
        else:
            for number in range(1414):
                if iterable[number] == fixer[othercolumn][number]:
                    samesofar += 1
                    pass
                else:
                    samesofar = 0
                    break
            
                
   
        

In [22]:

kept_features.append('body_text')
kept_features.append('stig_label')
kept_features

['Caregiver',
 'Crazy',
 'Link',
 'Memory',
 'Research',
 'Senility',
 'Sentiment',
 'avg_word',
 'body_len',
 'char_count',
 'demented',
 'donaldtrump',
 'hastags',
 'numerics',
 'punct%',
 'senile',
 'body_text',
 'stig_label']