<a href="https://colab.research.google.com/github/Ruoro/cleaning_practice/blob/main/PreProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import the libraries.

In [None]:
import pandas as pd
import numpy as np
import re
import string

## methods and stopwords preprocessing libraries
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import warnings 
warnings.filterwarnings("ignore")

## downlead a stopword set

In [None]:
import nltk 
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Load data 

In [None]:
df = pd.read_csv('/content/MentalHealth.new.csv')

In [None]:
# preview Data
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,user,tweet,location,description,friends_count,followers_count,statuses_count,created_at,retweet_count,hashtags,disorder
0,0,0,TheKenyanPost,SHOCK as a young man climbs on top of a hospit...,"Nairobi, Kenya","News, Politics, Entertainment, Gossip, Feature...",434,52951,356584,2021-08-17 10:33:55,0,[],depression
1,1,1,Goodguy_254,People tend to hide depression under drinking ...,"Nairobi, Kenya",poet\n writer🧾\nsoftware developer🖥️\n program...,2074,2257,1284,2021-08-17 10:20:30,0,[],depression
2,2,2,AlfredLete,@Lily_nganga Depression,Nairobi,"do what is right,not what is easy",8515,13075,37974,2021-08-17 09:32:49,0,[],depression
3,3,3,GeraldNgaoPk7,Itumbi has subjected bloggers in to depression...,"Nairobi, Kenya",Pan-Africanist||NeoMarxistRevolutionary||Inter...,21503,20437,164694,2021-08-17 08:56:26,2,"[{'text': 'ItumbiGhostWorkers', 'indices': [77...",depression
4,4,4,swyma304,Mental health isn’t just anxiety &amp; depress...,"Nairobi, Kenya",Psychiatry Resident. Certified Cognitive Behav...,93,206,594,2021-08-17 08:18:20,1,[],depression


Remain with only the relevant columns


In [None]:
# relevant columns are the tweet and disorder columns
df = df[['tweet', 'disorder']]
df.head()

Unnamed: 0,tweet,disorder
0,SHOCK as a young man climbs on top of a hospit...,depression
1,People tend to hide depression under drinking ...,depression
2,@Lily_nganga Depression,depression
3,Itumbi has subjected bloggers in to depression...,depression
4,Mental health isn’t just anxiety &amp; depress...,depression


## creating a test sample from the data

In [None]:
# crerating a test sample

test_df = df.sample(70)
print("initial df size : ", df.shape)
df.drop(test_df.index, inplace=True)
print("Train df size : ", df.shape)


initial df size :  (742, 2)
Train df size :  (672, 2)


## Functions to preprocess the data

- lemmatizing
- stemming
- to lowercase
- remove punctuation



In [None]:
def preprocessing_task (tweet):
  #convert to lower case
  tweet = tweet.lower()
  # remove any urls
  tweet = re.sub(r'http\S+|www\S+|https\S+', "", tweet, flags = re.MULTILINE)
  
  # Remove @ references and # from tweet
  tweet = re.sub(r"\@\w+|\#", "", tweet)

  # remove punctuation
  tweet = tweet.translate(str.maketrans("", "", string.punctuation))

  #remove stopwords
  tweet_token = word_tokenize(tweet)
  filtered_words = [word for word in tweet_token if word not in stop_words]

  # Stemming the words
  ps = PorterStemmer()
  stemmed_words = [ps.stem(w) for w in filtered_words]

  # lematizing
  lemmatizer = WordNetLemmatizer()
  lema_words = [lemmatizer.lemmatize(w, pos = "a") for w in stemmed_words]

  return " ". join(lema_words)

# Define function to remove emojis from the tweets
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)


In [None]:
def finish(string):
    return preprocessing_task(remove_emoji(string))

df['clean_text'] = df['tweet'].apply(lambda x: finish(x))
df.head()

Unnamed: 0,tweet,disorder,clean_text
0,SHOCK as a young man climbs on top of a hospit...,depression,shock young man climb top hospit build kahawa ...
1,People tend to hide depression under drinking ...,depression,peopl tend hide depress drink drug take time f...
2,@Lily_nganga Depression,depression,depress
3,Itumbi has subjected bloggers in to depression...,depression,itumbi subject blogger depress ought behind ba...
4,Mental health isn’t just anxiety &amp; depress...,depression,mental health ’ anxieti amp depress ’ nag head...


In [None]:
# Convert to list in order to preprocessing
x = df.clean_text.to_list()
x[:5]

['shock young man climb top hospit build kahawa wendani threaten commit suicid – depre…',
 'peopl tend hide depress drink drug take time find one talk',
 'depress',
 'itumbi subject blogger depress ought behind bar itumbighostwork',
 'mental health ’ anxieti amp depress ’ nag headach due ongo frustrat sharp p…']

In [None]:
df.isna().sum()

tweet         0
disorder      0
clean_text    0
dtype: int64

In [None]:
df.disorder.value_counts()

depression       328
anxiety          248
suicidal          50
bipolar           16
schizophrenia     12
dementia          10
paranoia           8
Name: disorder, dtype: int64

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df.disorder = le.fit_transform(df.disorder)

df.disorder.value_counts()

3    328
0    248
6     50
1     16
5     12
2     10
4      8
Name: disorder, dtype: int64

In [None]:
illness =  {'depression' : 3,
         'anxiety': 0 , 
         'suicidal': 6, 
         'schizophrenia' : 5,
         'bipolar' : 1,
         'dimentia': 2,
         'paranoia': 4}
illness

{'anxiety': 0,
 'bipolar': 1,
 'depression': 3,
 'dimentia': 2,
 'paranoia': 4,
 'schizophrenia': 5,
 'suicidal': 6}

## Augmenting the text to get more data for testing our models


In [None]:
%%capture
!pip install nlpaug
!pip install transformers

In [None]:
# Augmentation Libraries
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf
from tqdm import tqdm
from sklearn.utils import shuffle

from nlpaug.util import Action

In [None]:
#Split the train and test data
from sklearn.model_selection import train_test_split
train,valid=train_test_split(df,test_size=0.20 , stratify = df['disorder'])
train.shape, valid.shape

((537, 3), (135, 3))

In [None]:
# Check the number of each disorder
train['disorder'].value_counts()

3    262
0    198
6     40
1     13
5     10
2      8
4      6
Name: disorder, dtype: int64

In [None]:
text = train.iloc[0]['tweet']
text

'#UPNEXT \n@PhilTWorship tells his story: \ngoing through divorce, depression, and how finding music in those low mome… https://t.co/PS1455slaY'

In [None]:
# ContextualWordEmbsAug : Augmenter that apply operation (word level) to textual input based on contextual word embeddings.

aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased', action="insert")
augmented_text = aug.augment(text)


print('Original text \n',text,'\n Augmented text\n', augmented_text)


Original text 
 #UPNEXT 
@PhilTWorship tells his story: 
going through divorce, depression, and how finding music in those low mome… https://t.co/PS1455slaY 
 Augmented text
 # upnext @ philtworship tells his internal story : going fresh through no divorce, raging depression, and the how finding music art in mastering those low quality mome … https : / / t.'co / 2011 ps1455slay


In [None]:
# Creating a copy of  the dataset
df1 = df.copy(deep=True)

In [None]:
import numpy as np
#For anxiety, class = 0,

# Creating augmented text data to increase our training dataset by 78 entries

def augment_text(df1,samples=78,pr=0.2):   
    aug.aug_p=pr
    new_text=[]
    
    #selecting the  class samples
    df_n=df1[df1['disorder']==0].reset_index(drop=True)

    ## data augmentation loop
    for i in tqdm(np.random.randint(0,len(df_n),samples)):
        
            text = df_n.iloc[i]['tweet']
            augmented_text = aug.augment(text)
            new_text.append(augmented_text)
    
    
    ## dataframe
    new=pd.DataFrame({'tweet':new_text,'disorder':0})
    df1=shuffle(df1.append(new).reset_index(drop=True))
    return df1
   
train = augment_text(train)
print(train.shape, '\n\n')
train.head()

100%|██████████| 78/78 [02:22<00:00,  1.82s/it]

(615, 3) 







Unnamed: 0,tweet,disorder,clean_text
122,"That story of ""Death by instalments"" people go...",3,stori death instal peopl go lot depress hide m...
329,7 warning signs to spot depression and anxiety...,3,7 warn sign spot depress anxieti workplac help...
216,Menopil Plus contains Black Cohosh and Red Clo...,0,menopil plu contain black cohosh red clover ex...
326,@mmnjug @NationAfrica Depression is worsened b...,3,depress worsen use drug dupe think youv found ...
49,@makaumutua Fighting Ruto is suicidal,6,fight ruto suicid


In [None]:
#Bipolar class = 1

# Creating augmented text data to increase our training dataset by 289 entries

def augment_text(df1,samples=289,pr=0.2):  
    aug.aug_p=pr
    new_text=[]
    
    #selecting the  class samples
    df_n=df1[df1['disorder']== 1].reset_index(drop=True)

    ## data augmentation loop
    for i in tqdm(np.random.randint(0,len(df_n),samples)):
        
            text = df_n.iloc[i]['tweet']
            augmented_text = aug.augment(text)
            new_text.append(augmented_text)
    
    
    ## dataframe
    new=pd.DataFrame({'tweet':new_text,'disorder':1})
    df1=shuffle(df1.append(new).reset_index(drop=True))
    return df1
   
train = augment_text(train)
print(train.shape, '\n\n')
train.head()

100%|██████████| 289/289 [08:30<00:00,  1.77s/it]

(904, 3) 







Unnamed: 0,tweet,disorder,clean_text
549,@WakabiBrian19 We have good counselors at man ...,3,good counselor man unit case depress
100,If you're genetically predisposed to schizophr...,5,your genet predispos schizophrenia weed may in...
239,* Changes in physical and emotional reactions;...,6,chang physic emot reaction easili frighten neg...
785,@ ke _ news skimpyy @ allak _ live weda @ yaho...,1,
6,@ChiromoHospGrp @leujonson @benson_nduta @iank...,0,…


In [None]:
#For dimentia disorder, class = 2

# Creating augmented text data to increase our training dataset by 286 entries

def augment_text(df1,samples=290,pr=0.2):   #70 aurgumented data
    aug.aug_p=pr
    new_text=[]
    
    #selecting the  class samples
    df_n=df1[df1['disorder']== 2].reset_index(drop=True)

    ## data augmentation loop
    for i in tqdm(np.random.randint(0,len(df_n),samples)):
        
            text = df_n.iloc[i]['tweet']
            augmented_text = aug.augment(text)
            new_text.append(augmented_text)
    
    
    ## dataframe
    new=pd.DataFrame({'tweet':new_text,'disorder':2})
    df1=shuffle(df1.append(new).reset_index(drop=True))
    return df1
   
train = augment_text(train)
print(train.shape, '\n\n')
train.head()

100%|██████████| 290/290 [07:32<00:00,  1.56s/it]

(1194, 3) 







Unnamed: 0,tweet,disorder,clean_text
563,Great @MwalimChurchill @Rachelshebesh #Churchi...,3,great churchillshow alway big fan depress real...
771,weather is so very bipolar,1,
77,@ chiromohospgrp @ ac edmondagaba2 @ kkwilbroa...,0,
677,black icymi : from living away with complete b...,1,
348,"i hate being bipolar, ; it ’ s basically awesome.",1,


In [None]:
# class 3 

# Creating augmented text data to increase our training dataset by 286 entries

def augment_text(df1,samples=47,pr=0.2):   #70 aurgumented data
    aug.aug_p=pr
    new_text=[]
    
    #selecting the  class samples
    df_n=df1[df1['disorder']==3].reset_index(drop=True)

    ## data augmentation loop
    for i in tqdm(np.random.randint(0,len(df_n),samples)):
        
            text = df_n.iloc[i]['tweet']
            augmented_text = aug.augment(text)
            new_text.append(augmented_text)
    
    
    ## dataframe
    new=pd.DataFrame({'tweet':new_text,'disorder':3})
    df1=shuffle(df1.append(new).reset_index(drop=True))
    return df1
   
train = augment_text(train)
print(train.shape, '\n\n')
train.head()

100%|██████████| 47/47 [01:17<00:00,  1.65s/it]

(1241, 3) 







Unnamed: 0,tweet,disorder,clean_text
1058,@ iqbalhajji + yes this city is bipolar 1 even,1,
902,bipolar weather for maybe slightly almost half...,1,
54,@crazy_kennar When history books will be writt...,3,histori book written get special mention cover...
751,dementia : connecting the from ‘ single most c...,2,
934,when we seriously attempt. to convince people ...,2,


In [None]:
#For paranoia = 4

# Creating augmented text data to increase our training dataset by 286 entries

def augment_text(df1,samples=293,pr=0.2):   #70 aurgumented data
    aug.aug_p=pr
    new_text=[]
    
    #selecting the  class samples
    df_n=df1[df1['disorder']== 4 ].reset_index(drop=True)

    ## data augmentation loop
    for i in tqdm(np.random.randint(0,len(df_n),samples)):
        
            text = df_n.iloc[i]['tweet']
            augmented_text = aug.augment(text)
            new_text.append(augmented_text)
    
    
    ## dataframe
    new=pd.DataFrame({'tweet':new_text,'disorder':4})
    df1=shuffle(df1.append(new).reset_index(drop=True))
    return df1
   
train = augment_text(train)
print(train.shape, '\n\n')
train.head()

100%|██████████| 293/293 [06:54<00:00,  1.41s/it]

(1534, 3) 







Unnamed: 0,tweet,disorder,clean_text
241,how do - you guys deal with anxiety and stress...,0,
192,sundays even have my actual anxiety rates at p...,0,
594,"Btw Struggle to get Money, Fame and Exposure w...",3,btw struggl get money fame exposur earn depres...
1504,@ ke _ le karenn miss weed causes paranoia whi...,4,
176,said raila just called everyone real who is ho...,2,


In [None]:
#For class 5 = schizophrenia

# Creating augmented text data to increase our training dataset by 286 entries

def augment_text(df1,samples=290,pr=0.2):   #70 aurgumented data
    aug.aug_p=pr
    new_text=[]
    
    #selecting the  class samples
    df_n=df1[df1['disorder']== 5 ].reset_index(drop=True)

    ## data augmentation loop
    for i in tqdm(np.random.randint(0,len(df_n),samples)):
        
            text = df_n.iloc[i]['tweet']
            augmented_text = aug.augment(text)
            new_text.append(augmented_text)
    
    
    ## dataframe
    new=pd.DataFrame({'tweet':new_text,'disorder':5})
    df1=shuffle(df1.append(new).reset_index(drop=True))
    return df1
   
train = augment_text(train)
print(train.shape, '\n\n')
train.head()

100%|██████████| 290/290 [10:30<00:00,  2.18s/it]

(1824, 3) 







Unnamed: 0,tweet,disorder,clean_text
552,@ ke _ karenn weed also causes paranoia which ...,4,
781,when we attempt solely to convince black peopl...,2,
1794,@ _ antohlibra schizophrenia transmission has ...,5,
1089,when we attempt harder to convince very people...,2,
1190,@ ke _ karenn weed commonly causes paranoia we...,4,


In [None]:
#For class 6 = suicidal

# Creating augmented text data to increase our training dataset by 286 entries

def augment_text(df1,samples=255,pr=0.2):  
    aug.aug_p=pr
    new_text=[]
    
    #selecting the  class samples
    df_n=df1[df1['disorder']== 6 ].reset_index(drop=True)

    ## data augmentation loop
    for i in tqdm(np.random.randint(0,len(df_n),samples)):
        
            text = df_n.iloc[i]['tweet']
            augmented_text = aug.augment(text)
            new_text.append(augmented_text)
    
    
    ## dataframe
    new=pd.DataFrame({'tweet':new_text,'disorder':6})
    df1=shuffle(df1.append(new).reset_index(drop=True))
    return df1
   
train = augment_text(train)
print(train.shape, '\n\n')
train.head()

In [None]:
# Check the number of each disorder
train['disorder'].value_counts()

3    309
1    302
5    300
4    299
2    298
6    295
0    276
Name: disorder, dtype: int64

In [None]:
train.to_csv('Augmented_Data.csv', index=False)

In [None]:
train = pd.read_csv('/content/Augmented_Data.csv')
train

Unnamed: 0,tweet,disorder,clean_text
0,this a bipolar changing weather policy is quit...,1,
1,@ bitchrespecter paranoia. ndio _ utamu... tru...,4,
2,Depression is real. Please check on your frien...,3,depress real pleas check friend especi one alw...
3,@ yc _ kenya @ beach chamadui @ nayakenya @ ay...,6,
4,guess i will commit i twitter suicidal if tomo...,6,
...,...,...,...
2074,nuclear power @ generation - # nextgen - modul...,4,
2075,this may also sound demeaning to people suffer...,2,
2076,boda guys wanakupea life guides appear while m...,6,
2077,@ChiromoHospGrp @EdmondAgaba2 @KKwilbroad @ali...,3,…


In [None]:
def finish(string):
    return preprocessing_task(remove_emoji(string))

train['clean_text'] = train['tweet'].apply(lambda x: finish(x))
train.head()

Unnamed: 0,tweet,disorder,clean_text
0,this a bipolar changing weather policy is quit...,1,bipolar chang weather polici quit annoy yaani ...
1,@ bitchrespecter paranoia. ndio _ utamu... tru...,4,bitchrespect paranoia ndio utamu true word sup...
2,Depression is real. Please check on your frien...,3,depress real pleas check friend especi one alw...
3,@ yc _ kenya @ beach chamadui @ nayakenya @ ay...,6,yc kenya beach chamadui nayakenya ayarhep keny...
4,guess i will commit i twitter suicidal if tomo...,6,guess commit twitter suicid tomorrow ’ get 6 1...


In [None]:
#  Defining the features and labels

X = df['clean_text']
y = df['disorder']

# Splitting the data into training and testing sets


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [None]:
# “Term Frequency — Inverse Document Frequency”. This is a technique to 
# quantify a word in documents, we generally compute a weight to each word
#  which signifies the importance of the word in the document and corpus.

# Instancetiate
tfidf = TfidfVectorizer(stop_words='english')


In [None]:
# Fit the tfidf to the data

tfidf.fit(X_train)
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [None]:
X_train_tfidf


<537x1693 sparse matrix of type '<class 'numpy.float64'>'
	with 3863 stored elements in Compressed Sparse Row format>

In [None]:
# MultinomialNB

from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_tfidf,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# Making predictions
y_pred = nb.predict(X_test_tfidf)


In [None]:
from sklearn.metrics import classification_report
classification_report(y_pred, y_test)


'              precision    recall  f1-score   support\n\n           0       0.62      0.67      0.64        43\n           1       0.00      0.00      0.00         0\n           3       0.85      0.70      0.77        92\n           5       0.00      0.00      0.00         0\n           6       0.00      0.00      0.00         0\n\n    accuracy                           0.69       135\n   macro avg       0.29      0.27      0.28       135\nweighted avg       0.78      0.69      0.73       135\n'