# Machine Learning Approach for Depression Prediction Through Tweets

In [1]:
#!/usr/bin/env python3
import numpy as np
import pandas as pd

# load dataset

In [2]:
data = pd.read_csv("sentiment_tweets3.csv")

# Preprocessing

In [3]:
data.head()

Unnamed: 0,Index,message to examine,label (depression result)
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


In [4]:
# change long column names to short one
data.rename(columns={'message to examine':'Message',
                     'label (depression result)':'Label'}, inplace= True)

In [5]:
data.head()

Unnamed: 0,Index,Message,Label
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


In [6]:
# basically index a non-informative parameter, so we can drop index column
data.drop(columns=["Index"], axis= 1, inplace= True)

In [7]:
# yes, we didi it, lets check new data
data.head()

Unnamed: 0,Message,Label
0,just had a real good moment. i missssssssss hi...,0
1,is reading manga http://plurk.com/p/mzp1e,0
2,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,@lapcat Need to send 'em to my accountant tomo...,0
4,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


In [8]:
# lets take a view of data fromall possible ways
data.shape

(10314, 2)

In [9]:
#we have 10314 row and 2 columns

In [10]:
data.describe()

Unnamed: 0,Label
count,10314.0
mean,0.224355
std,0.417177
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10314 entries, 0 to 10313
Data columns (total 2 columns):
Message    10314 non-null object
Label      10314 non-null int64
dtypes: int64(1), object(1)
memory usage: 161.2+ KB


In [12]:
data.count()

Message    10314
Label      10314
dtype: int64

In [13]:
data['Label'].value_counts()

0    8000
1    2314
Name: Label, dtype: int64

In [14]:
# processing

In [15]:
# import all required libraries here
! pip install nltk


[33mDEPRECATION: Python 3.5 reached the end of its life on September 13th, 2020. Please upgrade your Python as Python 3.5 is no longer maintained. pip 21.0 will drop support for Python 3.5 in January 2021. pip 21.0 will remove support for this functionality.[0m
Defaulting to user installation because normal site-packages is not writeable


In [16]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
# imported regular expressions
# object stopwords is not callable here to create object
ps = PorterStemmer()



In [17]:
#Create a empty corpus to hold vector value


In [18]:
# Lets start processing
corpus = []
for i in range(0, data.shape[0]):
#    if i < 10:
#        print("\n",data['Message'][i])
    msg = data["Message"][i]
    # email addess
    msg = re.sub("\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b", "emailaddr", data["Message"][i])
    
    
    # url 
    
    msg = re.sub("https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)", "url", data['Message'][i])
    
    # Mobile no
    
    msg = re.sub("/^(\+\d{1,3}[- ]?)?\d{10}$/","mobile", data['Message'][i])
    
    # Number
    
    msg = re.sub("\d+(\.\d+)?","number",data['Message'][i])
    #Phone number
    msg = re.sub("\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b","phoneno",data['Message'][i])
    
    # punctuation
    
    msg = re.sub("[^\w\d\s]"," ", data['Message'][i])
    
    # convert to lower case
    
    msg = msg.lower()
    # craete tokens using split
    msg = msg.split()
    #print only for 10 messages
    #if i < 10:
        #print("Message-After Split: ",msg)
    
    # apply stemming on it to remove stopwords
    # porter stemmer
    
    msg = [ps.stem(word) for word in msg if not word in set(stopwords.words('english'))]
    
    #print("Message-After Stemming: ",msg)
    # preparing message with remaining tokens
    msg = " ".join(msg)
    
    # preparing a corpus
    
    corpus.append(msg)
print(corpus)

['real good moment missssssssss much', 'read manga http plurk com p mzp1e', 'comeagainjen http twitpic com 2y2lx http www youtub com watch v zogfqvh2me8', 'lapcat need send em account tomorrow oddli even refer tax support evid though', 'add myspac myspac com lookthund', 'sleepi good time tonight though', 'silkcharm nbn someon alreadi said fiber home mean least regular', '23 24ï ½c possibl today nice', 'nite twittervil workout ciao', 'danann night darlin sweet dream', 'good morn everybodi', 'final creat wordpress blog alreadi blog seattl coffe commun http tinyurl com c5uufd', 'kisha cnt get u til get frm u rememb ur top', 'nicolerichi ye rememb band awesom pleas repli', 'realli love reflect shadow', 'blueaero ooo fantasi like fantasi novel check', 'rokchic28 prob sell noth blog http snedwan com get listen band itun', 'shipovalov quot nokla connect peopl quot', 'stay late start earli good thing like job', 'kal_penn read new job congratul fantast', 'abl sleep think watch ugli betti onlin'

In [19]:
# Prepare a vector using countvectorizer

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
cv.fit(corpus)
sparse_input = cv.fit_transform(corpus)

# during model implementation we need data in dense format and current input is a sparse data
# convert it to array
input = sparse_input.toarray()

In [20]:
input

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [21]:
data.head()

Unnamed: 0,Message,Label
0,just had a real good moment. i missssssssss hi...,0
1,is reading manga http://plurk.com/p/mzp1e,0
2,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,@lapcat Need to send 'em to my accountant tomo...,0
4,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


In [22]:
# select output i.e Label 

output = data.iloc[:,-1]

In [23]:
output.head()

0    0
1    0
2    0
3    0
4    0
Name: Label, dtype: int64

In [24]:
# we have input and output ready, now split data in input and output sets
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(input, output, test_size=0.2, random_state=0)

In [25]:
xtrain.shape

(8251, 19327)

In [26]:
ytrain.shape

(8251,)

In [27]:
xtest.shape

(2063, 19327)

In [28]:
ytest.shape

(2063,)

In [35]:
# Create and implement a model for prediction
from sklearn.naive_bayes import GaussianNB
model_gnb = GaussianNB()
# fit and predict values
model_gnb.fit(xtrain,ytrain)
pred = model_gnb.predict(xtest)

# Lets chec accuracy of model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

acc = accuracy_score(ytest,pred)
cm = confusion_matrix(ytest, pred)
cr = classification_report(ytest, pred)

print("Accuracy:\n ",acc)
print("Confusion Matrix:\n ",cm)
print("Classification rep[ort]:\n ",cr)

Accuracy:
  0.6204556471158507
Confusion Matrix:
  [[850 723]
 [ 60 430]]
Classification rep[ort]:
                precision    recall  f1-score   support

           0       0.93      0.54      0.68      1573
           1       0.37      0.88      0.52       490

    accuracy                           0.62      2063
   macro avg       0.65      0.71      0.60      2063
weighted avg       0.80      0.62      0.65      2063



In [36]:
# Create and implement a model for prediction
from sklearn.tree import DecisionTreeClassifier
model_dt = DecisionTreeClassifier()
# fit and predict values
model_dt.fit(xtrain,ytrain)
pred = model_dt.predict(xtest)

# Lets chec accuracy of model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

acc = accuracy_score(ytest,pred)
cm = confusion_matrix(ytest, pred)
cr = classification_report(ytest, pred)

print("Accuracy:\n ",acc)
print("Confusion Matrix:\n ",cm)
print("Classification rep[ort]:\n ",cr)

Accuracy:
  0.9970916141541445
Confusion Matrix:
  [[1573    0]
 [   6  484]]
Classification rep[ort]:
                precision    recall  f1-score   support

           0       1.00      1.00      1.00      1573
           1       1.00      0.99      0.99       490

    accuracy                           1.00      2063
   macro avg       1.00      0.99      1.00      2063
weighted avg       1.00      1.00      1.00      2063



In [37]:
# Create and implement a model for prediction
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier()
# fit and predict values
model_rf.fit(xtrain,ytrain)
pred = model_rf.predict(xtest)

# Lets chec accuracy of model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

acc = accuracy_score(ytest,pred)
cm = confusion_matrix(ytest, pred)
cr = classification_report(ytest, pred)

print("Accuracy:\n ",acc)
print("Confusion Matrix:\n ",cm)
print("Classification rep[ort]:\n ",cr)

Accuracy:
  0.9966068831798351
Confusion Matrix:
  [[1572    1]
 [   6  484]]
Classification rep[ort]:
                precision    recall  f1-score   support

           0       1.00      1.00      1.00      1573
           1       1.00      0.99      0.99       490

    accuracy                           1.00      2063
   macro avg       1.00      0.99      1.00      2063
weighted avg       1.00      1.00      1.00      2063

