In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import pickle

In [2]:
import nltk
nltk.download('stopwords')
from nltk.util import pr
stemmer = nltk.SnowballStemmer('english')
from nltk.corpus import stopwords
import string
stopword = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df = pd.read_csv('./Depression_Text.csv')
df.head()

Unnamed: 0,text,class,scale
0,Ex Wife Threatening SuicideRecently I left my ...,depressed,4.0
1,i need helpjust help me im crying so hard,depressed,2.0
2,Honetly idkI dont know what im even doing here...,depressed,5.0
3,[Trigger warning] Excuse for self inflicted bu...,depressed,4.0
4,It ends tonight.I can’t do it anymore. \nI quit.,depressed,4.0


In [4]:
df = df[['text','scale']]

In [5]:
df = df.sample(frac = 1)

In [6]:
df.head()

Unnamed: 0,text,scale
271,I’ve tried to reach outI have tried countless ...,3.0
1626,Lost all future prospects and have been on the...,4.0
2746,its just not worth it anymoreShe never asks me...,5.0
942,Why are people mean too me..I try too be nice ...,3.0
4490,I have a rare disease that’s called crippling ...,1.0


In [7]:
df.groupby('scale').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
scale,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0.0,1402,1398,after 2 month i bought new tcl 32 inch led tv ...,3
1.0,1522,1522,I have a rare disease that’s called crippling ...,1
2.0,820,820,Going to kill myself tonightNot even sure why ...,1
3.0,1078,1078,I’ve tried to reach outI have tried countless ...,1
4.0,883,883,Lost all future prospects and have been on the...,1
5.0,2087,2087,its just not worth it anymoreShe never asks me...,1
6.0,4,4,Weird DayI slept in today and missed a few cal...,1


In [8]:
df = df.replace(6,5)

In [9]:
df =  df.fillna(0)

In [10]:
df.groupby('scale').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
scale,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0.0,1404,1400,after 2 month i bought new tcl 32 inch led tv ...,3
1.0,1522,1522,I have a rare disease that’s called crippling ...,1
2.0,820,820,Going to kill myself tonightNot even sure why ...,1
3.0,1078,1078,I’ve tried to reach outI have tried countless ...,1
4.0,883,883,Lost all future prospects and have been on the...,1
5.0,2091,2091,its just not worth it anymoreShe never asks me...,1


In [11]:
import re

In [12]:
def clean(text):
  text = str(text).lower()
  text = re.sub('\[.*?\]','',text)
  text = re.sub('https?://\S+|www\.\S+','',text)
  text = re.sub('<.*?>+','',text)
  text = re.sub('[%s]' % re.escape(string.punctuation),'',text)
  text = re.sub('\n','',text)
  text = re.sub('\w*\d\w*','',text)
  text = [word for word in text.split(' ') if word not in stopword] 
  text = " ".join(text)
  text = [stemmer.stem(word) for word in text.split(' ')]
  text = " ".join(text)
  return text

df['text'] = df["text"].apply(clean)
print(df.head())

                                                   text  scale
271   i'v tri reach outi tri countless time tell fri...    3.0
1626  lost futur prospect edg past daystldr titlehel...    4.0
2746  worth anymoresh never ask what wrong anymor gu...    5.0
942   peopl mean mei tri nice good person realli chi...    3.0
4490  rare diseas that call crippl depress cure awar...    1.0


### DecisionTreeClassifier

In [13]:
x = np.array(df['text'])
y = np.array(df['scale'])

cv = CountVectorizer()
x = cv.fit_transform(x)
X_train , X_test , y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=42)
clf = DecisionTreeClassifier()
clf.fit(X_train,y_train)

In [14]:
y_pred = clf.predict(X_test)

In [15]:
from sklearn.metrics import accuracy_score , f1_score , precision_score , recall_score , confusion_matrix, ConfusionMatrixDisplay , RocCurveDisplay 
print("Confusion matrix\n")
print(pd.crosstab(pd.Series(y_test,name="Actual"),pd.Series(y_pred,name="Predicted")))
def get_metrics(y_test,y_predicted):
    accuracy = accuracy_score(y_test,y_predicted)
    precision = precision_score(y_test,y_predicted,average="weighted")
    recall = recall_score(y_test,y_predicted,average="weighted")
    f1 = f1_score(y_test,y_predicted,average="weighted")
    return accuracy , precision,recall,f1
acuracy,precision,recall,f1 = get_metrics(y_test,y_pred)
print("accuracy = %.3f \nprecision =%.3f \nrecall =%.3f \nf1 =%.3f" % (acuracy,precision,recall,f1))

Confusion matrix

Predicted  0.0  1.0  2.0  3.0  4.0  5.0
Actual                                 
0.0        285   94    2   11    9   19
1.0        105  239   16   30   13   49
2.0          6   25   30   45   39  108
3.0         20   29   45   54   49  122
4.0         10   27   38   44   39   98
5.0         30   84   74  116   82  254
accuracy = 0.385 
precision =0.371 
recall =0.385 
f1 =0.377


In [None]:
cm = confusion_matrix(y_test,y_pred)
cmp = ConfusionMatrixDisplay(cm,display_labels=['non-depressed','Scale 1' , 'Scale 2', 'Scale 3' , 'Scale 4' , 'Scale 5'])
cmp = cmp.plot(cmap="Blues")
plt.show()

### multinomial naive Bayes

In [18]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha = 0.1)
clf.fit(X_train,y_train)

In [19]:
y_pred = clf.predict(X_test)

In [20]:
print("Confusion matrix\n")
print(pd.crosstab(pd.Series(y_test,name="Actual"),pd.Series(y_pred,name="Predicted")))
acuracy,precision,recall,f1 = get_metrics(y_test,y_pred)
print("accuracy = %.3f \nprecision =%.3f \nrecall =%.3f \nf1 =%.3f" % (acuracy,precision,recall,f1))

Confusion matrix

Predicted  0.0  1.0  2.0  3.0  4.0  5.0
Actual                                 
0.0        249   92    8   11   12   48
1.0         79  259   20   21   18   55
2.0          3    4   28   47   46  125
3.0          7    7   44   57   61  143
4.0          4   16   32   54   44  106
5.0         18   17   92  111  118  284
accuracy = 0.394 
precision =0.408 
recall =0.394 
f1 =0.399


In [None]:
cm = confusion_matrix(y_test,y_pred)
cmp = ConfusionMatrixDisplay(cm,display_labels=['non-depressed','Scale 1' , 'Scale 2', 'Scale 3' , 'Scale 4' , 'Scale 5'])
cmp = cmp.plot(cmap="Blues")
plt.show()

In [None]:
from sklearn.model_selection import validation_curve

parameter_range = np.arange(0.1, 1, 0.1)

train_score, test_score = validation_curve(MultinomialNB(), x, y,
                                       param_name = "alpha",
                                       param_range = parameter_range,
                                        cv = 5, scoring = "accuracy")
 
# Calculating mean and standard deviation of training score
mean_train_score = np.mean(train_score, axis = 1)
std_train_score = np.std(train_score, axis = 1)
 
# Calculating mean and standard deviation of testing score
mean_test_score = np.mean(test_score, axis = 1)
std_test_score = np.std(test_score, axis = 1)
 
# Plot mean accuracy scores for training and testing scores
plt.plot(parameter_range, mean_train_score,
     label = "Training Accuracy", color = 'b')
plt.plot(parameter_range, mean_test_score,
   label = "Validation Accuracy", color = 'g')
 
# Creating the plot
plt.title("Accuracy Curve with Multinomial Naive Bayes Classifier")
plt.xlabel("alpha")
plt.ylabel("Accuracy")
plt.tight_layout()
plt.legend(loc = 'best')
plt.show() 

### KNN

In [23]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5,metric = 'minkowski',p = 2)
classifier.fit(X_train,y_train)

In [24]:
y_pred = classifier.predict(X_test)

In [25]:
print("Confusion matrix\n")
print(pd.crosstab(pd.Series(y_test,name="Actual"),pd.Series(y_pred,name="Predicted")))
acuracy,precision,recall,f1 = get_metrics(y_test,y_pred)
print("accuracy = %.3f \nprecision =%.3f \nrecall =%.3f \nf1 =%.3f" % (acuracy,precision,recall,f1))

Confusion matrix

Predicted  0.0  1.0  2.0  3.0  4.0  5.0
Actual                                 
0.0        285   44   73    5    2   11
1.0        192  127   68   17    2   46
2.0         44   40   27   31   29   82
3.0         58   73   37   30   36   85
4.0         58   38   23   27   20   90
5.0        118  114   83   73   62  190
accuracy = 0.290 
precision =0.273 
recall =0.290 
f1 =0.271


In [None]:
cm = confusion_matrix(y_test,y_pred)
cmp = ConfusionMatrixDisplay(cm,display_labels=['non-depressed','Scale 1' , 'Scale 2', 'Scale 3' , 'Scale 4' , 'Scale 5'])
cmp = cmp.plot(cmap="Blues")
plt.show()

In [None]:
parameter_range = np.arange(3, 6, 1)

train_score, test_score = validation_curve(KNeighborsClassifier(metric = 'minkowski',p = 2), x, y,
                                       param_name = "n_neighbors",
                                       param_range = parameter_range,
                                        cv = 5, scoring = "accuracy")
 
# Calculating mean and standard deviation of training score
mean_train_score = np.mean(train_score, axis = 1)
std_train_score = np.std(train_score, axis = 1)
 
# Calculating mean and standard deviation of testing score
mean_test_score = np.mean(test_score, axis = 1)
std_test_score = np.std(test_score, axis = 1)
 
# Plot mean accuracy scores for training and testing scores
plt.plot(parameter_range, mean_train_score,
     label = "Training Accuracy", color = 'b')
plt.plot(parameter_range, mean_test_score,
   label = "Validation Accuracy", color = 'g')
 
# Creating the plot
plt.title("Accuracy Curve with k_nearest_neighbors Classifier")
plt.xlabel("n neighbors")
plt.ylabel("Accuracy")
plt.tight_layout()
plt.legend(loc = 'best')
plt.show() 

### SVM

In [28]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf',random_state=0,probability=True)
classifier.fit(X_train,y_train)

In [29]:
y_pred = classifier.predict(X_test)

In [30]:
print("Confusion matrix\n")
print(pd.crosstab(pd.Series(y_test,name="Actual"),pd.Series(y_pred,name="Predicted")))
acuracy,precision,recall,f1 = get_metrics(y_test,y_pred)
print("accuracy = %.3f \nprecision =%.3f \nrecall =%.3f \nf1 =%.3f" % (acuracy,precision,recall,f1))

Confusion matrix

Predicted  0.0  1.0  5.0
Actual                  
0.0        335   67   18
1.0        176  228   48
2.0         26   26  201
3.0         28   32  259
4.0         22   26  208
5.0         61   73  506
accuracy = 0.457 
precision =0.302 
recall =0.457 
f1 =0.357


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
cm = confusion_matrix(y_test,y_pred)
cmp = ConfusionMatrixDisplay(cm,display_labels=['non-depressed','Scale 1' , 'Scale 2', 'Scale 3' , 'Scale 4' , 'Scale 5'])
cmp = cmp.plot(cmap="Blues")
plt.show()

### Random Forest

In [31]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10,criterion = 'entropy',random_state = 0)
classifier.fit(X_train,y_train)

In [32]:
y_pred = classifier.predict(X_test)

In [33]:
print("Confusion matrix\n")
print(pd.crosstab(pd.Series(y_test,name="Actual"),pd.Series(y_pred,name="Predicted")))
acuracy,precision,recall,f1 = get_metrics(y_test,y_pred)
print("accuracy = %.3f \nprecision =%.3f \nrecall =%.3f \nf1 =%.3f" % (acuracy,precision,recall,f1))

Confusion matrix

Predicted  0.0  1.0  2.0  3.0  4.0  5.0
Actual                                 
0.0        296   80    6    5    4   29
1.0        149  200   11   15    9   68
2.0         13   32   20   32   26  130
3.0         32   26   21   41   17  182
4.0         15   37   24   37   16  127
5.0         54   83   38   90   49  326
accuracy = 0.384 
precision =0.341 
recall =0.384 
f1 =0.354


In [None]:
cm = confusion_matrix(y_test,y_pred)
cmp = ConfusionMatrixDisplay(cm,display_labels=['non-depressed','Scale 1' , 'Scale 2', 'Scale 3' , 'Scale 4' , 'Scale 5'])
cmp = cmp.plot(cmap="Blues")
plt.show()

In [None]:
parameter_range = np.arange(3, 15, 1)

train_score, test_score = validation_curve(RandomForestClassifier(criterion = 'entropy',random_state = 0), x, y,
                                       param_name = "n_estimators",
                                       param_range = parameter_range,
                                        cv = 5, scoring = "accuracy")
 
# Calculating mean and standard deviation of training score
mean_train_score = np.mean(train_score, axis = 1)
std_train_score = np.std(train_score, axis = 1)
 
# Calculating mean and standard deviation of testing score
mean_test_score = np.mean(test_score, axis = 1)
std_test_score = np.std(test_score, axis = 1)
 
# Plot mean accuracy scores for training and testing scores
plt.plot(parameter_range, mean_train_score,
     label = "Training Accuracy", color = 'b')
plt.plot(parameter_range, mean_test_score,
   label = "Validation Accuracy", color = 'g')
 
# Creating the plot
plt.title("Accuracy Curve with random_forest classifier")
plt.xlabel("n_estimators")
plt.ylabel("Accuracy")
plt.tight_layout()
plt.legend(loc = 'best')
plt.show()

### Saving build model

In [59]:
Pkl_Filename = "BOW_Scaled_Model.pkl" 

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(classifier, file)

### Adding UI Component

#### Text UI

In [34]:
import gradio as gr

In [35]:
labels = ['non-depressed','Scale 1' , 'Scale 2', 'Scale 3' , 'Scale 4' , 'Scale 5']

def getPrediction(Text):
        Text = clean(Text)
        X = cv.transform([Text])
        prediction = classifier.predict_proba(X)
        return {labels[0]:prediction[0][0] ,labels[1]:prediction[0][1] ,labels[2]:prediction[0][2] ,labels[3]:prediction[0][3] ,labels[4]:prediction[0][4] ,labels[5]:prediction[0][5]}

        


In [36]:
getPrediction("I am very unsatisfied.")

{'non-depressed': 0.3,
 'Scale 1': 0.4,
 'Scale 2': 0.1,
 'Scale 3': 0.2,
 'Scale 4': 0.0,
 'Scale 5': 0.0}

In [None]:
iface = gr.Interface(fn=getPrediction,inputs = "text",outputs="label",title="Depression Text Classification")
iface.launch(share=True)

#### Image UI

In [38]:
import pytesseract
import PIL
import PIL.Image
import cv2

In [39]:
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

In [40]:
labels = ['non-depressed','Scale 1' , 'Scale 2', 'Scale 3' , 'Scale 4' , 'Scale 5']
my_config = r"--psm 11 --oem 3"

def getPrediction1(img):
        # print(img)
        Text = pytesseract.image_to_string(img,config=my_config)
        Text = clean(Text)
        X = cv.transform([Text])
        prediction = classifier.predict_proba(X)
        return {labels[0]:prediction[0][0] ,labels[1]:prediction[0][1] ,labels[2]:prediction[0][2] ,labels[3]:prediction[0][3] ,labels[4]:prediction[0][4] ,labels[5]:prediction[0][5]}

 

In [None]:
iface = gr.Interface(fn=getPrediction1,inputs = "image",outputs="label",title="Depression Image Classification")
iface.launch(share=True)