In [12]:
import numpy as np
import pandas as pd
import re
import nltk

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
data = pd.read_csv("/content/Sheet_1.csv")

# Data cleaning

In [3]:
data.head()

Unnamed: 0,response_id,class,response_text,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,response_1,not_flagged,I try and avoid this sort of conflict,,,,,
1,response_2,flagged,Had a friend open up to me about his mental ad...,,,,,
2,response_3,flagged,I saved a girl from suicide once. She was goin...,,,,,
3,response_4,not_flagged,i cant think of one really...i think i may hav...,,,,,
4,response_5,not_flagged,Only really one friend who doesn't fit into th...,,,,,


In [4]:
df = data[["response_text","class"]]

In [5]:
df.head()

Unnamed: 0,response_text,class
0,I try and avoid this sort of conflict,not_flagged
1,Had a friend open up to me about his mental ad...,flagged
2,I saved a girl from suicide once. She was goin...,flagged
3,i cant think of one really...i think i may hav...,not_flagged
4,Only really one friend who doesn't fit into th...,not_flagged


In [7]:
df["class"].unique()

array(['not_flagged', 'flagged'], dtype=object)

In [9]:
df["label"] = LabelEncoder().fit_transform(df["class"])

In [10]:
df.head()

Unnamed: 0,response_text,class,label
0,I try and avoid this sort of conflict,not_flagged,1
1,Had a friend open up to me about his mental ad...,flagged,0
2,I saved a girl from suicide once. She was goin...,flagged,0
3,i cant think of one really...i think i may hav...,not_flagged,1
4,Only really one friend who doesn't fit into th...,not_flagged,1


In [11]:
df.drop(columns=["class"],inplace=True)

In [14]:
corpus = []
lemmatizer = WordNetLemmatizer()
for text in df["response_text"]:
  text = re.sub("[^a-zA-z0-9]"," ",text)
  text = text.lower()
  text = word_tokenize(text)
  text = [lemmatizer.lemmatize(word) for word in text if word not in stopwords.words("english")]

  text = " ".join(text)
  corpus.append(text)



In [15]:
corpus

['try avoid sort conflict',
 'friend open mental addiction weed taking life making depressed',
 'saved girl suicide going swallow bunch pill talked calm loving way',
 'cant think one really think may indirectly',
 'really one friend fit category therapist call spiraling anyway pretty much call time frustrated something boyfriend ask logical would fight would call crazy asks ok say please said hand remote',
 'couple year ago friend going switch school low self esteem helped overcome shit',
 'roommate going death loss gf anything get bedroom',
 'couple friend could say friend quite severe depression emotional problem helped eventually relationship started suffer result personal problem',
 'listened someone talk relationship trouble offered advice personal experience',
 'always listen comforted sister lost virgity night walked boyfriend cutting parent found threw house part simply bring supportive focus',
 'took week work packed car picked friend verge losing went camping surfing week par

# TFIDF

In [18]:
tfidf = TfidfVectorizer(binary=True,max_features=2500)
x = tfidf.fit_transform(corpus).toarray()


In [23]:
x.shape

(80, 528)

In [20]:
x[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.53740936,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [21]:
y = df["label"]

In [25]:
y


Unnamed: 0,label
0,1
1,0
2,0
3,1
4,1
...,...
75,1
76,0
77,1
78,0


# Train test split

In [26]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

# model training

In [27]:
classifier = RandomForestClassifier()
classifier.fit(x_train,y_train)

In [28]:
pred = classifier.predict(x_test)

In [29]:
print(accuracy_score(y_test,pred))

0.875


In [31]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.88      1.00      0.93        14

    accuracy                           0.88        16
   macro avg       0.44      0.50      0.47        16
weighted avg       0.77      0.88      0.82        16

