# Random Forest Classification

For COSC 325 Project WellnessWatch

## Normal vs. Other (Binary Classification)

### Step 1: Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import re
import contractions
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


RANDOM_STATE = 24

### Step 2: Read in and filter the data

In [2]:
url = "https://raw.githubusercontent.com/The-Girlies/Wellness-Watch/main/kaggle_sentiment_data.csv"
data = pd.read_csv(url)

# Remove the first column
data = data.drop(data.columns[0], axis=1)
data = data.dropna(subset=['statement', 'status'])

print(data.head())

                                           statement   status
0                                         oh my gosh  Anxiety
1  trouble sleeping, confused mind, restless hear...  Anxiety
2  All wrong, back off dear, forward doubt. Stay ...  Anxiety
3  I've shifted my focus to something else but I'...  Anxiety
4  I'm restless and restless, it's been a month n...  Anxiety


### Step 3: Quantify data 
1. Number the categories in "status"
2. Clean the text in "statement", removing punctuation, stop words, and converting text to lowercase
3. Tokenize the text in "statement" based on words

In [3]:
processed_data = data[["statement", "status"]]

# Assign the following for "status_encoded" column:
# 0 - Normal
# 1 - Other
# categories = ["Normal", "Other"]  # multiclass_categories = ['Anxiety', 'Bipolar', 'Depression', 'Normal', 'Personality disorder', 'Stress', 'Suicidal'] 
processed_data = processed_data.assign(status_encoded = [0 if status == "Normal" else 1 for status in processed_data["status"]])

print(processed_data)

                                               statement   status  \
0                                             oh my gosh  Anxiety   
1      trouble sleeping, confused mind, restless hear...  Anxiety   
2      All wrong, back off dear, forward doubt. Stay ...  Anxiety   
3      I've shifted my focus to something else but I'...  Anxiety   
4      I'm restless and restless, it's been a month n...  Anxiety   
...                                                  ...      ...   
53038  Nobody takes me seriously I’ve (24M) dealt wit...  Anxiety   
53039  selfishness  "I don't feel very good, it's lik...  Anxiety   
53040  Is there any way to sleep better? I can't slee...  Anxiety   
53041  Public speaking tips? Hi, all. I have to give ...  Anxiety   
53042  I have really bad door anxiety! It's not about...  Anxiety   

       status_encoded  
0                   1  
1                   1  
2                   1  
3                   1  
4                   1  
...               ...  
530

In [4]:
# my processing actually kinda mucks things up... spaces get removed and you end up with "words" like 'goodjealousanxiousdisappointedfeeling'
# test1 and test3 have removing punctuation except apostrophes and removing double spaces 
# test2 had the tokenization all commented out   (control... no filtering)
# test4 commented out the re.sub lines removing punctuation except apostrophes and removing double spaces 
# test5 commented out the removing punctuation except apostrophes, leaving only the removing double spaces 
# test2 and test4 and test5 had the exact same results

# therefore, double spaces do not affect the results
# need to compare removing all punctuation except and including apostrophes with each other and with control 
# test6 includes removes all punctuation, and has the one making an exception for apostrophes commented out
# test7 fixes contractions but also removes all punctuation except apostrophes and removes double spaces
# test7 produces the same results as test2 and therefore test4 and test5



# Clean the text in "statement", removing punctuation, stop words, and converting text to lowercase
# Tokenize the text in "statement", splitting the text into individual words
for s in range(len(processed_data["statement"])):
    string = processed_data.iloc[s]["statement"]
    string = contractions.fix(string)
    string = string.lower()
    # string = re.sub(r"[^\w\s]", '', string) # removes all punctuation 
    string = re.sub(r"[^\w\s']", '', string) # removes all punctuation except apostrophes
    string = re.sub(r'\s+', ' ', string) # removes double spaces
    tokenized_string = nltk.word_tokenize(string)

    processed_data.iloc[s]["statement"] = tokenized_string
    # processed_data.iloc[s]["statement"] = string
print(processed_data)

# # Word Counter in case you want to sort most commonly used words
# word2count = {} 
# for data in processed_data["statement"]: 
#     words = nltk.word_tokenize(data) 
#     for word in words: 
#         if word not in word2count.keys(): 
#             word2count[word] = 1
#         else: 
#             word2count[word] += 1

# print(word2count)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_data.iloc[s]["statement"] = tokenized_string


                                               statement   status  \
0                                             oh my gosh  Anxiety   
1      trouble sleeping, confused mind, restless hear...  Anxiety   
2      All wrong, back off dear, forward doubt. Stay ...  Anxiety   
3      I've shifted my focus to something else but I'...  Anxiety   
4      I'm restless and restless, it's been a month n...  Anxiety   
...                                                  ...      ...   
53038  Nobody takes me seriously I’ve (24M) dealt wit...  Anxiety   
53039  selfishness  "I don't feel very good, it's lik...  Anxiety   
53040  Is there any way to sleep better? I can't slee...  Anxiety   
53041  Public speaking tips? Hi, all. I have to give ...  Anxiety   
53042  I have really bad door anxiety! It's not about...  Anxiety   

       status_encoded  
0                   1  
1                   1  
2                   1  
3                   1  
4                   1  
...               ...  
530

### Step 5: Train-Test Split

In [5]:
# Assign X to "statement" or tokenized equivalent 
X = processed_data["statement"]

# Assign y to "status_encoded"
y = processed_data["status_encoded"]

# train-test-split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.2, # Proportion of the dataset to include in the test split (30% in this case)
    random_state = RANDOM_STATE, # Seed for the random number generator (for reproducibility)
    ## Stratefied sampling to ensure class balance in training and test set
    ## (This ensures class balance in the target class labels. You may ignore this for now as we will cover this later in the class)
    stratify = y
)

### Step 5: Bag-of-Words Vectorizer

In [6]:
# vectorizer = CountVectorizer(max_features=1000)  # Limit vocabulary size

# X_train_bow = vectorizer.fit_transform(train_text)  # Transform training data

# SKLearn Test for Bag of Words model

CountVec = CountVectorizer(ngram_range=(1,1), # to use bigrams ngram_range=(2,2)
                           stop_words='english')
# transform
Count_train_data = CountVec.fit_transform(X_train)
Count_test_data = CountVec.transform(X_test)

# feat_dict = CountVec.vocabulary_.keys()
# print(feat_dict)

### Step 5: Decision Tree
1. Fit data
2. Predict
3. Evaluate using accuracy_score
4. Iteratate over different values for variables such as 
    1. max_depth (e.g., 1, 3, 5, 7, 9, 11)
    2. criteria (gini vs entropy vs log_loss)
5. Determine the best input for parameters max_depth and criteria

In [7]:
# Train a Decision Tree model

# dt_model = DecisionTreeClassifier(random_state=42)
# dt_model.fit(Count_train_data, y_train) 
# dt_prediction = dt_model.predict(Count_test_data)
# dt_accuracy = accuracy_score(y_test, dt_prediction)
# print(dt_accuracy)


# Test multiple hyperparameters for decision tree model

# Create a dictionary for store the choices for hyperparameter values:
hyper_params_tree = {
    "max_depth": [50, 55, 60, 65, 70],
    "criterion": ["log_loss"]
}

# Initialize the list to store accuracy scores:
scores_tree = {
    "log_loss": []
}

for criterion in hyper_params_tree["criterion"]:
    accuracies = []                    # Initialize a placeholder for results of current criterion
    for depth in hyper_params_tree["max_depth"]:
        tree_model = DecisionTreeClassifier(criterion=criterion, max_depth=depth, random_state=RANDOM_STATE)   # Define tree model based on current hyperparameter combination
        tree_model.fit(Count_train_data, y_train) # Fit the tree
        y_pred = tree_model.predict(Count_test_data) # Predict using the tree

        accuracies.append(accuracy_score(y_test, y_pred)) # Calculate accuracy

    # Collect accuracy score results for this configuration of hyperparameters:
    scores_tree[criterion] = accuracies
    
### END SOLUTION

scores_tree

{'log_loss': [0.9085128594476606,
  0.9078485337382557,
  0.9069944006833064,
  0.9113599696308247,
  0.9111701622852805]}

### Step 6: Random Forest Classifier
1. Fit data
2. Predict
3. Evaluate using accuracy_score
4. Iteratate over different values for variables such as 
    1. The number of trees 
    2. max_depth (e.g., 1, 3, 5, 7, 9, 11)
    3. criteria (gini vs entropy)
5. Create a final model 

In [8]:
# Train a Random Forest model

rf_model = RandomForestClassifier(criterion="log_loss", max_depth=65, random_state=RANDOM_STATE)

rf_model.fit(Count_train_data, y_train) 

rf_prediction = rf_model.predict(Count_test_data)

rf_accuracy = accuracy_score(y_test, rf_prediction)
print(rf_accuracy)



0.9212299515991269


### TODO
1. Do Random Forest + Decision Tree on Kaylee's lemmatized data! Important so that data is standardized
2. Figure out MULTI-CLASS CLASSIFICATION! This is what we need for our model to actually be useful for our purpose
3. max_features hyperparameter (can be pushed to post-midterm report) 
4. upload to github! 
5. Get finished data so we can analyze



1. Use TF-IDF instead of CountVectorizer
2. Look into Word Embeddings
3. 