# Random Forest Classification

For COSC 325 Project WellnessWatch

## Normal vs. Other (Multiclass Classification)

Multiclass classification is what we need for our model to actually be useful for our purpose

### Step 1: Importing libraries

In [28]:
import numpy as np
import pandas as pd
import nltk
import re
import contractions
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


RANDOM_STATE = 24

### Step 2: Read in lemmatized data
Lemmatization is important so that our data is standardized

In [29]:
# multiclass_categories = ['Anxiety', 'Bipolar', 'Depression', 'Normal', 'Personality disorder', 'Stress', 'Suicidal'] 

processed_data = pd.read_csv("processed_data.csv")

print(processed_data.head())

                                           statement   status
0                               ['oh', 'my', 'gosh']  Anxiety
1  ['trouble', 'sleep', 'confuse', 'mind', 'restl...  Anxiety
2  ['wrong', 'back', 'off', 'dear', 'forward', 'd...  Anxiety
3  ['i', 'have', 'shift', 'my', 'focus', 'to', 's...  Anxiety
4  ['i', 'restless', 'and', 'restless', 'it', 'be...  Anxiety


### Step 3: Train-Test Split

In [30]:
# Assign X to "statement" or tokenized equivalent 
X = processed_data["statement"]

# Assign y to "status_encoded"
y = processed_data["status"]

# train-test-split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.2, # Proportion of the dataset to include in the test split (30% in this case)
    random_state = RANDOM_STATE, # Seed for the random number generator (for reproducibility)
    ## Stratefied sampling to ensure class balance in training and test set
    ## (This ensures class balance in the target class labels. You may ignore this for now as we will cover this later in the class)
    stratify = y
)

### Step 4: Bag-of-Words Vectorizer

In [31]:
# SKLearn Test for Bag of Words model

CountVec = CountVectorizer(ngram_range=(1,1), # to use bigrams ngram_range=(2,2)
                           stop_words='english')
# transform
Count_train_data = CountVec.fit_transform(X_train)
Count_test_data = CountVec.transform(X_test)


### Step 4.5: TF-IDF Vectorizer rather than Bag-of-Words

In [32]:
TFVec = TfidfVectorizer(ngram_range=(1,1), stop_words='english')

Tf_train_data = TFVec.fit_transform(X_train)
Tf_test_data = TFVec.transform(X_test)

### Step 5: Decision Tree
1. Fit data
2. Predict
3. Evaluate using accuracy_score
4. Iteratate over different values for variables such as 
    1. max_depth (e.g., 1, 3, 5, 7, 9, 11)
    2. criteria (gini vs entropy)
5. Determine the best input for parameters max_depth and criteria

In [35]:
# Train a Decision Tree model

# dt_model = DecisionTreeClassifier(random_state=42)
# dt_model.fit(Count_train_data, y_train) 
# dt_prediction = dt_model.predict(Count_test_data)
# dt_accuracy = accuracy_score(y_test, dt_prediction)
# print(dt_accuracy)


# Test multiple hyperparameters for decision tree model

# Create a dictionary for store the choices for hyperparameter values:
hyper_params_tree = {
    "max_depth": [None], # 1, 3, 5, 7, 9, 10, 11, 15, 20, 25, 30, 35, 40, 45, 50
    "criterion": ["log_loss", "gini", "entropy"]
}

# Initialize the list to store accuracy scores:
scores_tree = {
    "log_loss": [],
    "gini": [],
    "entropy": []
}

for criterion in hyper_params_tree["criterion"]:
    accuracies = []                    # Initialize a placeholder for results of current criterion
    for depth in hyper_params_tree["max_depth"]:
        tree_model = DecisionTreeClassifier(criterion=criterion, max_depth=depth, random_state=RANDOM_STATE)   # Define tree model based on current hyperparameter combination
        tree_model.fit(Tf_train_data, y_train) # Fit the tree
        y_pred = tree_model.predict(Tf_test_data) # Predict using the tree

        accuracies.append(accuracy_score(y_test, y_pred)) # Calculate accuracy

    # Collect accuracy score results for this configuration of hyperparameters:
    scores_tree[criterion] = accuracies
    
### END SOLUTION

scores_tree

{'log_loss': [0.6338616304450982],
 'gini': [0.6446806491411218],
 'entropy': [0.6338616304450982]}

### Step 6: Random Forest Classifier
1. Fit data
2. Predict
3. Evaluate using accuracy_score
4. Iteratate over different values for variables such as 
    1. The number of trees 
    2. max_depth (e.g., 1, 3, 5, 7, 9, 11)
    3. criteria (gini vs entropy)
5. Create a final model 

In [25]:
# Train a Random Forest model

rf_model = RandomForestClassifier(criterion="gini", max_depth=50, random_state=RANDOM_STATE)

rf_model.fit(Tf_train_data, y_train) 

rf_prediction = rf_model.predict(Tf_test_data)

rf_accuracy = accuracy_score(y_test, rf_prediction)
print(rf_accuracy)



0.6171585840372023


In [26]:
# Train a Decision Tree model
# This is what was used to test "Time Taken"

dt_model = DecisionTreeClassifier(criterion="entropy", max_depth=45, random_state=42)

dt_model.fit(Tf_train_data, y_train) 

dt_prediction =dt_model.predict(Tf_test_data)

dt_accuracy = accuracy_score(y_test, dt_prediction)
print(dt_accuracy)



0.6350004745183638


### TODO

3. Get finished data so we can analyze
4. max_features hyperparameter (can be pushed to post-midterm report)  
5. upload to github!
6. Look into Word Embeddings? (post-midterm report)
7. Look into other models (post-midterm report)