First, we need to extract the Reddit data (usernames and their comments) from the database.

In [3]:
# Importing necessary libraries
import os

import numpy as np
import pandas as pd
import psycopg2

In [4]:
# Query the reddit usernames and comments from the PostgreSQL database.
#
# The connection string is read from the environment so that credentials are never
# stored in the notebook, e.g.
#   export VETASSIST_DATABASE_URL="postgresql://<user>:<password>@<host>/<database>"
# NOTE: a password was previously committed in this cell; it should be rotated.
connection_string = os.environ["VETASSIST_DATABASE_URL"]

try:
    # Connect to the PostgreSQL database that contains the reddit usernames and comments
    conn = psycopg2.connect(connection_string)
    try:
        # The cursor context manager closes the cursor even if the query fails
        with conn.cursor() as cur:
            # Execute a SELECT query to fetch rows from the table
            cur.execute("SELECT * FROM reddit_usernames_comments")

            # Fetch all rows from the result set
            rows = cur.fetchall()

            # Get column names from the cursor description
            colnames = [desc[0] for desc in cur.description]
    finally:
        # psycopg2's `with connection` only ends the transaction, so close explicitly
        conn.close()
except psycopg2.Error as e:
    print("Error connecting to PostgreSQL database:", e)
    # Re-raise: every later cell needs `df`, so continuing would only fail with a NameError
    raise

# Create a pandas DataFrame from the fetched rows, keeping only the columns used below
df = pd.DataFrame(rows, columns=colnames)
df = df[["username", "comments"]]

# Display the DataFrame (pandas truncates the view to the first/last rows)
df

                  username                                           comments
0                   KR1735  Yeah this is why I am so glad I picked IM and ...
1               Persiandoc                                          Paywall ?
2            a_neurologist          I wonder if this figure counts home call.
3      DrTedPenisAstronaut  Don’t dwell on the mistakes, but learn from th...
4              Big-Fly6100  You are going to make mistakes. Just learn fro...
...                    ...                                                ...
6164            Okaythenn7  I’m a 12th grader in Hungary but i want to go ...
6165           daliadeimos  Not a vet, but a tech. I’ve been avoiding any ...
6166  Unhappy_Passenger_86  As some one who is also coming from a difficul...
6167           Daktari2018  Good for you for sticking to standards of care...
6168         Real_Use_3216  It’s no different than undergrad. School is sc...

[6169 rows x 2 columns]


Create a function that classifies each comment into one of the following categories:

1. Medical Doctor

2. Veterinarian

3. Other



In [5]:
# Function used to label each comment
def encode_label(label):
    """Assign a coarse profession category to a comment via keyword matching.

    Matching is case-insensitive and substring-based. Medical keywords are
    checked first, so a comment containing both medical and veterinary
    keywords is labelled "Medical Doctor".

    Parameters
    ----------
    label : str
        The raw comment text. Non-string values (e.g. ``None`` or NaN from a
        missing comment) are accepted and labelled "Other".

    Returns
    -------
    str
        One of "Medical Doctor", "Veterinarian" or "Other".
    """
    # Missing comments arrive as None/NaN, which have no .lower(); treat them as "Other"
    if not isinstance(label, str):
        return "Other"

    text = label.lower()  # lower-case once instead of once per keyword

    medical_keywords = ("doctor", "clinics", "consultant", "practicing")
    # NOTE(review): "vet consultants" can never decide the outcome here, because any text
    # containing it also contains "consultant" and is already labelled "Medical Doctor".
    # Confirm whether veterinary keywords were meant to take precedence.
    veterinary_keywords = ("veterinarian", "vets", "vet consultants", "vet clinic")

    if any(keyword in text for keyword in medical_keywords):
        return "Medical Doctor"
    if any(keyword in text for keyword in veterinary_keywords):
        return "Veterinarian"
    return "Other"

# Label every comment by applying encode_label row by row; the result is stored in a new 'label' column
df['label'] = df['comments'].apply(encode_label)

In [6]:
# Save the labelled data (all rows, without the index column) to CSV for modeling
df.to_csv("reddit_commentz.csv", index=False)

In [7]:
# Reload the saved CSV file; from here on `df` is the frame read back from disk
df = pd.read_csv("reddit_commentz.csv")

In [8]:
# Spot-check a single labelled row by position
df.iloc[3275]

username                                 Aggravating_Slip_566
comments    Same thing with the hair industry! I absolutel...
label                                                   Other
Name: 3275, dtype: object

In [9]:
# Drop rows that contain null values.
# DataFrame.dropna() returns a new frame rather than modifying `df`, so the result must be
# assigned back; the index is reset so positional and label-based lookups stay aligned.
df = df.dropna().reset_index(drop=True)
df

Unnamed: 0,username,comments,label
0,KR1735,Yeah this is why I am so glad I picked IM and ...,Other
1,Persiandoc,Paywall ?,Other
2,a_neurologist,I wonder if this figure counts home call.,Other
3,DrTedPenisAstronaut,"Don’t dwell on the mistakes, but learn from th...",Other
4,Big-Fly6100,You are going to make mistakes. Just learn fro...,Other
...,...,...,...
6164,Okaythenn7,I’m a 12th grader in Hungary but i want to go ...,Other
6165,daliadeimos,"Not a vet, but a tech. I’ve been avoiding any ...",Other
6166,Unhappy_Passenger_86,As some one who is also coming from a difficul...,Other
6167,Daktari2018,Good for you for sticking to standards of care...,Medical Doctor


In [10]:
# Count how many comments fall into each label category
df.label.value_counts()

label
Other             4741
Medical Doctor    1107
Veterinarian       321
Name: count, dtype: int64

In [11]:
# Spot-check another labelled row by position
df.iloc[234]

username                                           Thrbt52017
comments    Your family’s wish was to live longer. That is...
label                                                   Other
Name: 234, dtype: object

Import the required Python machine-learning libraries. Two models are used in this notebook:

1: Convolutional Neural Network (CNN), since it is capable of learning representations that are useful for categorization and for identifying patterns in text.

2: Support Vector Machine (SVM), due to its ease of interpretation and generally strong performance on classification tasks.


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

In [13]:

# SVM Feature Engineering
# Turn each comment into a TF-IDF vector; the string labels are used as-is as the target.
# NOTE(review): the vectorizer is fit on the full dataset before the train/test split below,
# so vocabulary/IDF statistics from the test rows leak into the training features.
tfidf_vectorizer = TfidfVectorizer()
X_svm = tfidf_vectorizer.fit_transform(df['comments'])
y_svm = df['label']


In [14]:
# CNN Feature Engineering
# Convert each comment to a sequence of word indices and pad the sequences to length 100.
# NOTE(review): the tokenizer is fit on the full dataset before the train/test split below.

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['comments'])
X_cnn = tokenizer.texts_to_sequences(df['comments'])
X_cnn = pad_sequences(X_cnn, maxlen=100)
y_cnn = df['label']

In [15]:

# Split the data into train and test sets for both models.
# A single call applies the same shuffled row indices to every array passed in, so the SVM
# features, the CNN features and the labels all receive the same 80/20 partition.
(
    X_train_svm, X_test_svm,
    X_train_cnn, X_test_cnn,
    y_train_svm, y_test_svm,
) = train_test_split(X_svm, X_cnn, y_svm, test_size=0.2, random_state=42)

# Both models are trained against the same label column, so the CNN reuses the label split.
y_train_cnn, y_test_cnn = y_train_svm, y_test_svm


In [16]:
## Encode the string y labels as integers for the CNN
# NOTE(review): this import sits mid-notebook; consider moving it next to the other sklearn imports.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
# Fit the label mapping on the training labels, then reuse the same mapping for the test labels
y_train_cnn_encoded = label_encoder.fit_transform(y_train_cnn)
y_test_cnn_encoded = label_encoder.transform(y_test_cnn)
# Show the dtype of the encoded labels
y_test_cnn_encoded.dtype

dtype('int32')

In [17]:

## Train the SVM model
# Linear-kernel support vector classifier fitted on the TF-IDF training features and string labels

svm_model = SVC(kernel='linear')
svm_model.fit(X_train_svm, y_train_svm)

In [18]:
## Train CNN model
# Architecture: embedding -> 1D convolution -> global max pooling -> small dense layer
# with dropout -> 3-way softmax (one output per label category).

cnn_model = Sequential([
    # 5000 matches Tokenizer(num_words=5000); each token is embedded in 100 dimensions.
    # The `input_length` argument is deprecated in recent Keras releases and is not needed:
    # the sequence length is inferred from the padded input when the model is first called.
    Embedding(5000, 100),
    Conv1D(64, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(10, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax'),
])

# Sparse categorical cross-entropy is used because the targets are integer-encoded class ids
cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here, so the reported val_accuracy
# and the later test accuracy are measured on the same rows.
cnn_model.fit(X_train_cnn, y_train_cnn_encoded, epochs=5, batch_size=64, validation_data=(X_test_cnn, y_test_cnn_encoded))


Epoch 1/5




[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 41ms/step - accuracy: 0.6411 - loss: 0.8779 - val_accuracy: 0.7723 - val_loss: 0.5378
Epoch 2/5
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 42ms/step - accuracy: 0.7824 - loss: 0.5912 - val_accuracy: 0.8995 - val_loss: 0.3078
Epoch 3/5
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 41ms/step - accuracy: 0.8540 - loss: 0.3889 - val_accuracy: 0.9303 - val_loss: 0.2159
Epoch 4/5
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 37ms/step - accuracy: 0.8814 - loss: 0.3084 - val_accuracy: 0.9254 - val_loss: 0.2126
Epoch 5/5
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 38ms/step - accuracy: 0.8940 - loss: 0.2573 - val_accuracy: 0.9100 - val_loss: 0.2219


<keras.src.callbacks.history.History at 0x295c597ffd0>

In [19]:

## Evaluate models
# Accuracy of the SVM on the held-out test split

svm_accuracy = accuracy_score(y_test_svm, svm_model.predict(X_test_svm))
svm_accuracy

0.9254457050243112

In [20]:
# Loss and accuracy of the CNN on the held-out test split
cnn_loss, cnn_accuracy = cnn_model.evaluate(X_test_cnn, y_test_cnn_encoded)

[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9129 - loss: 0.2102


In [21]:
# Predict labels for the entire dataset with the SVM model
# (X_svm holds every row, so this includes the rows the model was trained on)
predicted_labels = svm_model.predict(X_svm)
predicted_labels

array(['Other', 'Other', 'Other', ..., 'Other', 'Medical Doctor',
       'Medical Doctor'], dtype=object)

In [22]:
# Predict class probabilities for the entire dataset with the CNN (one row per comment, one column per class)
predicted = cnn_model.predict(X_cnn)
predicted

[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


array([[3.2438543e-03, 9.9605513e-01, 7.0097303e-04],
       [3.7493482e-03, 9.9530303e-01, 9.4763300e-04],
       [3.0267304e-03, 9.9618798e-01, 7.8534742e-04],
       ...,
       [6.5241717e-02, 8.9738905e-01, 3.7369218e-02],
       [5.4322511e-01, 1.7866243e-01, 2.7811247e-01],
       [9.5826274e-01, 8.0997692e-03, 3.3637516e-02]], dtype=float32)

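Based on the test-set results above, the SVM (accuracy ≈ 0.925) is slightly more accurate than the CNN (accuracy ≈ 0.91). The CNN predictions are the ones decoded and saved in the cells below.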

In [23]:
# Pick the most probable class index for each comment, then map the indices back to string labels
predicted_l = np.argmax(predicted, axis=1)
predicted_classes = label_encoder.inverse_transform(predicted_l)
predicted_classes

array(['Other', 'Other', 'Other', ..., 'Other', 'Medical Doctor',
       'Medical Doctor'], dtype=object)

In [24]:


# Assemble the results table: username, comment text and the CNN-predicted label
# (the CSV itself is written in the next cell)
results_df = pd.DataFrame({'username': df['username'], 'comment': df['comments'], 'predicted_label': predicted_classes})



In [25]:
# Write the classified comments to CSV without the index column
results_df.to_csv('classified_comments.csv', index=False)

In [26]:
# Display the results table
results_df

Unnamed: 0,username,comment,predicted_label
0,KR1735,Yeah this is why I am so glad I picked IM and ...,Other
1,Persiandoc,Paywall ?,Other
2,a_neurologist,I wonder if this figure counts home call.,Other
3,DrTedPenisAstronaut,"Don’t dwell on the mistakes, but learn from th...",Other
4,Big-Fly6100,You are going to make mistakes. Just learn fro...,Other
...,...,...,...
6164,Okaythenn7,I’m a 12th grader in Hungary but i want to go ...,Other
6165,daliadeimos,"Not a vet, but a tech. I’ve been avoiding any ...",Other
6166,Unhappy_Passenger_86,As some one who is also coming from a difficul...,Other
6167,Daktari2018,Good for you for sticking to standards of care...,Medical Doctor


In [29]:
# Show the full comment text of the row at position 6167
results_df.iloc[6167].comment

'Good for you for sticking to standards of care and caring enough to speak to management about the issue. Obviously if they weren’t going to change any thing, it would not be a place you’d tolerate continued association.\n\nRVT esp at your level and your ethics are in high demand. I expect you’ll have plenty of choices for your next employment|Good for you for sticking to standards of care and caring enough to speak to management about the issue. Obviously if they weren’t going to change any thing, it would not be a place you’d tolerate continued association.\n\nRVT esp at your level and your ethics are in high demand. I expect you’ll have plenty of choices for your next employment|This is wonderful. Wanting to know more. Knowing there is more to learn is what drives vets to become better vets and stay excited for decades.\n\nAsk questions. You’re entering clinics so take what you’re seeing  home and see where it lines up with what you were taught. Inevitably there will be differences,