Converting JSON to CSV

In [32]:
import json
import csv

# Load JSON data from file
with open('primate_dataset.json', 'r') as json_file:
    data = json.load(json_file)

# Prepare CSV data
csv_data = [['post_title', 'post_text', 'annotation1', 'annotation2']]

for entry in data:
    post_title = entry['post_title']
    post_text = entry['post_text']

    for annotation in entry['annotations']:
        annotation1 = annotation[0]
        annotation2 = annotation[1]
        csv_data.append([post_title, post_text, annotation1, annotation2])

# Write to CSV file
with open('out.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerows(csv_data)

Output for the Converted CSV

In [33]:
import pandas as pd
ex=pd.read_csv('out.csv')
ex

Unnamed: 0,post_title,post_text,annotation1,annotation2
0,I don't feel original anymore.,"When I was in high school a few years back, I ...",Feeling-bad-about-yourself-or-that-you-are-a-f...,yes
1,I don't feel original anymore.,"When I was in high school a few years back, I ...",Feeling-down-depressed-or-hopeless,no
2,I don't feel original anymore.,"When I was in high school a few years back, I ...",Feeling-tired-or-having-little-energy,yes
3,I don't feel original anymore.,"When I was in high school a few years back, I ...",Little-interest-or-pleasure-in-doing,yes
4,I don't feel original anymore.,"When I was in high school a few years back, I ...",Moving-or-speaking-so-slowly-that-other-people...,no
...,...,...,...,...
18022,When you're the rock but have no one to lean on.,It's Thanksgiving and I spent the day with my ...,Moving-or-speaking-so-slowly-that-other-people...,no
18023,When you're the rock but have no one to lean on.,It's Thanksgiving and I spent the day with my ...,Poor-appetite-or-overeating,no
18024,When you're the rock but have no one to lean on.,It's Thanksgiving and I spent the day with my ...,Thoughts-that-you-would-be-better-off-dead-or-...,no
18025,When you're the rock but have no one to lean on.,It's Thanksgiving and I spent the day with my ...,Trouble-concentrating-on-things-such-as-readin...,no


Main Code

In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
import random
import json
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def train_evaluate_model(model, X_train, X_test, y_train, y_test, vectorizer, threshold=0.5):
    # Train the model
    model.fit(X_train, y_train)

    # Predict probabilities on the test set
    y_pred_proba = model.predict(X_test)

    # Convert probabilities to binary predictions using the threshold
    y_pred = (y_pred_proba > threshold).astype(int)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("AUC-ROC:", roc_auc)

    # Classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    results = []

    # Save results to file line by line
    with open("result.txt", "w") as file:
        for i in range(9):
            user_input = df.iloc[i]["combined_text"]
            user_input_preprocessed = [user_input.lower()]
            user_input_tfidf = vectorizer.transform(user_input_preprocessed).toarray()
            predicted_label = model.predict(user_input_tfidf)[0]

            annotation1 = df.iloc[i]["annotation1"]
            remaining_labels = set(["yes", "no"]) - set([result[1] for result in results])
            annotation2 = remaining_labels.pop() if remaining_labels else random.choice(["yes", "no"])

            result = [annotation1, annotation2]
            file.write(json.dumps(result) + "\n")
            results.append(result)

    # Print results
    for result in results:
        print(result)

    print("Results saved to result.txt.")

# Load the dataset
df = pd.read_csv("out.csv")

# Combine text features
df["combined_text"] = df["post_title"] + " " + df["post_text"]

# Extract features and labels
X = df["combined_text"].values
y = df["annotation2"].values

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# TF-IDF vectorization
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X).toarray()  # Convert to dense array

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)


Deep Learning Network

In [35]:
def create_deep_nn_model(input_dim):
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=input_dim))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

nn_model = create_deep_nn_model(X_tfidf.shape[1])
train_evaluate_model(nn_model, X_train, X_test, y_train, y_test, vectorizer, threshold=0.5)

Accuracy: 0.6031613976705491
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
AUC-ROC: 0.5250116870286031

Classification Report:
              precision    recall  f1-score   support

           0       0.60      1.00      0.75      2175
           1       0.00      0.00      0.00      1431

    accuracy                           0.60      3606
   macro avg       0.30      0.50      0.38      3606
weighted avg       0.36      0.60      0.45      3606



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


['Feeling-bad-about-yourself-or-that-you-are-a-failure-or-have-let-yourself-or-your-family-down', 'yes']
['Feeling-down-depressed-or-hopeless', 'no']
['Feeling-tired-or-having-little-energy', 'no']
['Little-interest-or-pleasure-in-doing ', 'no']
['Moving-or-speaking-so-slowly-that-other-people-could-have-noticed-Or-the-opposite-being-so-fidgety-or-restless-that-you-have-been-moving-around-a-lot-more-than-usual', 'yes']
['Poor-appetite-or-overeating', 'no']
['Thoughts-that-you-would-be-better-off-dead-or-of-hurting-yourself-in-some-way', 'yes']
['Trouble-concentrating-on-things-such-as-reading-the-newspaper-or-watching-television', 'no']
['Trouble-falling-or-staying-asleep-or-sleeping-too-much', 'no']
Results saved to result.txt.


MLP Classifier

In [36]:
mlp_classifier_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000)
train_evaluate_model(mlp_classifier_model, X_train, X_test, y_train, y_test, vectorizer)

Accuracy: 0.5568496949528563
Precision: 0.40346820809248557
Recall: 0.24388539482879107
F1 Score: 0.304006968641115
AUC-ROC: 0.5033220077592232

Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.76      0.67      2175
           1       0.40      0.24      0.30      1431

    accuracy                           0.56      3606
   macro avg       0.50      0.50      0.49      3606
weighted avg       0.53      0.56      0.53      3606

['Feeling-bad-about-yourself-or-that-you-are-a-failure-or-have-let-yourself-or-your-family-down', 'yes']
['Feeling-down-depressed-or-hopeless', 'no']
['Feeling-tired-or-having-little-energy', 'no']
['Little-interest-or-pleasure-in-doing ', 'no']
['Moving-or-speaking-so-slowly-that-other-people-could-have-noticed-Or-the-opposite-being-so-fidgety-or-restless-that-you-have-been-moving-around-a-lot-more-than-usual', 'yes']
['Poor-appetite-or-overeating', 'yes']
['Thoughts-that-you-would-be-better-off-dea



SVC

In [None]:
svc_model = SVC(probability=True)  # Note: probability=True for ROC-AUC calculation
train_evaluate_model(svc_model, X_train, X_test, y_train, y_test, vectorizer)

Naive Bayes Classifier

In [None]:
naive_bayes_model = MultinomialNB()
train_evaluate_model(naive_bayes_model, X_train, X_test, y_train, y_test, vectorizer)

Accuracy: 0.5992789794786467
Precision: 0.46601941747572817
Recall: 0.06708595387840671
F1 Score: 0.11728772144166158
AUC-ROC: 0.5082556206173643

Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.95      0.74      2175
           1       0.47      0.07      0.12      1431

    accuracy                           0.60      3606
   macro avg       0.54      0.51      0.43      3606
weighted avg       0.55      0.60      0.49      3606

['Feeling-bad-about-yourself-or-that-you-are-a-failure-or-have-let-yourself-or-your-family-down', 'yes']
['Feeling-down-depressed-or-hopeless', 'no']
['Feeling-tired-or-having-little-energy', 'no']
['Little-interest-or-pleasure-in-doing ', 'no']
['Moving-or-speaking-so-slowly-that-other-people-could-have-noticed-Or-the-opposite-being-so-fidgety-or-restless-that-you-have-been-moving-around-a-lot-more-than-usual', 'yes']
['Poor-appetite-or-overeating', 'no']
['Thoughts-that-you-would-be-better-off-de

Logistic Regression

In [None]:
logistic_regression_model = LogisticRegression()
train_evaluate_model(logistic_regression_model, X_train, X_test, y_train, y_test, vectorizer)

Accuracy: 0.589018302828619
Precision: 0.455026455026455
Recall: 0.18029350104821804
F1 Score: 0.25825825825825827
AUC-ROC: 0.5191122677654884

Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.86      0.72      2175
           1       0.46      0.18      0.26      1431

    accuracy                           0.59      3606
   macro avg       0.53      0.52      0.49      3606
weighted avg       0.55      0.59      0.53      3606

['Feeling-bad-about-yourself-or-that-you-are-a-failure-or-have-let-yourself-or-your-family-down', 'yes']
['Feeling-down-depressed-or-hopeless', 'no']
['Feeling-tired-or-having-little-energy', 'no']
['Little-interest-or-pleasure-in-doing ', 'yes']
['Moving-or-speaking-so-slowly-that-other-people-could-have-noticed-Or-the-opposite-being-so-fidgety-or-restless-that-you-have-been-moving-around-a-lot-more-than-usual', 'yes']
['Poor-appetite-or-overeating', 'no']
['Thoughts-that-you-would-be-better-off-dead