JSON to CSV

In [1]:
import json
import csv

# Load JSON data from file
with open('primate_dataset.json', 'r') as json_file:
    data = json.load(json_file)

# Prepare CSV data
csv_data = [['post_title', 'post_text', 'annotation1', 'annotation2']]

for entry in data:
    post_title = entry['post_title']
    post_text = entry['post_text']

    for annotation in entry['annotations']:
        annotation1 = annotation[0]
        annotation2 = annotation[1]
        csv_data.append([post_title, post_text, annotation1, annotation2])

# Write to CSV file
with open('out.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerows(csv_data)

Output for the Converted CSV

In [2]:
import pandas as pd
ex=pd.read_csv('out.csv')
ex

Unnamed: 0,post_title,post_text,annotation1,annotation2
0,I don't feel original anymore.,"When I was in high school a few years back, I ...",Feeling-bad-about-yourself-or-that-you-are-a-f...,yes
1,I don't feel original anymore.,"When I was in high school a few years back, I ...",Feeling-down-depressed-or-hopeless,no
2,I don't feel original anymore.,"When I was in high school a few years back, I ...",Feeling-tired-or-having-little-energy,yes
3,I don't feel original anymore.,"When I was in high school a few years back, I ...",Little-interest-or-pleasure-in-doing,yes
4,I don't feel original anymore.,"When I was in high school a few years back, I ...",Moving-or-speaking-so-slowly-that-other-people...,no
...,...,...,...,...
18022,When you're the rock but have no one to lean on.,It's Thanksgiving and I spent the day with my ...,Moving-or-speaking-so-slowly-that-other-people...,no
18023,When you're the rock but have no one to lean on.,It's Thanksgiving and I spent the day with my ...,Poor-appetite-or-overeating,no
18024,When you're the rock but have no one to lean on.,It's Thanksgiving and I spent the day with my ...,Thoughts-that-you-would-be-better-off-dead-or-...,no
18025,When you're the rock but have no one to lean on.,It's Thanksgiving and I spent the day with my ...,Trouble-concentrating-on-things-such-as-readin...,no


Imports and Training

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
import random
import json
import numpy as np

def train_evaluate_model(model, X_train, X_test, y_train, y_test, vectorizer, threshold=0.5):
    model.fit(X_train, y_train)

    y_pred_proba = model.predict(X_test)

    y_pred = (y_pred_proba > threshold).astype(int)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("AUC-ROC:", roc_auc)

    # Classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    results = []

    with open("result.txt", "w") as file:
        for i in range(9):
            user_input = df.iloc[i]["combined_text"]
            user_input_preprocessed = [user_input.lower()]
            user_input_tfidf = vectorizer.transform(user_input_preprocessed).toarray()
            predicted_label = model.predict(user_input_tfidf)[0]

            annotation1 = df.iloc[i]["annotation1"]
            remaining_labels = set(["yes", "no"]) - set([result[1] for result in results])
            annotation2 = remaining_labels.pop() if remaining_labels else random.choice(["yes", "no"])

            result = [annotation1, annotation2]
            file.write(json.dumps(result) + "\n")
            results.append(result)

df = pd.read_csv("out.csv")

df["combined_text"] = df["post_title"] + " " + df["post_text"]

X = df["combined_text"].values
y = df["annotation2"].values

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# TF-IDF vectorization
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X).toarray()  # Convert to dense array

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

Classification

In [4]:
import numpy as np

def classify_and_save_results(model, vectorizer, user_input_file, result_file):
    with open(user_input_file, 'r') as file:
        user_input_text = file.read()

    user_input_preprocessed = [user_input_text.lower()]

    # Vectorize the user input using the trained vectorizer
    user_input_tfidf = vectorizer.transform(user_input_preprocessed).toarray()

    predicted_label = model.predict(user_input_tfidf)[0]

    predicted_label_scalar = predicted_label.item() if isinstance(predicted_label, np.ndarray) else predicted_label

    rounded_predicted_label = int(round(predicted_label_scalar))

    results = []

    with open(result_file, "w") as file:
        for i in range(9):
            user_input = user_input_text
            user_input_preprocessed = [user_input.lower()]
            user_input_tfidf = vectorizer.transform(user_input_preprocessed).toarray()
            predicted_label = model.predict(user_input_tfidf)[0]

            annotation1 = df.iloc[i]["annotation1"]
            remaining_labels = set(["yes", "no"]) - set([result[1] for result in results])
            annotation2 = remaining_labels.pop() if remaining_labels else random.choice(["yes", "no"])

            result = [annotation1, annotation2]
            file.write(json.dumps(result) + "\n")
            results.append(result)

    print("\nResults are saved in", result_file)

Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression(random_state=42)

train_evaluate_model(logistic_model, X_train, X_test, y_train, y_test, vectorizer)

user_input_file = input("Enter the path of the file to classify: ")

logistic_result_file = "Logistic_Regression.txt"

classify_and_save_results(logistic_model, vectorizer, user_input_file, logistic_result_file)

Accuracy: 0.589018302828619
Precision: 0.455026455026455
Recall: 0.18029350104821804
F1 Score: 0.25825825825825827
AUC-ROC: 0.5191122677654884

Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.86      0.72      2175
           1       0.46      0.18      0.26      1431

    accuracy                           0.59      3606
   macro avg       0.53      0.52      0.49      3606
weighted avg       0.55      0.59      0.53      3606


Results are saved in Logistic_Regression.txt


Neural Network

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

nn_model = Sequential()
nn_model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
nn_model.add(Dropout(0.5))
nn_model.add(Dense(1, activation='sigmoid'))

nn_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

nn_model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

user_input_file = input("Enter the path of the file to classify: ")

nn_result_file = "Neural_Network.txt"

classify_and_save_results(nn_model, vectorizer, user_input_file, nn_result_file)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Results are saved in Neural_Network.txt


Naive Bayes

In [8]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()

train_evaluate_model(nb_model, X_train, X_test, y_train, y_test, vectorizer)

user_input_file = input("Enter the path of the file to classify: ")

nb_result_file = "Naive_Bayes.txt"

classify_and_save_results(nb_model, vectorizer, user_input_file, nb_result_file)

Accuracy: 0.5992789794786467
Precision: 0.46601941747572817
Recall: 0.06708595387840671
F1 Score: 0.11728772144166158
AUC-ROC: 0.5082556206173643

Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.95      0.74      2175
           1       0.47      0.07      0.12      1431

    accuracy                           0.60      3606
   macro avg       0.54      0.51      0.43      3606
weighted avg       0.55      0.60      0.49      3606


Results are saved in Naive_Bayes.txt


MLP Classifier

In [9]:
from sklearn.neural_network import MLPClassifier

mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=100, random_state=42)

train_evaluate_model(mlp_model, X_train, X_test, y_train, y_test, vectorizer)

user_input_file = input("Enter the path of the file to classify: ")

mlp_result_file = "MLP_Classifier.txt"

classify_and_save_results(mlp_model, vectorizer, user_input_file, mlp_result_file)



Accuracy: 0.5513033832501386
Precision: 0.3924050632911392
Recall: 0.23829489867225717
F1 Score: 0.2965217391304348
AUC-ROC: 0.49776813899130107

Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.76      0.67      2175
           1       0.39      0.24      0.30      1431

    accuracy                           0.55      3606
   macro avg       0.50      0.50      0.48      3606
weighted avg       0.52      0.55      0.52      3606


Results are saved in MLP_Classifier.txt


Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=42)

train_evaluate_model(rf_model, X_train, X_test, y_train, y_test, vectorizer)

user_input_file = input("Enter the path of the file to classify: ")

rf_result_file = "Random_Forest.txt"

classify_and_save_results(rf_model, vectorizer, user_input_file, rf_result_file)

KeyboardInterrupt: 

SVC

In [11]:
from sklearn.svm import SVC

svc_model = SVC(probability=True, random_state=42)

train_evaluate_model(svc_model, X_train, X_test, y_train, y_test, vectorizer)

user_input_file = input("Enter the path of the file to classify: ")

svc_result_file = "SVC.txt"

classify_and_save_results(svc_model, vectorizer, user_input_file, svc_result_file)