# In [None]:
import json
import re
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC

# Number of lines read from the training file so far (updated by the loader).
count_samples = 0
# Parallel accumulators: the label collected at index i in `topics` belongs to
# the input text collected at index i in `excerpts`.
excerpts, topics = [], []

def extract_words(text):
    """Return the alphabetic words found in ``text``, in order of appearance.

    A word is a run of ASCII letters, optionally continued by further letter
    runs that are each joined by a single apostrophe or hyphen, so
    contractions and hyphenated compounds stay whole ("don't", "well-known").
    Digits, other punctuation, and leading or trailing apostrophes/hyphens
    are not part of any word.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the matched words; empty when ``text`` contains no letters.
    """
    # Each joiner must be followed by a full letter run. Allowing only a
    # single letter there would cut "well-known" into "well-k" and "nown".
    return re.findall(r"[a-zA-Z]+(?:['\-][a-zA-Z]+)*", text)

# Load the training set from td2.json: one JSON object per line, each expected
# to provide "topic", "question" and "excerpt" keys.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
with open('td2.json', 'r', encoding='utf-8') as file:
    for count_samples, line in enumerate(file, start=1):
        if count_samples == 1:
            # The first line is never parsed as a sample
            # (presumably a record-count header -- confirm against the data file).
            continue
        try:
            post = json.loads(line)
            # Build the complete sample before touching either list, so a bad
            # record can never leave `topics` and `excerpts` out of step.
            topic = post["topic"]
            concatenated_text = post["question"] + "\r\n" + post["excerpt"]
        except json.JSONDecodeError:
            print(f"Skipping line {count_samples} due to JSONDecodeError")
            continue
        except (KeyError, TypeError) as e:
            # Valid JSON, but not an object with the expected string fields.
            print(f"Skipping line {count_samples} due to malformed record: {e!r}")
            continue
        # The raw text is stored as-is; tokenization is left to the
        # CountVectorizer stage of the pipeline.
        topics.append(topic)
        excerpts.append(concatenated_text)

# Input texts as a NumPy array; the labels stay a plain list.
x_training = np.array(excerpts)
y_training = topics

print(f"Number of training samples: {len(x_training)}")

# Token counts -> TF-IDF weighting -> linear SVM classifier.
text_classifier = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC())
])

try:
    text_classifier.fit(x_training, y_training)
except ValueError as e:
    print(f"ValueError during fitting: {e}")
    # An unfitted pipeline cannot predict. Stop here with the real cause
    # instead of continuing and failing later inside predict().
    raise

# Read the number of test cases, then one JSON object per test case, from
# standard input, and print one predicted label per usable test case.
try:
    num_test_cases = int(input("Enter number of test cases: "))
except ValueError:
    print("Invalid input. Expected an integer.")
    # Fall back to zero cases so the loop below is simply skipped; without
    # this the name would be unbound and range() would raise NameError.
    num_test_cases = 0

test_data = []
for i in range(num_test_cases):
    try:
        json_data = json.loads(input(f"Enter JSON data for test case {i + 1}: "))
        test_data.append(json_data['question'] + "\r\n" + json_data['excerpt'])
    except json.JSONDecodeError as e:
        print(f"Failed to parse JSON data for test case {i + 1}: {e}")
    except (KeyError, TypeError) as e:
        # Valid JSON, but not an object with string 'question'/'excerpt' fields.
        print(f"Test case {i + 1} has no usable 'question'/'excerpt' fields: {e!r}")

# Test cases that failed above are absent from test_data, so fewer labels than
# num_test_cases may be printed.
if test_data:
    for label in text_classifier.predict(test_data):
        print(label)
