In [1]:
import requests
import json
import pandas as pd
import random

def get_posts(subreddit, before, after, size):
    """Fetch one page of submissions from the Pushshift search endpoint.

    Args:
        subreddit: Name of the subreddit to search.
        before: Upper time bound forwarded as the ``before`` query parameter
            (the callers pass "" for the first page, then a ``created_utc``).
        after: Lower time bound forwarded as the ``after`` query parameter.
        size: Number of submissions requested per page.

    Returns:
        The value stored under the ``"data"`` key of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond in time.
        KeyError: If the response JSON has no ``"data"`` key.
    """
    # Let requests build the query string so every value is URL-encoded
    # instead of being spliced raw into the URL.
    params = {
        "subreddit": subreddit,
        "size": size,
        "before": before,
        "after": after,
    }
    # Without a timeout a stalled connection would hang the scrape loop forever.
    r = requests.get(
        "https://api.pushshift.io/reddit/search/submission/",
        params=params,
        timeout=30,
    )
    # Surface HTTP errors directly rather than as an opaque JSON/Key error.
    r.raise_for_status()
    data = r.json()
    return data['data']

# Scrape up to 2000 legal question titles (label 1) from r/legaladvice
legal_questions = []
target_count = 2000 # stop once this many questions are collected
size = 100
after = "7d" # posts from the last 7 days
before = "" # no upper time limit
subreddit = "legaladvice"
while len(legal_questions) < target_count:
    data = get_posts(subreddit, before, after, size)
    if not data: # no more pages available
        break
    for post in data:
        # endswith() is safe on an empty title, whereas title[-1] raises IndexError
        if post["title"].endswith("?"):
            legal_questions.append({"title": post["title"], "label": 1})
            if len(legal_questions) >= target_count:
                break
    before = data[-1]["created_utc"] # page backwards from the last post seen

# Scrape up to 2000 non-legal question titles (label 0) from r/askreddit
non_legal_questions = []
target_count = 2000 # stop once this many questions are collected
size = 100
after = "7d" # posts from the last 7 days
before = "" # no upper time limit
subreddit = "askreddit"
keywords = ["legal", "law", "court"] # titles containing any of these are skipped
while len(non_legal_questions) < target_count:
    data = get_posts(subreddit, before, after, size)
    if not data: # no more pages available
        break
    for post in data:
        title = post["title"]
        # endswith() is safe on an empty title, whereas title[-1] raises IndexError
        if not title.endswith("?"):
            continue
        # lowercase once instead of once per keyword; this is a substring match
        lowered = title.lower()
        if any(keyword in lowered for keyword in keywords):
            continue
        non_legal_questions.append({"title": title, "label": 0})
        if len(non_legal_questions) >= target_count:
            break
    before = data[-1]["created_utc"] # page backwards from the last post seen

# Merge both labelled sets into one list and randomise the row order in place
questions = [*legal_questions, *non_legal_questions]
random.shuffle(questions)

# Build a dataframe from the shuffled records and write it out
# without the index column
df = pd.DataFrame(questions)
df.to_csv("questions.csv", index=False)


In [2]:
import pandas as pd
import nltk

# Download the 'punkt' package before the tokenizer is used
# (presumably the model data word_tokenize relies on — confirm for the
# installed NLTK version).
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Download the 'stopwords' corpus, then build a set of the English stopwords.
# NOTE: the name `stopwords` is read as a module-level global by
# classify_serenity below.
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))

def classify_serenity(question):
    """Score a question from 1 to 4 using keyword tiers.

    The question is tokenized, lowercased and stripped of stopwords. The
    tiers are then checked from highest to lowest and the first one with
    a matching token wins:

        4 - emergency / urgent / serious / severe / imminent / dangerous
        3 - important / concerned / need / worried / issue
        2 - minor / trivial / small / insignificant / nuisance
        1 - fallback when no tier keyword is present
    """
    # Distinct lowercase tokens that are not stopwords
    present = {
        word.lower()
        for word in word_tokenize(question)
        if word.lower() not in stopwords
    }

    # (score, keywords) pairs ordered from highest to lowest priority
    tiers = (
        (4, ("emergency", "urgent", "serious", "severe", "imminent", "dangerous")),
        (3, ("important", "concerned", "need", "worried", "issue")),
        (2, ("minor", "trivial", "small", "insignificant", "nuisance")),
    )
    for score, cues in tiers:
        if not present.isdisjoint(cues):
            return score

    # No tier matched: fallback score is 1
    return 1

# Load the scraped titles together with their legal / non-legal labels
df = pd.read_csv("questions.csv")

# Score every title with the keyword heuristic
df.loc[:, 'serenity'] = df['title'].apply(classify_serenity)

# Rows labelled 0 (non-legal) are forced to serenity 0
is_non_legal = df['label'] == 0
df.loc[is_non_legal, 'serenity'] = 0

# Remove the label column before writing the result
df = df.drop(columns=['label'])

# Write the classified questions out without the index column
df.to_csv("classified_questions.csv", index=False)

print(f"Saved {len(df)} classified questions to classified_questions.csv.")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/syedezazshah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/syedezazshah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Saved 3149 classified questions to classified_questions.csv.


In [3]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer

# load the classified questions from classified_questions.csv
df = pd.read_csv("classified_questions.csv")

# prepare data for training: raw title text and its serenity score
titles = df['title'].values
y = df['serenity'].values

# vectorize the text data into token counts; X is the resulting feature matrix
# (the raw text keeps its own name so X always means "features")
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(titles)

# train a decision tree classifier; random_state is fixed because the tree
# permutes features at each split, so refits could otherwise differ
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X, y)

# Ask for a question, vectorize it with the fitted vocabulary,
# and report the tree's prediction for that single sample
new_question = input("Enter your legal question: ")
new_question_vec = vectorizer.transform([new_question])
predictions = clf.predict(new_question_vec)
predicted_serenity = predictions[0]
print(f"Predicted serenity: {predicted_serenity}")


Enter your legal question: I have a legal issue with my landlord?
Predicted serenity: 3
