In [9]:
import requests
import json
import pandas as pd

def get_posts(subreddit, before, after, size, timeout=30):
    """Fetch one page of submissions for a subreddit from the Pushshift search API.

    Args:
        subreddit: Name of the subreddit to search.
        before: Value for the ``before`` query parameter; sent unchanged
            (callers pass "" on the first page).
        after: Value for the ``after`` query parameter (e.g. "7d").
        size: Number of submissions requested per page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook cell forever.

    Returns:
        The value stored under the "data" key of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        KeyError: If the JSON response has no "data" key.
    """
    # Let requests build the query string so every value is URL-encoded
    # instead of being spliced verbatim into the URL.
    params = {
        "subreddit": subreddit,
        "size": size,
        "before": before,
        "after": after,
    }
    r = requests.get(
        "https://api.pushshift.io/reddit/search/submission/",
        params=params,
        timeout=timeout,
    )
    # Surface HTTP errors directly rather than as a confusing JSON parse failure.
    r.raise_for_status()
    return r.json()["data"]

# Scrape 1000 legal and 1000 non-legal question titles
def scrape_titles(subreddit, label, target=1000, size=100, after="7d", excluded_terms=()):
    """Page through a subreddit and collect labelled submission titles.

    Args:
        subreddit: Subreddit to read titles from.
        label: Value stored under "label" for every collected title.
        target: Stop once this many titles have been collected.
        size: Page size passed to ``get_posts``.
        after: Lower time bound passed to ``get_posts`` ("7d" = last 7 days).
        excluded_terms: Lower-case substrings; a title containing any of
            them (case-insensitively) is skipped.

    Returns:
        A list of ``{"title": ..., "label": label}`` dicts, at most
        ``target`` long (shorter if the API runs out of posts).
    """
    collected = []
    before = ""  # no upper time limit on the first request
    while len(collected) < target:
        page = get_posts(subreddit, before, after, size)
        if not page:  # the API has nothing more to return
            break
        for post in page:
            title = post["title"]
            lowered = title.lower()
            if any(term in lowered for term in excluded_terms):
                continue
            collected.append({"title": title, "label": label})
            if len(collected) >= target:
                break
        # The next page starts from the last post of this page.
        next_before = page[-1]["created_utc"]
        if next_before == before:
            # The cursor did not move, so the same page would come back
            # again; stop instead of looping forever.
            break
        before = next_before
    return collected


# Titles from r/legaladvice are the positive class (label 1).
legal_questions = scrape_titles("legaladvice", label=1)

# Titles from r/askreddit are the negative class (label 0); titles that
# mention legal-sounding words are dropped to keep the classes apart.
non_legal_questions = scrape_titles(
    "askreddit", label=0, excluded_terms=("legal", "law", "court")
)

# Combine legal and non-legal questions into a single list and shuffle the order
import random

questions = legal_questions + non_legal_questions
# The training cells split train/test by row position, so the shuffle is
# seeded to make that split identical on every run. A private Random
# instance is used so the global random state is left untouched.
random.Random(42).shuffle(questions)

# Convert list of questions to a dataframe and save to a CSV file
df = pd.DataFrame(questions)
df.to_csv("questions.csv", index=False)


In [17]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# load questions data from CSV file
# keep_default_na=False keeps titles such as "NA", "null" or "N/A" as text;
# by default pandas converts them to NaN, which CountVectorizer rejects.
df = pd.read_csv('questions.csv', keep_default_na=False)

# split data into training and testing sets (by row position)
train_df = df.iloc[:1600]
test_df = df.iloc[1600:]

# vectorize training data (bag-of-words counts over the titles)
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['title'])
y_train = train_df['label']

# train Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

# classify a single piece of user-supplied text
def classify_input(input_str):
    """Return the predicted label for one input string.

    The string is vectorized with the module-level ``vectorizer`` and the
    first prediction of the module-level ``model`` is returned.
    """
    features = vectorizer.transform([input_str])
    return model.predict(features)[0]

# example usage: ask for a question and report the predicted class
input_str = input('Enter a question: ')
label = classify_input(input_str)
print('This is a legal question.' if label == 1 else 'This is not a legal question.')


Enter a question: Imran khan is a hero
This is not a legal question.


In [19]:
from sklearn.linear_model import LogisticRegression

# load questions data from CSV file
# keep_default_na=False keeps titles such as "NA", "null" or "N/A" as text;
# by default pandas converts them to NaN, which CountVectorizer rejects.
df = pd.read_csv('questions.csv', keep_default_na=False)

# split data into training and testing sets (by row position)
train_df = df.iloc[:1600]
test_df = df.iloc[1600:]

# vectorize training data (bag-of-words counts over the titles)
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['title'])
y_train = train_df['label']

# train Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# classify a single piece of user-supplied text
def classify_input(input_str):
    """Vectorize ``input_str`` and return the model's label for it.

    Uses the module-level ``vectorizer`` and ``model``; only the first
    (and only) prediction is returned.
    """
    predictions = model.predict(vectorizer.transform([input_str]))
    return predictions[0]

# example usage: ask for a question and report the predicted class
input_str = input('Enter a question: ')
label = classify_input(input_str)
if label != 1:
    print('This is not a legal question.')
else:
    print('This is a legal question.')


Enter a question: who is Imran khan
This is not a legal question.


In [27]:
# import necessary modules
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer

# vectorize training data (train_df comes from the earlier training cell)
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['title'])
y_train = train_df['label']

# train decision tree model
# random_state is fixed so the fitted tree, and therefore its predictions,
# are reproducible from one run to the next.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# read a question from the user and turn it into a feature vector
input_str = input("Enter a question: ")
X_input = vectorizer.transform([input_str])

# classify the question and report the result (label 1 = legal)
predicted_label = clf.predict(X_input)[0]
print(
    "This is a legal question."
    if predicted_label == 1
    else "This is not a legal question."
)


Enter a question: what should I do when I am in police station after committing a crime?
This is a legal question.


In [36]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer

# create a Bagging Decision Tree classifier
# The tree is passed positionally: the keyword for it was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, and the
# positional form works with both.
bagging_classifier = BaggingClassifier(DecisionTreeClassifier(),
                                       n_estimators=10,
                                       random_state=42)

# vectorize training data (train_df comes from the earlier training cell)
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['title'])
y_train = train_df['label']

# train the classifier
bagging_classifier.fit(X_train, y_train)

# ask the user for a question and convert it to a feature vector
user_input = input("Enter your question: ")
X_user = vectorizer.transform([user_input])

# run the bagged ensemble on the single input
prediction = bagging_classifier.predict(X_user)

# report the outcome (label 1 = legal)
if prediction[0] != 1:
    print("This is not a legal question.")
else:
    print("This is a legal question.")


Enter your question:  what should I do when I am in police station after committing a crime?
This is not a legal question.
