In [1]:
import pandas as pd

In [2]:
# Load the dataset; lines=True reads the file as JSON Lines (one JSON object per line).
df = pd.read_json('ca_test_data_final_OFFICIAL.jsonl', lines=True)


In [3]:
# Label each title with a uniform code name such as "Penal Code" or "Revenue and Taxation Code".
# No regex is involved: each title is checked for these exact names as plain substrings,
# so any preceding text or qualifiers in the title are simply ignored.

# Base list of known code names; later steps in this cell extend it with further names.
# Order matters: the lookup returns the first name in the list that occurs in a title.
known_code_names = [
    "Penal Code", 
    "Revenue and Taxation Code", 
    "Public Resources Code", 
    "Probate Code",
    "Civil Code",
    "Health and Safety Code",
    "Education Code",
    "Elections Code",
    "Labor Code",
    "Family Code"
]

# Function to search for known code names in a title
def find_known_code_names(title, code_names):
    """Return the first entry of ``code_names`` that occurs in ``title``.

    Matching is a case-sensitive substring test. Candidates are tried in the
    order they appear in ``code_names``, so when a title mentions several
    known names the one listed first wins, regardless of where it sits in
    the title.

    Parameters
    ----------
    title : str or any
        Title to inspect. Anything that is not a ``str`` (for example a
        missing value such as ``None`` or ``NaN``) is treated as having no
        match instead of raising ``TypeError``.
    code_names : iterable of str
        Candidate code names, in priority order.

    Returns
    -------
    str
        The matching code name, or ``""`` when nothing matches.
    """
    # A substring test against a non-string would raise TypeError; a title
    # that is not text cannot contain a code name, so report "no match".
    if not isinstance(title, str):
        return ""
    for code_name in code_names:
        if code_name in title:
            return code_name
    return ""


# Apply the function to each title
df['uniform_code_names'] = df['title'].apply(lambda x: find_known_code_names(x, known_code_names))

# Preview of the first labelling pass. Jupyter renders only a cell's final expression,
# so this line produces no visible output in its current position.
df[['title', 'uniform_code_names']].head()
known_code_names_updated = known_code_names + ["Government Code"]

# Reapply the function to identify code names, including the updated list
df['uniform_code_names_updated'] = df['title'].apply(lambda x: find_known_code_names(x, known_code_names_updated))

# Keep only the rows for which a code name was identified
df_filtered_updated = df[df['uniform_code_names_updated'] != ""]

# Record the shape of the filtered frame to see the effect of including "Government Code"
updated_filtered_shape = df_filtered_updated.shape

# Adding "Welfare and Institutions Code" to the updated list of known code names
known_code_names_final = known_code_names_updated + ["Welfare and Institutions Code"]

# Reapply the function to identify code names, including the newly added "Welfare and Institutions Code"
df['uniform_code_names_final'] = df['title'].apply(lambda x: find_known_code_names(x, known_code_names_final))

# Keep only the rows for which a code name was identified
df_filtered_final = df[df['uniform_code_names_final'] != ""]

# Record the shape of the filtered frame to see the effect of including "Welfare and Institutions Code"
final_filtered_shape = df_filtered_final.shape

# Adding "Food and Agricultural Code" to the final list of known code names
known_code_names_complete = known_code_names_final + ["Food and Agricultural Code"]

# Reapply the function to identify code names, now including "Food and Agricultural Code"
df['uniform_code_names_complete'] = df['title'].apply(lambda x: find_known_code_names(x, known_code_names_complete))

# Keep only the rows for which a code name was identified
df_filtered_complete = df[df['uniform_code_names_complete'] != ""]

# Record the shape of the filtered frame to see the effect of including "Food and Agricultural Code"
complete_filtered_shape = df_filtered_complete.shape

# Adding "Business and Professions Code" on top of the complete list.
# The base must be known_code_names_complete, not known_code_names_final: extending the
# latter would silently drop "Food and Agricultural Code" from every later labelling step.
known_code_names_ultimate = known_code_names_complete + ["Business and Professions Code"]

# Reapply the function to identify code names, now including "Business and Professions Code"
df['uniform_code_names_ultimate'] = df['title'].apply(lambda x: find_known_code_names(x, known_code_names_ultimate))

# Keep only the rows for which a code name was identified
df_filtered_ultimate = df[df['uniform_code_names_ultimate'] != ""]

# Record the shape of the filtered frame to see the effect of including "Business and Professions Code"
ultimate_filtered_shape = df_filtered_ultimate.shape

# Make sure "Military and Veterans Code" is part of the list used for the definitive labelling.
# The membership check keeps the list free of duplicates if this step is repeated on the same list.
correct_code_name = "Military and Veterans Code"
if correct_code_name not in known_code_names_ultimate:
    known_code_names_ultimate.append(correct_code_name)

# Reapply the function with the full list; this column is the definitive labelling
df['uniform_code_names_corrected'] = df['title'].apply(lambda x: find_known_code_names(x, known_code_names_ultimate))

# Rows now identified as "Military and Veterans Code"
df_resolved_issue = df[df['uniform_code_names_corrected'] == correct_code_name]

# Number of rows identified as "Military and Veterans Code"
resolved_count = df_resolved_issue.shape[0]

# Consolidating all uniform code names into one final column for clarity and simplicity
# Using the 'uniform_code_names_corrected' as the definitive column for code names

# Intermediate labelling columns that are no longer needed
columns_to_drop = [
    'uniform_code_names',
    'uniform_code_names_updated',
    'uniform_code_names_final',
    'uniform_code_names_complete',
    'uniform_code_names_ultimate'
]
# Drop the intermediate columns and rename the definitive one in a single non-mutating chain
df_cleaned = df.drop(columns=columns_to_drop).rename(
    columns={'uniform_code_names_corrected': 'CodeName'}
)
# Remove all rows where the 'CodeName' column is empty, indicating no code name was identified.
# .copy() makes the result an independent frame rather than a slice of df_cleaned.
# NOTE: this rebinds df to the filtered frame, so the cell is not idempotent; re-run the
# cell that loads df before running this cell a second time.
df = df_cleaned[df_cleaned['CodeName'] != ""].copy()

# Record the shape of the dataframe after removing rows without a code name
final_shape = df.shape



In [4]:


from sklearn.datasets import load_files
import pandas as pd


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Candidate classifiers, each evaluated behind the same TF-IDF vectorizer.
# Every estimator that accepts a seed gets random_state=42 so repeated runs are comparable.
estimators = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=42),
    LinearSVC(random_state=42),
    LogisticRegression(random_state=42),
    SVC(probability=True, random_state=42)
]

# Number of cross-validation folds
CV = 5

# One (estimator name, mean accuracy, 2 * std of accuracy) tuple per estimator
entries = []

for estimator in estimators:
    estimator_name = estimator.__class__.__name__

    # Vectorizer and classifier are bundled so the TF-IDF vocabulary is fitted
    # inside each training fold rather than on the full data.
    pipe = Pipeline(
      [
          ("vectorizer", TfidfVectorizer()),
          ("estimator", estimator),
      ],
      verbose=False,
    )

    scores = cross_val_score(pipe, df["text"], df["CodeName"], scoring='accuracy', cv=CV)

    entries.append((estimator_name, scores.mean(), scores.std() * 2 ))

# "standard_score" holds twice the standard deviation of the fold accuracies
df_metrics = pd.DataFrame(entries, columns=["estimator_name", "mean", "standard_score"])

# Last expression of the cell, so the comparison table is actually rendered
df_metrics



In [5]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


# Features are the raw document text, targets are the code-name labels
X, y = df["text"], df["CodeName"]

# 80/20 hold-out split. The fixed seed makes the split, and therefore the fitted
# model and anything measured on X_test, the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42
)

# TF-IDF features feeding an SVC; probability=True is required for the
# predict_proba calls made on the saved model further down.
pipe = Pipeline(
    [
        ("vectorizer", TfidfVectorizer()),
        ("estimator", SVC(probability=True, random_state=42)),
    ],
    verbose=True,
)

pipe.fit(X_train, y_train)

[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=   1.2s
[Pipeline] ......... (step 2 of 2) Processing estimator, total=  37.8s


In [6]:
from sklearn.metrics import accuracy_score


# Score the fitted pipeline on the held-out split. The predictions used to be
# computed and then discarded without ever being compared against y_test.
y_pred = pipe.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Held-out accuracy: {test_accuracy:.3f}")
import pickle

# Persist the whole fitted pipeline (vectorizer + classifier) so later cells can reload it
with open("model.bin", "wb") as f:
    pickle.dump(pipe, f)

# Using the trained model

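This is where the saved model is loaded from `model.bin`. Put the text you want to classify between the triple quotes in the cell below; the cell then prints a dictionary mapping each code name to its predicted probability. You can change it to return only the most likely code.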

In [9]:
# Reload the pipeline saved earlier. Unpickling can execute arbitrary code,
# so only ever point this at a model.bin you produced yourself.
with open("model.bin", "rb") as f:
    loaded_pipe = pickle.load(f)

# Paste the text to classify between the triple quotes.
text = """

"""

# Class probabilities for the single input document
y_pred = loaded_pipe.predict_proba([text])[0]

# Pair every class label with its probability rounded to two decimals
predict_proba = dict(
    zip(loaded_pipe.classes_, (round(prob, 2) for prob in y_pred))
)

print(predict_proba)

{'Business and Professions Code': 0.09, 'Civil Code': 0.1, 'Education Code': 0.03, 'Elections Code': 0.01, 'Family Code': 0.04, 'Government Code': 0.11, 'Health and Safety Code': 0.08, 'Labor Code': 0.02, 'Military and Veterans Code': 0.01, 'Penal Code': 0.37, 'Probate Code': 0.03, 'Public Resources Code': 0.02, 'Revenue and Taxation Code': 0.02, 'Welfare and Institutions Code': 0.07}


This is how to do it for every row of a DataFrame: each row's text gets its most likely code name and that code's probability.

In [42]:
def predict_classification_with_prob(text, loaded_pipe):
    """Return the most probable class for ``text`` and its rounded probability.

    Parameters
    ----------
    text : str
        Document to classify; it is passed to the model as a one-item batch.
    loaded_pipe : object
        Fitted model exposing ``predict_proba`` and ``classes_``.

    Returns
    -------
    tuple
        ``(class_label, probability)`` where the probability is rounded to
        two decimal places.
    """
    # Only one document is submitted, so keep the first (and only) row
    probabilities = loaded_pipe.predict_proba([text])[0]
    # Position of the highest probability; classes_ is indexed the same way
    best = probabilities.argmax()
    return loaded_pipe.classes_[best], round(probabilities[best], 2)

# Apply the function to each row in the DataFrame and create two new columns.
# Wrapping the returned (class, probability) tuple in pd.Series makes .apply yield
# a two-column result, one column for each of the new names on the left-hand side.
# NOTE(review): df2 is not defined in any cell visible here; presumably it was loaded
# in a cell that has since been removed. Confirm before relying on Restart & Run All.
df2[['most_likely_classification', 'probability']] = df2['text'].apply(
    lambda x: pd.Series(predict_classification_with_prob(x, loaded_pipe))
)

In [43]:
# Render df2 to inspect the newly added classification and probability columns
df2

Unnamed: 0,bill_id,text,summary,title,text_len,sum_len,predictions,most_likely_classification,probability
0,110_hr37,SECTION 1. SHORT TITLE.\n\n This Act may be...,National Science Education Tax Incentive for B...,To amend the Internal Revenue Code of 1986 to ...,8494,321,Education Code,Education Code,0.36
1,112_hr2873,SECTION 1. SHORT TITLE.\n\n This Act may be...,Small Business Expansion and Hiring Act of 201...,To amend the Internal Revenue Code of 1986 to ...,6522,1424,Revenue and Taxation Code,Revenue and Taxation Code,0.63
2,109_s2408,SECTION 1. RELEASE OF DOCUMENTS CAPTURED IN IR...,Requires the Director of National Intelligence...,A bill to require the Director of National Int...,6154,463,Civil Code,Civil Code,0.30
3,108_s1899,SECTION 1. SHORT TITLE.\n\n This Act may be...,National Cancer Act of 2003 - Amends the Publi...,A bill to improve data collection and dissemin...,19853,1400,Health and Safety Code,Health and Safety Code,0.83
4,107_s1531,SECTION 1. SHORT TITLE.\n\n This Act may be...,Military Call-up Relief Act - Amends the Inter...,A bill to amend the Internal Revenue Code of 1...,6273,278,Government Code,Government Code,0.20
...,...,...,...,...,...,...,...,...,...
3264,114_hr3952,SECTION 1. SHORT TITLE.\n\n This Act may be...,Congenital Heart Futures Reauthorization Act o...,Congenital Heart Futures Reauthorization Act o...,7105,933,Health and Safety Code,Health and Safety Code,0.61
3265,113_s2439,SECTION 1. SHORT TITLE.\n\n This Act may be...,NSA Internal Watchdog Act - Amends the Inspect...,NSA Internal Watchdog Act,9556,2196,Government Code,Government Code,0.26
3266,103_hr4788,SECTION 1. SHORT TITLE.\n\n This Act may be...,Veterans' Health Care Eligibility Reform Act o...,Veterans' Health Care Eligibility Reform Act o...,18771,3046,Health and Safety Code,Health and Safety Code,0.37
3267,106_hr3702,SECTION 1. TEACHER RECRUITMENT.\n\n (a) Fut...,Makes available 500 scholarship grants and sti...,Recruit and Reward Future Math and Science Tea...,8311,1728,Education Code,Education Code,0.87


In [39]:
# Write df2 to CSV; index=False leaves the row index out of the file.
# NOTE(review): this cell's execution count (39) is lower than that of the cell that adds
# the prediction columns (42), so the file on disk may predate them. Re-run to be sure.
df2.to_csv('us_test_data_final_OFFICIAL.csv', index=False)

In [None]:
# Write df to CSV; index=False leaves the row index out of the file.
# NOTE(review): this cell has no execution count, i.e. it was not run in the saved
# notebook state; df is whatever the earlier cells last bound to that name.
df.to_csv('ca_test_data_final_OFFICIAL.csv', index=False)