In [None]:
import kagglehub

# Download the latest version of the model files; `path` is the local directory they were stored in
path = kagglehub.model_download("mistral-ai/mistral/pyTorch/7b-instruct-v0.1-hf")

print(f"Path to model files: {path}")

Downloading from https://www.kaggle.com/api/v1/models/mistral-ai/mistral/pyTorch/7b-instruct-v0.1-hf/1/download...
100%|██████████| 11.1G/11.1G [11:32<00:00, 17.2MB/s]
Extracting model files...


Path to model files: /root/.cache/kagglehub/models/mistral-ai/mistral/pyTorch/7b-instruct-v0.1-hf/1


In [None]:
# Dependencies installation to run Mistral
!pip install -q -U transformers
!pip install -q -U accelerate
!pip install -q -U bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Data Analysis
import pandas as pd

# Data Structures
import json
from IPython.display import Markdown

# LLM
import torch
import bitsandbytes
import accelerate
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig, pipeline

# Never truncate long text columns when a DataFrame is displayed
pd.options.display.max_colwidth = None

In [None]:
!pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [None]:

import opendatasets as od

# Kaggle dataset to fetch (Freedmen's Bureau historical documents), given as its URL
dataset_url = 'https://www.kaggle.com/datasets/keithgalli/freedmens-bureau-historical-documents'

# Download the dataset. In the recorded run this prompted for Kaggle credentials and
# saved the files under ./freedmens-bureau-historical-documents (relative to the working directory).
od.download(dataset_url)


Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: syedosamaalishah093
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/keithgalli/freedmens-bureau-historical-documents
Downloading freedmens-bureau-historical-documents.zip to ./freedmens-bureau-historical-documents


100%|██████████| 57.0M/57.0M [00:04<00:00, 12.8MB/s]





In [None]:
def open_data(name: str, data_dir: str = "/content/freedmens-bureau-historical-documents") -> pd.DataFrame:
    """Load one CSV of the dataset and strip carriage-return artefacts.

    Args:
        name: File name without the ``.csv`` extension (e.g. ``"court-records"``).
        data_dir: Directory that contains the CSV files. Defaults to the location
            the notebook originally hard-coded, so existing calls keep working.

    Returns:
        The CSV as a DataFrame whose ``transcription_text`` column no longer
        contains the literal marker ``_x000D_``.
    """
    df = pd.read_csv(f"{data_dir}/{name}.csv")
    # regex=False: the marker is a plain substring, and the default of `regex`
    # differs between pandas versions, so state the intent explicitly.
    df['transcription_text'] = df['transcription_text'].str.replace("_x000D_", "", regex=False)
    return df


def sample_data_and_print(df: pd.DataFrame, skip_columns: int = 1):
    """Render one randomly sampled row of ``df`` as Markdown.

    The output starts with the row's index label, followed by one
    ``column: value`` entry per displayed column.

    Args:
        df: Frame to sample from.
        skip_columns: Number of leading columns to leave out of the display.
            The same offset is applied to the column names and to the values,
            so every value is shown under its own column name.

    Returns:
        An ``IPython.display.Markdown`` object with the formatted row.
    """
    row = df.sample(1)
    # Slice names and values with the SAME offset; different offsets pair each
    # value with a neighbouring column's name and drop the last value.
    columns = row.columns.tolist()[skip_columns:]
    values = row.values.tolist()[0][skip_columns:]

    text = f"Index: {row.index[0]} \n"
    for column, value in zip(columns, values):
        text += f"\n {column}: {value} \n"
    return Markdown(text)

In [None]:
# Load the property-records CSV through open_data
data = open_data("property-records")

In [None]:
# Display one randomly sampled record, rendered as Markdown (the cell's last expression is shown)
sample_data_and_print(data)

Index: 3431 

 sub-category: Records - Property 

 transcription_text: Records - Property 

 document_url: Beaufort S. C. June 23rd 1866
To Brevt Major Gena Scott 
I have received from Major Alvord- Superintendent of Schools in this place- an offer to rent the two schoolhouses, one belonging to the Trustees of the "Beaufort College" the other to the Trustees of the "Beaufort Female Seminary." - When I last had the pleasure of seeing you, I understood you to say that you would restore these buildings, as soon as an arrangement was made with the Superintendent. 

Will you be so good as to include in the order for the restoration of those schoolhouses, all the buildings on the Lot of the "Female Seminary" known on the Plat of the U.S. Direct Tax Commissioners- as Block 90 Lot A- and also the Teachers dwelling belonging to the "Beaufort College- known on the plat as Block 67. Lot A. and as there have been two or three shanties erected without authority on the Lot please order that they be removed within two months. this if you think proper you may leave to the discretion of the Agent of the Bureau here. - A certificate from the U.S. Direct Tax Commissioners, stating that they had never exercised any control this Lot, was sent with the application which was previously made for the restoration of these buildings and is probably in your Office or that of Genl. Sickles'. 
Yours respectfully Hm Stuart Senr 
Trustee 


In [None]:
# Keep only the "Complaints" sub-category of the court records.
# reset_index() keeps the original row labels in a new 'index' column.
data = (
    open_data("court-records")
    .loc[lambda records: records['sub-category'] == "Complaints"]
    .reset_index()
)
df = data.copy()
df.shape

(1276, 6)

In [None]:
# Discard transcriptions that contain at least two "|" characters on one line
no_table_df = df.loc[lambda frame: ~frame['transcription_text'].str.contains(r"\|.*\|", regex=True)]
no_table_df.shape

(1062, 6)

In [None]:
# Rows whose first whitespace-separated token is purely alphabetic and lower case
# are candidates for being the continuation of the previous row.
first_word = no_table_df['transcription_text'].str.split().str[0]
lower_case_starts = first_word.str.isalpha() & first_word.str.islower()

lower_case_starts_df = no_table_df[lower_case_starts]

# I want to check if those rows are really the continuation of the previous row,
# so collect every candidate's original index together with the index right before it.
idxs = set(lower_case_starts_df['index']) | set(lower_case_starts_df['index'] - 1)
idx_df = pd.DataFrame(index=list(idxs)).sort_index().reset_index()
merged_df = idx_df.merge(df, how="left", on="index")
merged_df = merged_df['transcription_text'].set_axis(merged_df['index']).to_frame()

# Luckily, I found a row that was actually the continuation of the previous row
# which starts with a number.
# merged_df is keyed by the values of the original 'index' column, whereas df has a
# fresh positional index after reset_index(); look the row up through df['index']
# so that the text stored under key 534 really is the text of original row 534.
merged_df.loc[534, :] = df.loc[df['index'] == 534, 'transcription_text'].iloc[0]
merged_df = merged_df.sort_index()

In [None]:
temp = merged_df.reset_index()

# Rows with consecutive 'index' values share the same (index - position) difference,
# which is used as the group key.
temp['group'] = temp['index'] - 512 - temp.index # 512 is the lowest index in df

# Concatenate the texts of each group and keep a single row (the first) per group
temp['transcription_text'] = (
    temp.groupby('group')['transcription_text']
    .transform(lambda parts: "".join(parts))
)
temp = temp.drop_duplicates(subset='group', keep='first')

# Key the joined texts by 'index' and strip [[...]] annotations
joined_rows_df = (
    temp.set_index('index')[['transcription_text']]
    .assign(
        transcription_text=lambda frame: frame['transcription_text'].str.replace(
            r"\[\[.*?\]\]", "", regex=True
        )
    )
)

In [None]:
# Rows that were not part of any joined group, plus the joined rows,
# reduced to the text column and keyed/sorted by 'index'.
complaints = (
    pd.concat([
        no_table_df.loc[~no_table_df['index'].isin(merged_df.index)],
        joined_rows_df.reset_index(),
    ])
    [['index', 'transcription_text']]
    .set_index('index')
    .sort_index()
)

In [None]:
# Directory holding the model files.
# NOTE(review): this repeats the path printed by the kagglehub download cell,
# including the trailing version folder; confirm it stays in sync if a newer version is downloaded.
MODEL_PATH = "/root/.cache/kagglehub/models/mistral-ai/mistral/pyTorch/7b-instruct-v0.1-hf/1"
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# 4-bit NF4 quantization with double quantization and bfloat16 as compute dtype
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Load the quantized model; device placement is delegated to device_map="auto"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    quantization_config=quantization_config,
    device_map="auto",
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Build the extraction prompt around one randomly sampled complaint
prompt = f"""
Get a brief complaint description, complainant, respondent, place of incident and date in YYYY-MM-DD format. Save it in a JSON object for each complaint. ONLY RETRIEVE THE ARRAY OF JSON, NOTHING ELSE.
DON'T CREATE MORE FIELDS THAN THOSE SPECIFIED, FILL WITH NULL THOSE MISSING. Here is the document you want to analyze:
{complaints['transcription_text'].sample(1).values[0]}
"""

messages = [
    {"role": "user", "content": prompt}
]

model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(DEVICE)

# A single, unpadded sequence: every token is a real token, so the mask is all ones.
# Passing the mask and the pad token id explicitly avoids the "attention mask and the
# pad token id were not set" warning and the guessing it implies.
attention_mask = torch.ones_like(model_inputs)

generated_ids = model.generate(
    model_inputs,
    attention_mask=attention_mask,
    max_new_tokens=1000,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] 
Get a brief complaint description, complainant, respondent, place of incident and date in YYYY-MM-DD format. Save it in a JSON object for each complaint. ONLY RETRIEVE THE ARRAY OF JSON, NOTHING ELSE.
DON'T CREATE MORE FIELDS THAN THOSE SPECIFIED, FILL WITH NULL THOSE MISSING. Here is the document you want to analyze:
115
Case Davis refuses to divide the Crops according to contract following letter to Justice Lindsay.

Greensboro N.c. Nov 4th 1867.
Lindsay Mr Hamilton
Justice of the Peace
Guilford Co N.C.

Sir
You will at once investigate the difficulty between William Webb and Franklin Harris of Guilford Co. Have the matter fairly treated and divide the Crops according to Contract. My desire is to put both parties under Oath. After deciding the case you will report the result of your proceedings to this office
Respectfully
Hugo Hillebrandt
Capt V.R.C. Sub
Asst Commissioner

[[left margin]]
Settled by Mr Lindsay
Justice of the Peace.
Satisfactorily
See re