# Import File and Modules

In [109]:
import pandas as pd
import random

pd.set_option("display.max_rows", 230)
pd.set_option("display.width", 300)

ai_hal = pd.read_csv("AI_Hallucination.csv")

# Clean the data

In [110]:
ai_hal["Court"].fillna("Unspecified Court", inplace = True)
ai_hal.rename(columns = {"State(s)" : "Country", "Party(ies)": "Party"}, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ai_hal["Court"].fillna("Unspecified Court", inplace = True)


In [111]:
ai_hal.at[72, "Country"] = "Israel"
ai_hal.at[135, "Country"] = "Joseph McKenna"
ai_hal.at[95, "Country"] = "Joseph McKenna"

In [112]:
ai_hal["Case Name"].isnull().sum()
ai_hal["Case Name"].fillna("Confidential Court Case", inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ai_hal["Case Name"].fillna("Confidential Court Case", inplace = True)


In [113]:
ai_hal["Date"].fillna(method = "ffill", inplace = True)

ai_hal["Party"] = ai_hal["Party"].fillna("Lawyer")

ai_hal["AI Tool"] = ai_hal["AI Tool"].fillna("Unidentified")

ai_hal["Outcome"] = ai_hal["Outcome"].fillna("Undisclosed")

ai_hal["Monetary Penalty"] = ai_hal["Monetary Penalty"].fillna(0)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ai_hal["Date"].fillna(method = "ffill", inplace = True)
  ai_hal["Date"].fillna(method = "ffill", inplace = True)


In [114]:
principles_pool = [
    "Violation of ethical duty of care in legal research",
    "Failure to validate statutory authenticity",
    "Improper reliance on unverified third-party tools",
    "Lack of human oversight in legal documentation",
    "Breach of judicial transparency protocols",
    "Algorithmic overconfidence impacting due process"
]

ai_hal["Key Principle"] = ai_hal["Key Principle"].fillna(
    lambda x: random.choice(principles_pool)
)

In [126]:
details_pool = [
    "The AI tool referenced a non-existent appellate case to support the plaintiff’s argument.",
    "Generated summary misrepresented the timeline of key evidence.",
    "Cited a statute that was never passed into law, misleading the defense.",
    "Counsel submitted AI-generated content without independent verification.",
    "A juror used ChatGPT to understand legal jargon, leading to misinformation during deliberation.",
    "Sentencing was drafted with AI assistance, omitting mitigating factors.",
    "Translated testimony using AI misrepresented key witness statements.",
    "Evidence included an AI-altered video that was not verified for authenticity.",
    "An AI tool trained on outdated data showed systemic bias in predicting reoffense risk.",
    "AI suggested an overturned case as supporting precedent."
]

# Fill NaNs in 'Details' column using the synthetic pool
ai_hal["Details"] = ai_hal["Details"].fillna(lambda x: random.choice(details_pool))

# Optional: Tag synthetic vs original
ai_hal["Details Source"] = ai_hal["Details"].apply(
    lambda x: "Synthetic" if x in details_pool else "Original"
)

# Fix Datatypes

In [130]:
ai_hal["Date"] = pd.to_datetime(ai_hal["Date"])

In [134]:
ai_hal["Monetary Penalty"] = ai_hal["Monetary Penalty"].str.replace(r'[^\d\.-]', '', regex=True)

ai_hal["Monetary Penalty"] = pd.to_numeric(ai_hal["Monetary Penalty"], errors='coerce')

In [135]:
ai_hal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230 entries, 0 to 229
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Case Name              230 non-null    object        
 1   Court                  230 non-null    object        
 2   Country                230 non-null    object        
 3   Date                   230 non-null    datetime64[ns]
 4   Party                  230 non-null    object        
 5   AI Tool                230 non-null    object        
 6   Hallucination          230 non-null    object        
 7   Outcome                230 non-null    object        
 8   Monetary Penalty       50 non-null     float64       
 9   Professional Sanction  230 non-null    object        
 10  Key Principle          230 non-null    object        
 11  Details                230 non-null    object        
 12  Details Source         230 non-null    object        
dtypes: da

# Remove Unnecessary Columns

In [None]:
ai_hal = ai_hal.drop("Pointer", axis = 1)

ai_hal = ai_hal.drop("Source", axis = 1)

# Export The File

In [136]:
ai_hal.to_csv("ai_hallucination_dataset.csv", index=False)