In [1]:
import requests
import csv
import time
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
BASE_URL = "https://datahub.transportation.gov/resource/6axg-epim.json"
LIMIT = 1000
OUTPUT_FILE = "vehicle_recalls.csv"

def fetch_recalls():
    all_data = []
    offset = 0

    while True:
        url = f"{BASE_URL}?$limit={LIMIT}&$offset={offset}"
        try:
            response = requests.get(url)
            response.raise_for_status()
            data = response.json()

            if not data:
                break  # Stop if no more data is returned

            all_data.extend(data)
            offset += LIMIT  # Move to the next batch
            time.sleep(1)  # Avoid rate limiting

        except requests.exceptions.RequestException as e:
            print(f"Error fetching recall data: {e}")
            break

    return all_data

def save_to_csv(data, filename):
    if not data:
        print("No data to save.")
        return

    all_keys = set()
    for entry in data:
        all_keys.update(entry.keys())

    with open(filename, "w", newline="") as file:
        writer = csv.DictWriter(file, fieldnames=list(all_keys))
        writer.writeheader()
        for entry in data:
            writer.writerow({key: entry.get(key, '') for key in all_keys})

In [3]:
if __name__ == "__main__":
    recalls = fetch_recalls()
    save_to_csv(recalls, OUTPUT_FILE)
    print(f"Recall data saved to {OUTPUT_FILE}")

Recall data saved to vehicle_recalls.csv


In [None]:

import pandas as pd
import re
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Load and clean the dataset
df = pd.read_csv("vehicle_recalls.csv")
df['report_received_date'] = pd.to_datetime(df['report_received_date'], errors='coerce')
df['year'] = df['report_received_date'].dt.year
df.dropna(subset=['year'], inplace=True)
df['year'] = df['year'].astype(int)

# Fill missing values
df['completion_rate'].fillna(df['completion_rate'].median(), inplace=True)
df['potentially_affected'].fillna(df['potentially_affected'].median(), inplace=True)
text_cols = ['consequence_summary', 'corrective_action', 'defect_summary']
df[text_cols] = df[text_cols].fillna('Unknown')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['completion_rate'].fillna(df['completion_rate'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['potentially_affected'].fillna(df['potentially_affected'].median(), inplace=True)


In [None]:

# Feature Engineering
from sklearn.preprocessing import LabelEncoder
le_manufacturer = LabelEncoder()
df['manufacturer_encoded'] = le_manufacturer.fit_transform(df['manufacturer'])

le_component = LabelEncoder()
df['component_encoded'] = le_component.fit_transform(df['component'])

df['recall_severity'] = df['potentially_affected'] * (1 - df['completion_rate'] / 100)

# Prepare dataset for modeling
features = df.groupby('year').agg({
    'potentially_affected': 'sum',
    'recall_severity': 'sum',
    'manufacturer_encoded': 'nunique',
    'component_encoded': 'nunique'
}).reset_index()

X = features[['year', 'recall_severity', 'manufacturer_encoded', 'component_encoded']]
y = features['potentially_affected']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:

# Train Random Forest Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate Model
y_pred = rf_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Model MAE: {mae}")


Model MAE: 4881290.908333332


In [None]:

# Function to process natural language queries
def extract_entities(query):
    query = query.lower()
    entities = {"year": None, "component": None, "intent": None}

    year_match = re.search(r'\b(20\d{2})\b', query)
    if year_match:
        entities["year"] = int(year_match.group())

    component_keywords = set(df['component'].str.lower().unique())
    for word in query.split():
        if word in component_keywords:
            entities["component"] = word

    if "forecast" in query or "demand" in query:
        entities["intent"] = "forecast"
    elif "trend" in query or "recalls over time" in query:
        entities["intent"] = "trend"
    elif "highest recalls" in query or "top manufacturers" in query:
        entities["intent"] = "top_manufacturer"
    elif "how many recalls" in query or "total recalls" in query:
        entities["intent"] = "recall_count"

    return entities

def process_nlq(query):
    entities = extract_entities(query)
    year, component, intent = entities["year"], entities["component"], entities["intent"]

    if intent == "forecast":
        if year:
            result = rf_model.predict([[year, X['recall_severity'].mean(), X['manufacturer_encoded'].mean(), X['component_encoded'].mean()]])
            return f"Predicted spare part demand for {year}: {int(result[0])} units."
        else:
            return "Please specify a year for demand forecasting."

    elif intent == "top_manufacturer":
        if year:
            top_mfr = df[df["year"] == year].groupby("manufacturer")["potentially_affected"].sum().idxmax()
            return f"The manufacturer with the highest recalls in {year} is {top_mfr}."
        else:
            top_mfr = df.groupby("manufacturer")["potentially_affected"].sum().idxmax()
            return f"The manufacturer with the highest recalls overall is {top_mfr}."

    elif intent == "trend":
        return "Displaying recall trends over time..."

    elif intent == "recall_count":
        if year and component:
            count = df[(df["year"] == year) & (df["component"].str.lower() == component)]["potentially_affected"].sum()
            return f"Total recalls for {component} in {year}: {int(count)} vehicles."
        elif year:
            count = df[df["year"] == year]["potentially_affected"].sum()
            return f"Total recalls in {year}: {int(count)} vehicles."
        else:
            return "Please specify a year to get recall count information."

    return "Sorry, I couldn't understand your query. Try asking about forecasts, trends, or top manufacturers."


In [None]:

# Example Queries
sample_queries = [
    "What is the expected spare part demand for 2029?",
    "Which manufacturer had the highest recalls in 2021?",
    "Show me the recall trend for brakes over the last 3 years.",
    "How many recalls were there for airbags in 2018?"
]

# Process Queries
for query in sample_queries:
    print(f"Query: {query}")
    print(f"Response: {process_nlq(query)}")
    print("-" * 50)


Query: What is the expected spare part demand for 2029?
Response: Predicted spare part demand for 2029: 26267248 units.
--------------------------------------------------
Query: Which manufacturer had the highest recalls in 2021?
Response: The manufacturer with the highest recalls in 2021 is General Motors, LLC.
--------------------------------------------------
Query: Show me the recall trend for brakes over the last 3 years.
Response: Displaying recall trends over time...
--------------------------------------------------
Query: How many recalls were there for airbags in 2018?
Response: Total recalls in 2018: 38268202 vehicles.
--------------------------------------------------


