# Vehicle Complaints NLP Analysis

## Package Loading

In [7]:
import pandas as pd
import numpy as np
import re
import spacy
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

## Dataset Inspection

In [8]:
# Read the pre-cleaned 2015 complaints extract and preview its first rows.
source_path = "complaints_2015_cleaned.parquet"
df = pd.read_parquet(source_path)
print(df.head())

  CMPLID   ODINO             MFR_NAME MAKETXT      MODELTXT YEARTXT CRASH  \
0  79432  516612  General Motors, LLC     GMC  ENVOY DENALI    2007     N   
1  79433  516612  General Motors, LLC     GMC  ENVOY DENALI    2007     N   
2  79434  516612  General Motors, LLC     GMC  ENVOY DENALI    2007     N   
3  79435  516612  General Motors, LLC     GMC  ENVOY DENALI    2007     N   
4  79436  516612  General Motors, LLC     GMC  ENVOY DENALI    2007     N   

   FAILDATE FIRE INJURED  ...     LDATE  \
0  20150825    N       0  ...  20160106   
1  20150825    N       0  ...  20160106   
2  20150825    N       0  ...  20160106   
3  20150825    N       0  ...  20160106   
4  20150825    N       0  ...  20160106   

                                              CDESCR CMPL_TYPE POLICE_RPT_YN  \
0  WIRING TO FUEL PUMP BURNT...2007 GMC ENVOY DEN...      EVOQ             N   
1  WIRING TO FUEL PUMP BURNT...2007 GMC ENVOY DEN...      EVOQ             N   
2  WIRING TO FUEL PUMP BURNT...2007 GM

In [9]:
# Show the full free-text narrative of the first row (head() truncates it).
df["CDESCR"].loc[0]

'WIRING TO FUEL PUMP BURNT...2007 GMC ENVOY DENALI  WHEN DEAD ON HIGHWAY. ..UPDATED 01/07/16 *BF    WHITE BEAR LAKE SUPERSTORE CORRECTED THE PROBLEM BY INSTALLING A NEW ALTERNATOR, IGNITION SWITCH, AND REPLACING THE BATTERY.  *TR'

In [10]:
# Show the component description recorded for the same first row.
df["COMPDESC"].loc[0]

'FUEL SYSTEM, GASOLINE:DELIVERY:FUEL PUMP'

In [11]:
# Report the dataset size as (rows, columns).
n_rows, n_cols = df.shape
(n_rows, n_cols)

(872933, 25)

## Basic Unstructured Data Analysis

### Step 1: Clean & Normalize the CDESCR Narratives

In [12]:

# The cleaning step below only reads per-token lemmas and flags, so the
# parser and NER components are switched off here; NER comes later.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner"],
)

def preprocess_text(text, pipeline=None):
    """Normalize one complaint narrative into a space-joined string of lemmas.

    Processing order: lowercase the text, strip update tags such as ``*BF`` /
    ``*TR``, replace punctuation with spaces, collapse whitespace, then keep
    the lemma of every alphabetic token that is not a stop word.

    Args:
        text: Raw narrative. Empty or non-string values (e.g. ``None``/NaN)
            produce an empty string instead of raising.
        pipeline: Optional callable that takes a string and returns an
            iterable of tokens exposing ``lemma_``, ``is_stop`` and
            ``is_alpha``. When omitted, the notebook-level ``nlp`` object is
            looked up at call time, which matches the previous behavior.

    Returns:
        The retained lemmas joined by single spaces (possibly ``""``).
    """
    if not isinstance(text, str) or not text:
        return ""
    if pipeline is None:
        pipeline = nlp

    text = text.lower()
    # Remove update tags like '*BF', '*TR' (and '*TR*'-style wrapped tags).
    # The pattern is lowercase because the text was lowercased above, and the
    # leading asterisk is mandatory so ordinary words are never deleted.
    # Replacing with a space avoids gluing the neighbouring words together.
    text = re.sub(r"\*+[a-z]+\*+|\*+[a-z]{2,3}\b", " ", text)
    text = re.sub(r"[^\w\s]", " ", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text)  # remove extra whitespace

    doc = pipeline(text.strip())
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return " ".join(tokens)

# Register progress_apply on pandas objects so the long-running pass shows a bar.
tqdm.pandas()

# Missing narratives are treated as empty strings before cleaning.
narratives = df["CDESCR"].fillna("")
df["cd_text"] = narratives.progress_apply(preprocess_text)

100%|██████████| 872933/872933 [2:02:57<00:00, 118.33it/s]    


In [13]:
# Checkpoint the frame (including cd_text) to disk, then continue from the saved copy.
cdtext_checkpoint = "complaints_with_cdtext.parquet"
df.to_parquet(cdtext_checkpoint, index=False)
df = pd.read_parquet(cdtext_checkpoint)

### Step 2: Extract Named Entities and Key Terms

In [14]:

# Load the pipeline with all default components enabled (no `disable` list).
nlp = spacy.load("en_core_web_sm")

# Entities are extracted from the raw narratives, with missing values as "".
texts = df["CDESCR"].fillna("").tolist()

# Only these entity labels are kept.
ENTITY_LABELS = ("ORG", "PRODUCT", "GPE", "NORP")

entities_list = []
docs = nlp.pipe(texts, batch_size=100)
for doc in tqdm(docs, total=len(texts)):
    # One list of lowercased entity strings per narrative, in document order.
    entities_list.append(
        [ent.text.lower() for ent in doc.ents if ent.label_ in ENTITY_LABELS]
    )

df["entities"] = entities_list

100%|██████████| 872933/872933 [45:53<00:00, 317.06it/s]


In [15]:
# Checkpoint the frame (including entities) to disk, then continue from the saved copy.
entities_checkpoint = "complaints_with_entities.parquet"
df.to_parquet(entities_checkpoint, index=False)
df = pd.read_parquet(entities_checkpoint)