In [None]:
import pandas as pd

# Fallback to the uploaded file since ../data/rows.csv is missing
try:
    df = pd.read_csv("/content/rows.csv", low_memory=False)
except FileNotFoundError:
    print("Warning: rows.csv not found. Using household_power_consumption.txt instead.")
    df = pd.read_csv("household_power_consumption.txt", sep=";", low_memory=False)

# Show first 5 rows and verify columns
display(df.head())
print("Columns:", df.columns.tolist())

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,05/10/2019,Checking or savings account,Checking account,Managing an account,Problem using a debit or ATM card,,,NAVY FEDERAL CREDIT UNION,FL,328XX,Older American,,Web,05/10/2019,In progress,Yes,,3238275
1,05/10/2019,Checking or savings account,Other banking product or service,Managing an account,Deposits and withdrawals,,,BOEING EMPLOYEES CREDIT UNION,WA,98204,,,Referral,05/10/2019,Closed with explanation,Yes,,3238228
2,05/10/2019,Debt collection,Payday loan debt,Communication tactics,Frequent or repeated calls,,,CURO Intermediate Holdings,TX,751XX,,,Web,05/10/2019,Closed with explanation,Yes,,3237964
3,05/10/2019,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Old information reappears or never goes away,,,Ad Astra Recovery Services Inc,LA,708XX,,,Web,05/10/2019,Closed with explanation,Yes,,3238479
4,05/10/2019,Checking or savings account,Checking account,Managing an account,Banking errors,,,ALLY FINANCIAL INC.,AZ,85205,,,Postal mail,05/10/2019,In progress,Yes,,3238460


Columns: ['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue', 'Consumer complaint narrative', 'Company public response', 'Company', 'State', 'ZIP code', 'Tags', 'Consumer consent provided?', 'Submitted via', 'Date sent to company', 'Company response to consumer', 'Timely response?', 'Consumer disputed?', 'Complaint ID']


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1282355 entries, 0 to 1282354
Data columns (total 18 columns):
 #   Column                        Non-Null Count    Dtype 
---  ------                        --------------    ----- 
 0   Date received                 1282355 non-null  object
 1   Product                       1282355 non-null  object
 2   Sub-product                   1047189 non-null  object
 3   Issue                         1282355 non-null  object
 4   Sub-issue                     751169 non-null   object
 5   Consumer complaint narrative  383564 non-null   object
 6   Company public response       449082 non-null   object
 7   Company                       1282355 non-null  object
 8   State                         1262955 non-null  object
 9   ZIP code                      1167057 non-null  object
 10  Tags                          175643 non-null   object
 11  Consumer consent provided?    690654 non-null   object
 12  Submitted via                 1282355 non-

In [None]:
df.columns

Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID'],
      dtype='object')

In [None]:
# Determine the correct column to use
target_col = "Consumer complaint narrative" if "Consumer complaint narrative" in df.columns else "Global_active_power"

# Keep only rows where the target column exists and is not null
df_text = df[df[target_col].notna()].copy()

# Check new shape
print(f"Rows processed: {df_text.shape[0]}")

Rows processed: 383564


In [None]:
# Extract the target column as strings
texts = df_text[target_col].astype(str)

display(texts.head())

Unnamed: 0,Consumer complaint narrative
29904,The Summer of XX/XX/2018 I was denied a mortga...
30629,There are many mistakes appear in my report wi...
30735,There are many mistakes appear in my report wi...
30795,There are many mistakes appear in my report wi...
30807,There are many mistakes appear in my report wi...


In [None]:
# Basic cleaning: lowercase and strip whitespace
texts = texts.str.lower().str.strip()

display(texts.head())

Unnamed: 0,Consumer complaint narrative
29904,the summer of xx/xx/2018 i was denied a mortga...
30629,there are many mistakes appear in my report wi...
30735,there are many mistakes appear in my report wi...
30795,there are many mistakes appear in my report wi...
30807,there are many mistakes appear in my report wi...


In [None]:
# Remove duplicates and reset index for a clean start
texts = texts.drop_duplicates().reset_index(drop=True)

print(f"Unique complaints: {len(texts)}")

Unique complaints: 366852


# Task
Perform NLP preprocessing on a sample of data from "/content/household_power_consumption.txt" by loading the 'en_core_web_sm' spaCy model, defining a cleaning function to handle lemmatization and removal of stopwords and punctuation, and applying this to a random selection of 10,000 entries from the 'texts' Series to provide a summary of the cleaned text results.

## Load spaCy Model

### Subtask:
Install (if necessary) and load the 'en_core_web_sm' spaCy model.


**Reasoning**:
I will import the spacy library and attempt to load the 'en_core_web_sm' model, providing a fallback download command in case it is missing.



In [None]:
import spacy

try:
    # Attempt to load the spaCy model
    nlp = spacy.load('en_core_web_sm')
    print('spaCy model en_core_web_sm loaded successfully.')
except OSError:
    # Download the model if it's not found
    print('Model not found. Downloading en_core_web_sm...')
    !python -m spacy download en_core_web_sm
    nlp = spacy.load('en_core_web_sm')
    print('spaCy model en_core_web_sm downloaded and loaded successfully.')

spaCy model en_core_web_sm loaded successfully.


## Define Cleaning Function

### Subtask:
Create a `clean_text` function that uses the spaCy model to preprocess text data by removing noise and lemmatizing tokens.


**Reasoning**:
I will define the `clean_text` function to process text using spaCy, filtering for alphabetic tokens, removing stop words, punctuation, and whitespace, and returning the lemmatized result.



In [None]:
def clean_text(text):
    """
    Cleans input text using spaCy: removes stopwords, punctuation,
    and non-alphabetic tokens, then returns lemmatized lowercase string.
    """
    # Process the text using the pre-loaded nlp object
    doc = nlp(text)

    # Filter tokens and collect lemmatized forms
    # Criteria: not a stop word, not punctuation, not whitespace, and is alphabetic
    cleaned_tokens = [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop and not token.is_punct and not token.is_space and token.is_alpha
    ]

    # Join tokens into a single string
    return " ".join(cleaned_tokens)

print("Function 'clean_text' defined successfully.")

Function 'clean_text' defined successfully.


## Apply NLP Cleaning to Sample

### Subtask:
Select a random sample of 10,000 entries from the 'texts' Series and apply the `clean_text` function to process them, displaying the first few results to verify the cleaning logic.


**Reasoning**:
I will sample 10,000 entries from the 'texts' Series (or all available if fewer than 10,000), apply the 'clean_text' function to each, and display the first 10 results to verify the cleaning logic.



In [None]:
# Determine the sample size based on available data
sample_size = min(len(texts), 10000)

# Sample the data for NLP processing
texts_sample = texts.sample(n=sample_size, random_state=42)

# Apply the cleaning function
cleaned_texts_sample = texts_sample.apply(clean_text)

# Display the results
print(f"Processed {sample_size} entries.")
print("\nFirst 10 cleaned results:")
display(cleaned_texts_sample.head(10))

Processed 10000 entries.

First 10 cleaned results:


Unnamed: 0,Consumer complaint narrative
303192,decline credit limit increase base recent chan...
298497,suppose debt xxxx xxxx discover open collectio...
65033,payment outstanding parking balance owe xx web...
268627,transunion show collection xxxx account open i...
109016,xx xx xxxx sunday receive email xxxx xxxx xxxx...
72662,complaint complaint help investigation process...
163981,affect equifax data breach
32419,letter complaint response xxxx predatory pract...
189426,intentionally deliberately fail disclose xxxx ...
179026,real estate agent buyer pre approve american f...


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(
    max_df=0.9,        # ignore words that appear in more than 90% of documents
    min_df=10,         # ignore words that appear in fewer than 10 documents
    max_features=5000  # only keep the top 5000 most important words
)

# Transform the cleaned text sample into a matrix
tfidf_matrix = tfidf.fit_transform(cleaned_texts_sample)

# Check the matrix shape
print(f"TF-IDF Matrix Shape: {tfidf_matrix.shape}")

TF-IDF Matrix Shape: (10000, 3201)


In [None]:
# Inspect the first 20 words in the vocabulary
feature_names = tfidf.get_feature_names_out()
print("First 20 feature names:")
print(feature_names[:20])

First 20 feature names:
['abandon' 'abide' 'ability' 'able' 'abruptly' 'absence' 'absolute'
 'absolutely' 'absurd' 'abuse' 'abusive' 'ac' 'acc' 'accelerate' 'accent'
 'accept' 'acceptable' 'acceptance' 'access' 'accident']


In [None]:
from sklearn.decomposition import NMF

In [None]:
# Initialize NMF with 5 topics
nmf_model = NMF(n_components=5, random_state=42)

# Fit the model to our TF-IDF matrix
nmf_model.fit(tfidf_matrix)

print("NMF Model trained successfully with 5 topics.")

NMF Model trained successfully with 5 topics.


In [None]:
def display_topics(model, feature_names, no_top_words):
    """Displays the top words for each topic in the model."""
    for topic_idx, topic in enumerate(model.components_):
        print(f"\nTopic {topic_idx + 1}:")
        # Sort indices of the topic weights and get the top 'no_top_words'
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(" ".join([feature_names[i] for i in top_indices]))

# Display the top 10 words for our 5 topics
display_topics(nmf_model, feature_names, 10)


Topic 1:
xxxx number address date follow file receive case court sell

Topic 2:
payment loan pay bank tell call month account time charge

Topic 3:
xx xxxx payment inquiry date receive late letter day send

Topic 4:
report credit account remove information dispute equifax inquiry experian reporting

Topic 5:
debt collection company owe call letter send collect receive agency


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# LDA works better with raw term counts
count_vectorizer = CountVectorizer(
    max_df=0.9,
    min_df=10,
    max_features=5000
)

# Create Document-Term Matrix
count_matrix = count_vectorizer.fit_transform(cleaned_texts_sample)

feature_names_count = count_vectorizer.get_feature_names_out()
print(f"Count Matrix Shape: {count_matrix.shape}")

Count Matrix Shape: (10000, 3201)


In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Initialize LDA with 5 topics
lda_model = LatentDirichletAllocation(
    n_components=5,
    random_state=42
)

# Fit the model to our Count Matrix
lda_model.fit(count_matrix)

print("LDA Model trained successfully.")

LDA Model trained successfully.


In [None]:
# Reuse the display_topics function defined earlier
print("LDA Topics:")
display_topics(lda_model, feature_names_count, 10)

LDA Topics:

Topic 1:
xxxx debt account credit information report collection provide dispute request

Topic 2:
credit report account xxxx remove information dispute equifax score late

Topic 3:
xxxx account call bank card tell charge pay say receive

Topic 4:
loan payment mortgage pay time xxxx home month year tell

Topic 5:
xxxx xx payment date receive pay company send account insurance
