# M.SARAYU




In [None]:
pip install pandas scikit-learn nltk matplotlib seaborn




In [2]:
# ============================
# 🧩 Task 5 - Dataset Loading
# ============================
# M.Sarayu

import pandas as pd

# Step 1: Load only first 50,000 rows for faster testing
df = pd.read_csv(
    "https://files.consumerfinance.gov/ccdb/complaints.csv.zip",
    compression="zip",
    low_memory=False,
    nrows=50000
)

# Step 2: Keep only relevant columns
df = df[['Consumer complaint narrative', 'Product']].dropna()

# Step 3: Define the 4 categories required by Kaiburr
category_map = {
    'Credit reporting, credit repair services, or other personal consumer reports': 0,
    'Debt collection': 1,
    'Consumer Loan': 2,
    'Mortgage': 3
}

# Step 4: Filter and map the categories
df = df[df['Product'].isin(category_map.keys())]
df['label'] = df['Product'].map(category_map)

# Step 5: Sample balanced data (e.g., 1500 rows per category)
df_balanced = (
    df.groupby('label', group_keys=False)
      .apply(lambda x: x.sample(min(len(x), 1500), random_state=42))
)

# Step 6: Save smaller dataset for reuse
df_balanced.to_csv("consumer_complaints_small.csv", index=False)

# Step 7: Verify
print("✅ Reduced dataset created!")
print(df_balanced['label'].value_counts())
print(df_balanced.head())


✅ Reduced dataset created!
label
0    435
1    357
3    102
2      3
Name: count, dtype: int64
                            Consumer complaint narrative  \
30393  In accordance with Fair Credit Reporting act X...   
11520  I RECENTLY OBTAINED A COPY OF MY CREDIT REPORT...   
14525  To : Whom It May concern : I am writing to dis...   
26783    XXXX XXXX CC ( MYLIFE ) XXXX PAID OFF XXXX OUT.   
36532  I do not knwo what is going on with this accou...   

                                                 Product  label  
30393  Credit reporting, credit repair services, or o...      0  
11520  Credit reporting, credit repair services, or o...      0  
14525  Credit reporting, credit repair services, or o...      0  
26783  Credit reporting, credit repair services, or o...      0  
36532  Credit reporting, credit repair services, or o...      0  


  .apply(lambda x: x.sample(min(len(x), 1500), random_state=42))


In [3]:
# M.Sarayu
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)   # remove punctuation/numbers
    text = text.lower()                        # lowercase
    words = [w for w in text.split() if w not in stop_words]
    return " ".join(words)

df_balanced['clean_text'] = df_balanced['Consumer complaint narrative'].apply(clean_text)
print(df_balanced[['clean_text', 'label']].head())


                                              clean_text  label
30393  accordance fair credit reporting act xxxx xxxx...      0
11520  recently obtained copy credit report accounts ...      0
14525  may concern writing dispute fraudulent charge ...      0
26783                 xxxx xxxx cc mylife xxxx paid xxxx      0
36532  knwo going account many lates absurd demand in...      0


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#M.SARAYU
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['clean_text'], df_balanced['label'],
    test_size=0.2, random_state=42, stratify=df_balanced['label']
)


In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("TF-IDF Shape:", X_train_tfidf.shape)


TF-IDF Shape: (815, 5000)


In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=300)
model.fit(X_train_tfidf, y_train)


In [None]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=[
    'Credit Reporting', 'Debt Collection', 'Consumer Loan', 'Mortgage'
]))

Accuracy: 0.8382352941176471
                  precision    recall  f1-score   support

Credit Reporting       0.78      0.94      0.86       104
 Debt Collection       0.90      0.75      0.82        76
   Consumer Loan       0.00      0.00      0.00         2
        Mortgage       1.00      0.73      0.84        22

        accuracy                           0.84       204
       macro avg       0.67      0.60      0.63       204
    weighted avg       0.84      0.84      0.83       204



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
samples = [
    "My credit report has wrong information even after I disputed it.",
    "The debt collector keeps calling me even after payment.",
    "The bank changed my mortgage rate without notice."
]

pred = model.predict(vectorizer.transform(samples))
print(pred)
#M.SARAYU


[0 1 3]
