## This notebook walks through the data modelling process on the cleaned consumer complaints data

In [68]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer

In [10]:
# Read the cleaned complaints file. `data` holds the frame exactly as loaded,
# while `df` is an independent (deep) copy used as the working frame below.
data = pd.read_csv("Cleaned_Consumer_Complaints.csv")
df = data.copy(deep=True)

In [11]:
# Summarise the working frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593746 entries, 0 to 593745
Data columns (total 11 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   Product                       593746 non-null  object
 1   Issue                         593746 non-null  object
 2   Company                       593746 non-null  object
 3   State                         593746 non-null  object
 4   ZIP_code                      593746 non-null  object
 5   Consumer_consent_provided?    593746 non-null  object
 6   Submitted_via                 593746 non-null  object
 7   Date_sent_to_company          593746 non-null  object
 8   Company_response_to_consumer  593746 non-null  object
 9   Timely_response?              593746 non-null  object
 10  Consumer_disputed?            593746 non-null  object
dtypes: object(11)
memory usage: 49.8+ MB


In [12]:
# Preview the first rows of the frame as loaded, before any encoding.
df.head()

Unnamed: 0,Product,Issue,Company,State,ZIP_code,Consumer_consent_provided?,Submitted_via,Date_sent_to_company,Company_response_to_consumer,Timely_response?,Consumer_disputed?
0,Credit reporting,Incorrect information on credit report,EXPERIAN DELAWARE GP,TX,77075,Consent provided,Phone,03/21/2017,Closed with non-monetary relief,Yes,No
1,Debt collection,Disclosure verification of debt,"Security Credit Services, LLC",IL,60643,Consent provided,Web,04/20/2017,Closed with explanation,Yes,No
2,Credit card,Other,"CITIBANK, N.A.",IL,62025,Consent provided,Referral,04/20/2017,Closed with explanation,Yes,No
3,Mortgage,"Loan modification,collection,foreclosure","Shellpoint Partners, LLC",CA,90305,Consent provided,Referral,04/14/2017,Closed with explanation,Yes,No
4,Credit card,Credit determination,U.S. BANCORP,LA,70571,Consent provided,Postal mail,04/21/2017,Closed with explanation,Yes,No


### Data Preparation 

In [13]:
# Encode the two Yes/No columns as 1/0 and drop the columns not used as features.
#
# The working frame is rebuilt from the untouched `data` frame instead of being
# mutated in place, so this cell can be re-run safely: an in-place drop raises
# KeyError on the second run because the columns are already gone.
YES_NO_CODES = {"Yes": 1, "No": 0}

df = (
    data
    .drop(columns=["ZIP_code", "Date_sent_to_company"])
    .assign(**{
        # An explicit lookup through map() avoids dict-based `replace`, whose
        # silent object -> int downcast is deprecated in pandas >= 2.2.
        # Values other than "Yes"/"No" pass through unchanged, as `replace`
        # left them; when every value is mapped the column is inferred as int.
        column: data[column].map(lambda value: YES_NO_CODES.get(value, value))
        for column in ("Consumer_disputed?", "Timely_response?")
    })
)

In [18]:
from IPython.display import display

In [21]:
# Preview the frame after the Yes/No columns were encoded as 1/0 and
# ZIP_code / Date_sent_to_company were dropped.
df.head()

Unnamed: 0,Product,Issue,Company,State,Consumer_consent_provided?,Submitted_via,Company_response_to_consumer,Timely_response?,Consumer_disputed?
0,Credit reporting,Incorrect information on credit report,EXPERIAN DELAWARE GP,TX,Consent provided,Phone,Closed with non-monetary relief,1,0
1,Debt collection,Disclosure verification of debt,"Security Credit Services, LLC",IL,Consent provided,Web,Closed with explanation,1,0
2,Credit card,Other,"CITIBANK, N.A.",IL,Consent provided,Referral,Closed with explanation,1,0
3,Mortgage,"Loan modification,collection,foreclosure","Shellpoint Partners, LLC",CA,Consent provided,Referral,Closed with explanation,1,0
4,Credit card,Credit determination,U.S. BANCORP,LA,Consent provided,Postal mail,Closed with explanation,1,0


In [26]:
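# TODO(review): disabled exploratory check of the consent values for
# "Credit reporting" complaints. Re-enable it or delete this cell before sharing.
# df_credit = df[df['Product'] == "Credit reporting"]
# df_credit['Consumer_consent_provided?'].value_counts()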

In [70]:
# Split the data 60% / 20% / 20% into train / validation / test:
# 20% is held out for test, then 25% of the remaining 80% becomes validation.
# The fixed random_state keeps both splits reproducible.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# pop() returns the target column and removes it from the feature frame in a
# single step, so the label is not left among the features.
y_full_train = df_full_train.pop('Consumer_disputed?').values
y_train = df_train.pop('Consumer_disputed?').values
y_val = df_val.pop('Consumer_disputed?').values
y_test = df_test.pop('Consumer_disputed?').values

In [77]:
# Vectorise the training features. The vectoriser is fitted on the training
# split only; `dv` is kept so the same fitted mapping can be reused on other data.
dv = DictVectorizer(sparse=True)

# One {column: value} dict per training row is the input format DictVectorizer expects.
train_dict = df_train.to_dict(orient="records")
X_train = dv.fit_transform(train_dict)


In [81]:
# Inspect the first vectorised training row (a single-row sparse matrix).
X_train[0]

<1x3524 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>