# Import библиотек

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
import nltk
import re
from nltk.corpus import stopwords
import string

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Набор данных

In [3]:
df = pd.read_csv('/content/drive/MyDrive/ML-projects/consumercomplaints.csv')
df

Unnamed: 0.1,Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative
0,0,2022-11-11,Mortgage,Conventional home mortgage,Trouble during payment process,,
1,1,2022-11-23,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Reporting company used your report improperly,
2,2,2022-11-16,Mortgage,VA mortgage,Trouble during payment process,,
3,3,2022-11-15,Checking or savings account,Checking account,Managing an account,Fee problem,"Hi, I have been banking with Wells Fargo for o..."
4,4,2022-11-07,Mortgage,Other type of mortgage,Trouble during payment process,,
...,...,...,...,...,...,...,...
3101964,3101964,2017-02-09,Debt collection,I do not know,Cont'd attempts collect debt not owed,Debt resulted from identity theft,I have disputed my debts several times with no...
3101965,3101965,2015-04-29,Mortgage,Conventional fixed mortgage,"Loan modification,collection,foreclosure",,My father died in XX/XX/XXXX. Left me his only...
3101966,3101966,2017-03-31,Credit reporting,,Credit reporting company's investigation,No notice of investigation status/result,cfbp i would Like to file a complaint on Exper...
3101967,3101967,2017-01-16,Credit reporting,,Incorrect information on credit report,Account status,My husband and I are in the middle of an FHA S...


## Анализ признаков

In [4]:
#Unnamed: 0 - не нужен
df = df.drop("Unnamed: 0",axis=1)
#Date received - дата получения жалобы
#Product - тип продукта, на который человек оставляет жалобу
#Sub-product - подпродукт, на который человек оставляет жалобу
#Issue - тема жалобы
#Sub-issue - подтема жалобы
#Consumer complaint narrative - жалоба

###Разберемся с типами данных

In [5]:
#Работаем с текстовыми данными, поэтому только такой формат
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3101969 entries, 0 to 3101968
Data columns (total 6 columns):
 #   Column                        Dtype 
---  ------                        ----- 
 0   Date received                 object
 1   Product                       object
 2   Sub-product                   object
 3   Issue                         object
 4   Sub-issue                     object
 5   Consumer complaint narrative  object
dtypes: object(6)
memory usage: 142.0+ MB


###Проверим df на наличие дубликатов и пропусков

In [6]:
# Довольно странная ситуация: с одной стороны могло произойти какое событие, что в один день было несколько одинаковых жалоб,
#с другой стороны человек мог оставить несколько жалоб в один день из-за ошибки программы
df.duplicated().sum()

1676798

In [7]:
# Разберемся, где у нас пропуска:
# столбцы Sub-product и Sub-issue не обязательные поля, многие люди скорее всего просто оставляют тему и переходят к детальному описанию, также и с продуктом
# столбец Consumer complaint narrative содержит пустые значения, что может быть вызвано случайной отправкой или многие люди просто не захотели расписывать проблему
df.isnull().sum()

Date received                         0
Product                               0
Sub-product                      235294
Issue                                 0
Sub-issue                        683355
Consumer complaint narrative    1987977
dtype: int64

In [8]:
df.drop(df.loc[df['Consumer complaint narrative'].isnull() == True].index, inplace=True)
df.isnull().sum()

Date received                        0
Product                              0
Sub-product                      52208
Issue                                0
Sub-issue                       194561
Consumer complaint narrative         0
dtype: int64

## Рассмотрим основную статистику по жалобам

In [9]:
# Основные жалобы поступают по продукту Credit reporting, credit repair services, or other personal consumer reports и Debt collection
df['Product'].value_counts(normalize=True).mul(100).round(1)

Credit reporting, credit repair services, or other personal consumer reports    46.3
Debt collection                                                                 17.2
Mortgage                                                                         8.8
Credit card or prepaid card                                                      7.3
Checking or savings account                                                      4.9
Student loan                                                                     2.9
Credit reporting                                                                 2.8
Money transfer, virtual currency, or money service                               2.4
Vehicle loan or lease                                                            1.8
Credit card                                                                      1.7
Bank account or service                                                          1.3
Payday loan, title loan, or personal loan                        

# Обучение модели

## Предобработка данных

In [10]:
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
stopword=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [11]:
# Очищение жалоб от лишних символов
def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text

In [12]:
df["Consumer complaint narrative"] = df["Consumer complaint narrative"].apply(clean)

## SGDClassifier

In [13]:
df = df[["Consumer complaint narrative", "Product"]]
x = np.array(df["Consumer complaint narrative"])
y = np.array(df["Product"])

cv = CountVectorizer()
X = cv.fit_transform(x)

#Разбиваем df на тренировочную и тестовую выборку
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33,
                                                    random_state=42)

In [14]:
sgdmodel = SGDClassifier()
sgdmodel.fit(X_train,y_train)

## Метрики модели

In [16]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

In [19]:
y_pred = sgdmodel.predict(X_test)

# Гармоническое среднее между точностью и полнотой
f1 = f1_score(y_test, y_pred, average='weighted')
# Доля правильно классифицированных образцов
accuracy = accuracy_score(y_test, y_pred)
# Доля истинно положительных предсказаний среди всех положительных предсказаний.
precision = precision_score(y_test, y_pred, average='weighted')
# Доля истинно положительных предсказаний, найденных моделью
recall = recall_score(y_test, y_pred, average='weighted')

print('F1 score:', f1)
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)

F1 score: 0.7513843514469873
Accuracy: 0.776232393408375
Precision: 0.7568360264493605
Recall: 0.776232393408375
