# Загрузка файла

In [1]:
from google.colab import files
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [2]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d venkatasubramanian/automatic-ticket-classification

Downloading automatic-ticket-classification.zip to /content
 78% 11.0M/14.1M [00:00<00:00, 56.5MB/s]
100% 14.1M/14.1M [00:00<00:00, 65.6MB/s]


In [3]:
!pip install dill

Collecting dill
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/115.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m61.4/115.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dill
Successfully installed dill-0.3.7


In [4]:
import pandas as pd
import numpy as np
import zipfile
import json
import re
import warnings

import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [5]:
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [6]:
with zipfile.ZipFile('/content/automatic-ticket-classification.zip') as myzip:
  with myzip.open('complaints-2021-05-14_08_16_.json') as myfile:
      data = json.load(myfile)
      df = pd.json_normalize(data)

In [7]:
df.head()

Unnamed: 0,_index,_type,_id,_score,_source.tags,_source.zip_code,_source.complaint_id,_source.issue,_source.date_received,_source.state,...,_source.company_response,_source.company,_source.submitted_via,_source.date_sent_to_company,_source.company_public_response,_source.sub_product,_source.timely,_source.complaint_what_happened,_source.sub_issue,_source.consumer_consent_provided
0,complaint-public-v2,complaint,3211475,0.0,,90301,3211475,Attempts to collect debt not owed,2019-04-13T12:00:00-05:00,CA,...,Closed with explanation,JPMORGAN CHASE & CO.,Web,2019-04-13T12:00:00-05:00,,Credit card debt,Yes,,Debt is not yours,Consent not provided
1,complaint-public-v2,complaint,3229299,0.0,Servicemember,319XX,3229299,Written notification about debt,2019-05-01T12:00:00-05:00,GA,...,Closed with explanation,JPMORGAN CHASE & CO.,Web,2019-05-01T12:00:00-05:00,,Credit card debt,Yes,Good morning my name is XXXX XXXX and I apprec...,Didn't receive enough information to verify debt,Consent provided
2,complaint-public-v2,complaint,3199379,0.0,,77069,3199379,"Other features, terms, or problems",2019-04-02T12:00:00-05:00,TX,...,Closed with explanation,JPMORGAN CHASE & CO.,Web,2019-04-02T12:00:00-05:00,,General-purpose credit card or charge card,Yes,I upgraded my XXXX XXXX card in XX/XX/2018 and...,Problem with rewards from credit card,Consent provided
3,complaint-public-v2,complaint,2673060,0.0,,48066,2673060,Trouble during payment process,2017-09-13T12:00:00-05:00,MI,...,Closed with explanation,JPMORGAN CHASE & CO.,Web,2017-09-14T12:00:00-05:00,,Conventional home mortgage,Yes,,,Consent not provided
4,complaint-public-v2,complaint,3203545,0.0,,10473,3203545,Fees or interest,2019-04-05T12:00:00-05:00,NY,...,Closed with explanation,JPMORGAN CHASE & CO.,Referral,2019-04-05T12:00:00-05:00,,General-purpose credit card or charge card,Yes,,Charged too much interest,


In [8]:
df['_source.complaint_what_happened'] = df['_source.complaint_what_happened'].replace({'':np.nan})
df = df[~df['_source.complaint_what_happened'].isnull()]
df.shape

(21072, 22)

# Подготовка пайплайна

In [9]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

class TextCleaner(BaseEstimator, TransformerMixin):
  def __init__(self):
    self.lemmatizer = WordNetLemmatizer()

  def fit(self, X, y=None):
    return self

  def clean_text(self, X):
    X = str(X)
    clean_string = ''
    clean_string = X.lower()
    clean_string = re.sub('[^\w\s]', '', clean_string) # удаление знаков препинания
    clean_string = re.sub('\w*\d\w', '', clean_string) # удаление слов с цифрами
    clean_string = re.sub('x+x', '', clean_string) # удаление анонимных персональных данных
    return clean_string

  def lemmatize_text(self, X):
    word_list = nltk.word_tokenize(X)
    word_list = [word for word in word_list if word not in stop_words] # удаление стоп-слов
    word_list = [self.lemmatizer.lemmatize(word) for word in word_list] # леммантизация
    pos_tags = pos_tag(word_list)
    lemmatized_output = ' '.join([pos_tag[0] for pos_tag in pos_tags if pos_tag[1] in ['NN', 'VB']]) # оставляем только имена существительные и глаголы
    return lemmatized_output

  def transform(self, X, y=None):
    X = X.apply(self.clean_text)
    X = X.apply(self.lemmatize_text)
    return X

In [10]:
text_for_topics = df['_source.complaint_what_happened'].copy()

In [11]:
preprocess_pipeline = Pipeline([('TextCleaner', TextCleaner())])

text_for_topics = preprocess_pipeline.fit_transform(text_for_topics)

In [12]:
text_for_topics

1        morning name appreciate help put stop chase ba...
2        card agent date change agent information order...
10       chase card application identity consent obtain...
11       book ticket offer ticket card information offe...
14       son give check deposit chase account fund chas...
                               ...                        
78303    chase card customer decade solicitation credit...
78309    wednesday chas visa credit card provider claim...
78310    pay risk consumer chase bank chase year trust ...
78311    flawless credit chase credit card chase freedo...
78312    year account jp bank order line credit pay acc...
Name: _source.complaint_what_happened, Length: 21072, dtype: object

# Моделирование тем

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(min_df=2, # игнорировать слова, которые встреачются менее чем в 2 документах
                        max_df=0.95, # игнорировать слова, которые встречаются более чем в 95% документов
                        stop_words='english')

# Document Term Matrix
dtm = tfidf.fit_transform(text_for_topics)

In [14]:
from sklearn.decomposition import NMF

nmf_model = NMF(n_components=5, random_state=17)
nmf_model.fit(dtm)
nmf_features = nmf_model.transform(dtm)

In [15]:
y = nmf_features.argmax(axis=1)
y

array([0, 1, 1, ..., 3, 4, 4])

In [16]:
text_for_topics = text_for_topics.rename('text')

# Обучение модели

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(text_for_topics,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=17)

print('Train data: ', X_train.shape)
print('Test data: ', X_test.shape)

Train data:  (15804,)
Test data:  (5268,)


In [18]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

lr_pipeline = make_pipeline(
    TfidfVectorizer(), # содержит в себе CountVectorizer() -> TfidfTransformer
    LogisticRegression(random_state=17, solver='newton-cg')
)

%time lr_pipeline.fit(X_train, y_train)

print(classification_report(y_test, lr_pipeline.predict(X_test)))

CPU times: user 2.15 s, sys: 995 ms, total: 3.14 s
Wall time: 2.54 s
              precision    recall  f1-score   support

           0       0.95      0.98      0.97      1325
           1       0.97      0.96      0.97      1271
           2       0.96      0.97      0.97       854
           3       0.95      0.95      0.95      1184
           4       0.97      0.92      0.95       634

    accuracy                           0.96      5268
   macro avg       0.96      0.96      0.96      5268
weighted avg       0.96      0.96      0.96      5268



# Сохранение файлов

In [19]:
X_test.to_csv("X_test.csv", index=None)
pd.DataFrame(y_test).to_csv("y_test.csv", index=None)

In [20]:
import dill
dill._dill._reverse_typemap['ClassType'] = type

with open("preprocess_pipeline.dill", "wb") as f:
    dill.dump(preprocess_pipeline, f)

with open("lr_pipeline.dill", "wb") as f:
    dill.dump(lr_pipeline, f)