In [2]:
import pandas as pd
import numpy as np 
import seaborn as sns 
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('klachten.csv')

In [4]:
y = df['Product']

In [5]:
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [6]:
train_df.shape

(11909, 5)

In [7]:
train_df.head(2).T

Unnamed: 0,6607,8800
ID,2018100,2291479
Datum_ontvangst,2024-07-18,2025-01-17
Product,Hypotheek,Incasso
Omschrijving,Purchased the house in 1988. I retired and cou...,Agency rep called my place of business and pre...
Antwoord_bedrijf,Closed with explanation,Closed with explanation


In [8]:
train_df['Product'].value_counts()

Product
Incasso               3275
Hypotheek             2651
Kredietregistratie    2527
Creditcard            1507
Bankrekening          1191
Consumentenkrediet     758
Name: count, dtype: int64

De grootste sector met klachten in deze train dataset is Incasso 

In [9]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11909 entries, 6607 to 10463
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ID                11909 non-null  int64 
 1   Datum_ontvangst   11909 non-null  object
 2   Product           11909 non-null  object
 3   Omschrijving      11909 non-null  object
 4   Antwoord_bedrijf  11909 non-null  object
dtypes: int64(1), object(4)
memory usage: 558.2+ KB


In [10]:
train_df['len'] = train_df['Omschrijving'].str.len()
train_df['len'].describe().round(2)

count    11909.00
mean      1019.96
std        910.56
min         12.00
25%        376.00
50%        715.00
75%       1361.00
max      10761.00
Name: len, dtype: float64

In [11]:
train_df.groupby('Product')['len'].mean().round(2)

Product
Bankrekening          1203.15
Consumentenkrediet    1071.34
Creditcard            1129.67
Hypotheek             1431.88
Incasso                776.10
Kredietregistratie     736.70
Name: len, dtype: float64

In [12]:
train_df['Omschrijving'].isna().sum()

np.int64(0)

In [13]:
pd.DataFrame(df.Product.unique()).values

array([['Bankrekening'],
       ['Consumentenkrediet'],
       ['Creditcard'],
       ['Hypotheek'],
       ['Incasso'],
       ['Kredietregistratie']], dtype=object)

In [14]:
explore_duplicates = train_df.copy()

In [15]:
explore_duplicates['Omschrijving'].duplicated().sum()

np.int64(46)

In [16]:
# houdt alleen de rijen waarvan 'Omschrijving' minstens twee keer voorkomt
explore_duplicates = explore_duplicates.loc[
    explore_duplicates['Omschrijving'].duplicated(keep=False)
]

In [17]:
explore_duplicates.shape

(79, 6)