In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns 
from sklearn.model_selection import train_test_split

In [2]:
# Read the complaints CSV into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer='klachten.csv')

In [3]:
# Target variable: the 'Product' column of the full dataset
y = df.loc[:, 'Product']

In [4]:
# Split into 70% train / 30% test, stratified on the target `y`;
# the fixed random_state makes the split reproducible.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
train_df, test_df = train_test_split(df, **split_options)

In [5]:
# Number of rows and columns in the training set.
# The former bare `train_df.shape` line was a no-op: Jupyter only echoes the
# last expression of a cell, and the print below came after it. The shape is
# now unpacked once and reported through the (unchanged) message.
n_rows, n_cols = train_df.shape
print(f"De dataset bevat {n_rows} rijen en {n_cols} kolommen.")

De dataset bevat 10420 rijen en 5 kolommen.


In [6]:
# Preview the first five rows of the training set
train_df.head(n=5)

Unnamed: 0,ID,Datum_ontvangst,Product,Omschrijving,Antwoord_bedrijf
8835,1580972,2023-09-25,Incasso,i have just received a copy of my credit repor...,Closed with explanation
6482,1691558,2023-12-10,Hypotheek,Paid {$420.00} for appraisal on home based upo...,Closed with explanation
12354,1776634,2024-02-07,Kredietregistratie,TransUnion is reporting a XXXX XXXX on my cred...,Closed with non-monetary relief
2315,1387917,2023-05-21,Consumentenkrediet,"To begin with, prior to entering into a vehicl...",Closed with non-monetary relief
6740,1522980,2023-08-17,Hypotheek,PNC Mortgage sent {$1500.00} to an insurance c...,Closed with explanation


In [7]:
# Inspect the column dtypes and non-null counts of the training set
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10420 entries, 8835 to 12191
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ID                10420 non-null  int64 
 1   Datum_ontvangst   10420 non-null  object
 2   Product           10420 non-null  object
 3   Omschrijving      10420 non-null  object
 4   Antwoord_bedrijf  10420 non-null  object
dtypes: int64(1), object(4)
memory usage: 488.4+ KB


In [8]:
# Summary statistics of the numeric columns in the training set.
# NOTE(review): the only numeric column shown is 'ID', which looks like an
# identifier, so these statistics are probably not meaningful — confirm.
train_df.describe()

Unnamed: 0,ID
count,10420.0
mean,1903638.0
std,343031.0
min,1290157.0
25%,1600479.0
50%,1922964.0
75%,2208303.0
max,2543247.0


In [9]:
# Distribution of complaints over the Product sectors in the training set
product_counts = train_df['Product'].value_counts()
product_counts

Product
Incasso               2865
Hypotheek             2320
Kredietregistratie    2211
Creditcard            1319
Bankrekening          1042
Consumentenkrediet     663
Name: count, dtype: int64

De sector met de meeste klachten in de trainingsset is Incasso (2865 van de 10420 klachten).

In [10]:
# Summary statistics of the complaint-description length (in characters).
# `assign` returns a new frame, so train_df itself is left untouched.
exploratie_df = train_df.assign(len=train_df['Omschrijving'].str.len())
length_summary = exploratie_df['len'].describe()
length_summary.round(2)

count    10420.00
mean      1016.16
std        910.31
min         12.00
25%        375.00
50%        709.00
75%       1354.00
max      10761.00
Name: len, dtype: float64

In [11]:
# Mean complaint-description length per Product sector
len_per_product = exploratie_df.groupby('Product')['len']
len_per_product.mean().round(2)

Product
Bankrekening          1205.74
Consumentenkrediet    1059.79
Creditcard            1130.99
Hypotheek             1426.86
Incasso                770.72
Kredietregistratie     732.31
Name: len, dtype: float64

In [12]:
# Count the missing values in the 'Omschrijving' column
exploratie_df['Omschrijving'].isnull().sum()

np.int64(0)

In [13]:
# Count the rows whose 'Omschrijving' repeats an earlier row's text
train_df.duplicated(subset='Omschrijving').sum()

np.int64(39)

In [14]:
# Work on a separate copy of the training set while exploring duplicates
explore_duplicates = train_df.copy(deep=True)

In [15]:
# Keep only the rows whose 'Omschrijving' text occurs at least twice
# (keep=False flags every member of a duplicate group, not just the repeats).
is_repeated = explore_duplicates['Omschrijving'].duplicated(keep=False)
explore_duplicates = explore_duplicates[is_repeated]

In [16]:
# Shape (rows, columns) of the frame that holds only the duplicated descriptions
explore_duplicates.shape

(67, 5)

In [17]:
# Preview the first five rows with a duplicated description
explore_duplicates.head(n=5)

Unnamed: 0,ID,Datum_ontvangst,Product,Omschrijving,Antwoord_bedrijf
9789,2256834,2024-12-21,Incasso,The debt they are attempting to collect is ove...,Closed with non-monetary relief
13050,1594298,2023-10-06,Kredietregistratie,I have been a victim of Identity Theft. I have...,Closed with non-monetary relief
14138,1621356,2023-10-23,Kredietregistratie,I have made over XXXX attempts to have the cre...,Closed with non-monetary relief
8077,2413909,2025-03-31,Incasso,I have not received any documentation validati...,Closed with explanation
12425,1497403,2023-07-31,Kredietregistratie,I am filing this complaint because I think wha...,Closed with explanation
