In [2]:
import pandas as pd

## Read the CSV file

In [54]:
# Load the dataset, using the 'ID' column as the row index.
df = pd.read_csv('../../data/auto.csv', index_col='ID')

## Count the number of observations

In [55]:
# Number of non-missing values in each column.
observations = df.count(axis='index')
print(observations)

CarNumber       931
Make_n_model    931
Refund          914
Fines           869
History          82
dtype: int64


## Drop the duplicates

In [56]:
# Rows are duplicates when these three columns all match;
# only the last occurrence of each such group is kept.
duplicate_keys = ['CarNumber', 'Make_n_model', 'Fines']
df = df.drop_duplicates(subset=duplicate_keys, keep='last')

In [57]:
# Number of non-missing values in each column of the current frame.
observations = df.count(axis='index')
print(observations)

CarNumber       725
Make_n_model    725
Refund          713
Fines           665
History          65
dtype: int64


## Work with missing values

In [58]:
# Boolean mask of missing cells, summed per column to get the missing count.
missing_mask = df.isna()
missing_values = missing_mask.sum()
print(missing_values)

CarNumber         0
Make_n_model      0
Refund           12
Fines            60
History         660
dtype: int64


In [59]:
# A column survives only if it has at least `min_non_missing` non-missing
# values, i.e. columns with more than 500 missing values are dropped.
min_non_missing = len(df) - 500
df = df.dropna(axis='columns', thresh=min_non_missing)

print(df.isna().sum())

CarNumber        0
Make_n_model     0
Refund          12
Fines           60
dtype: int64


In [60]:
# Forward fill: each missing 'Refund' takes the value from the closest
# preceding row that has one.
refund_filled = df['Refund'].ffill()
df['Refund'] = refund_filled
print(df.isna().sum())

CarNumber        0
Make_n_model     0
Refund           0
Fines           60
dtype: int64


In [61]:
# Replace missing 'Fines' with the column mean (the mean skips missing values).
fines = df['Fines']
mean_fines = fines.mean()
df['Fines'] = fines.fillna(mean_fines)
print(df.isna().sum())


CarNumber       0
Make_n_model    0
Refund          0
Fines           0
dtype: int64


## Split and parse the Make_n_model

In [62]:
def split_make_model(row):
    """Split a row's combined ``Make_n_model`` value into make and model.

    The value is split on the first space only, so everything after the
    first word stays together as the model.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'Make_n_model'``, e.g. the row Series that
        ``DataFrame.apply(..., axis=1)`` passes in.

    Returns
    -------
    pd.Series
        Entries ``'Make'`` and ``'Model'``. ``'Model'`` is ``None`` when the
        value contains no space; both are ``None`` when the value is missing.
    """
    combined = row['Make_n_model']

    # A missing value (None/NaN) has no string methods; report both parts
    # as missing instead of failing with AttributeError.
    if pd.isna(combined):
        return pd.Series({'Make': None, 'Model': None})

    make, separator, model = combined.partition(' ')
    # An empty separator means there was no space, hence no model part.
    return pd.Series({'Make': make, 'Model': model if separator else None})

# Apply the splitter row by row; the result has 'Make' and 'Model' columns.
make_model_parts = df.apply(split_make_model, axis=1)
df[['Make', 'Model']] = make_model_parts


In [63]:
# Remove the combined 'Make_n_model' column from the frame in place.
df.drop(columns=['Make_n_model'], inplace=True)

In [64]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0_level_0,CarNumber,Refund,Fines,Make,Model
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,2.0,3200.0,Ford,Focus
1,E432XX77RUS,1.0,6500.0,Toyota,Camry
2,7184TT36RUS,1.0,2100.0,Ford,Focus
3,X582HE161RUS,2.0,2000.0,Ford,Focus
5,92918M178RUS,1.0,5700.0,Ford,Focus


In [66]:
# Discard the index, then write one JSON object per row with floats
# limited to 4 decimal places.
records = df.reset_index(drop=True)
records.to_json('auto.json', orient='records', double_precision=4)