## Imports

In [1]:
import pandas as pd

## download and read the CSV file and make ID the index column

In [2]:
# Read the raw CSV; at this point ID is still an ordinary column.
csv_path = '../data/auto.csv'
df = pd.read_csv(csv_path)
# Preview the first five rows.
df.head(5)

Unnamed: 0,ID,CarNumber,Make_n_model,Refund,Fines,History
0,0,Y163O8161RUS,Ford Focus,2.0,3200.0,
1,1,E432XX77RUS,Toyota Camry,1.0,6500.0,
2,2,7184TT36RUS,Ford Focus,1.0,2100.0,
3,3,X582HE161RUS,Ford Focus,2.0,2000.0,
4,4,E34877152RUS,Ford Focus,2.0,6100.0,


In [3]:
# Promote the ID column to the row index (the column itself is removed).
df = df.set_index(keys='ID')

## count the number of observations using the method count()

In [4]:
# Non-null observations per column, before any cleaning.
df.count(axis='index')

CarNumber       931
Make_n_model    931
Refund          914
Fines           869
History          82
dtype: int64

## drop the duplicates, taking into account only the following columns: CarNumber, Make_n_model, Fines

In [5]:
# Rows count as duplicates when these three columns match; of each duplicate
# group only the last occurrence is kept.
dedup_keys = ['CarNumber', 'Make_n_model', 'Fines']
df = df.drop_duplicates(subset=dedup_keys, keep='last')
df.head(5)

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,History
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0,
1,E432XX77RUS,Toyota Camry,1.0,6500.0,
2,7184TT36RUS,Ford Focus,1.0,2100.0,
3,X582HE161RUS,Ford Focus,2.0,2000.0,
5,92918M178RUS,Ford Focus,1.0,5700.0,


In [6]:
# Non-null observations per column after de-duplication.
df.count(axis='index')

CarNumber       725
Make_n_model    725
Refund          713
Fines           665
History          65
dtype: int64

## work with missing values: drop columns with fewer than 500 non-null values, forward-fill Refund, fill Fines with the mean

In [7]:
# Number of missing values in each column.
df.isna().sum()

CarNumber         0
Make_n_model      0
Refund           12
Fines            60
History         660
dtype: int64

In [8]:
# Keep only the columns that have at least 500 non-missing values.
df = df.dropna(axis='columns', thresh=500)
# Re-check what is still missing.
df.isna().sum()

CarNumber        0
Make_n_model     0
Refund          12
Fines           60
dtype: int64

In [9]:
# Forward-fill Refund: each missing value takes the closest preceding
# non-missing value. Series.ffill() is used instead of
# fillna(method='ffill'), which is deprecated since pandas 2.1 and emits a
# FutureWarning.
df['Refund'] = df['Refund'].ffill()
df.isnull().sum()

CarNumber        0
Make_n_model     0
Refund           0
Fines           60
dtype: int64

In [10]:
# Replace missing Fines with the mean of the known Fines values.
mean_fine = df['Fines'].mean()
df['Fines'] = df['Fines'].fillna(mean_fine)
# Every column should now report zero missing values.
df.isna().sum()

CarNumber       0
Make_n_model    0
Refund          0
Fines           0
dtype: int64

## split and parse the make and model

In [14]:
# Distinct make/model strings; some entries hold a make only.
df.loc[:, 'Make_n_model'].unique()

array(['Ford Focus', 'Toyota Camry', 'Skoda Octavia', 'Volkswagen Passat',
       'Volkswagen Golf', 'Volkswagen', 'Volkswagen Jetta',
       'Volkswagen Touareg', 'Toyota Corolla', 'Audi', 'Ford Mondeo',
       'Volvo', 'BMW'], dtype=object)

In [15]:
def split_Make(M_n_m):
  """Return the make: the first whitespace-separated word of a make/model string.

  Parameters
  ----------
  M_n_m : str
      Combined value such as 'Ford Focus', or a bare make such as 'BMW'.

  Returns
  -------
  str or None
      The first word, or None when the string is empty or contains only
      whitespace (the same way split_Model reports a missing model).
  """
  words = M_n_m.split()
  # Guard the lookup: indexing [0] on an empty list would raise IndexError
  # for blank input.
  return words[0] if words else None

def split_Model(M_n_m):
  """Return the model: every word after the first of a make/model string.

  Parameters
  ----------
  M_n_m : str
      Combined value such as 'Ford Focus', or a bare make such as 'BMW'.

  Returns
  -------
  str or None
      The remaining words joined by single spaces, or None when the string
      has no second word.
  """
  words = M_n_m.split()
  # Zero or one word means there is no model part.
  if len(words) <= 1:
    return None
  return ' '.join(words[1:])

In [16]:
# Derive the Make and Model columns from the combined column, row by row.
make_n_model = df['Make_n_model']
df['Make'] = make_n_model.apply(split_Make)
df['Model'] = make_n_model.apply(split_Model)
df.head(5)

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,Make,Model
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0,Ford,Focus
1,E432XX77RUS,Toyota Camry,1.0,6500.0,Toyota,Camry
2,7184TT36RUS,Ford Focus,1.0,2100.0,Ford,Focus
3,X582HE161RUS,Ford Focus,2.0,2000.0,Ford,Focus
5,92918M178RUS,Ford Focus,1.0,5700.0,Ford,Focus


In [17]:
# The combined column is redundant once Make and Model exist.
df = df.drop(columns='Make_n_model')
df.head(5)

Unnamed: 0_level_0,CarNumber,Refund,Fines,Make,Model
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,2.0,3200.0,Ford,Focus
1,E432XX77RUS,1.0,6500.0,Toyota,Camry
2,7184TT36RUS,1.0,2100.0,Ford,Focus
3,X582HE161RUS,2.0,2000.0,Ford,Focus
5,92918M178RUS,1.0,5700.0,Ford,Focus


In [18]:
# Write the cleaned frame as a JSON array with one object per row; with
# orient='records' the ID index is not written to the file.
json_path = '../data/auto.json'
df.to_json(json_path, orient='records')