In [1]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [48]:
class CarData:
    """Load, clean and explore car-offer listings stored as .csv files.

    On construction every ``*.csv`` file in ``path`` is read and concatenated,
    exact duplicate rows are dropped, the ten columns are renamed positionally,
    rows lacking ``engine_type``, ``city`` or ``engine_cm3`` are removed, the
    textual ``price`` / ``mileage_km`` / ``engine_cm3`` / ``province`` columns
    are cleaned, an ``ID`` column is added and finally outliers are filtered out.

    Attributes set by ``__init__``:
        data: cleaned DataFrame with the outliers removed.
        duplicates: number of duplicate rows dropped.
        missing: number of rows dropped because of missing values.
        total_offers: number of rows left after cleaning, before outlier removal.
        price_outliers, mileage_outliers, year_outliers, engine_outliers:
            rows removed by each filter. The filters run in this order, so a
            count does not include rows already removed by an earlier filter.
        total_discard: rows failing at least one outlier condition.
    """

    # Counters stay at -1 until __init__ has computed them.
    missing = -1
    duplicates = -1

    price_outliers = -1
    mileage_outliers = -1
    year_outliers = -1
    engine_outliers = -1
    total_discard = -1
    total_offers = -1

    def __init__(self, path, price_outlier_mt = 200000, mileage_outlier_mt = 400000,
                 year_outlier_lt = 1995, engine_outlier_mt = 4000, engine_outlier_lt = 750):
        """Read every .csv file in `path`, clean the rows and drop outliers.

        Args:
            path: directory containing the .csv files; every file must provide
                the ten columns in the order they are renamed to below.
            price_outlier_mt: offers with a price above this are discarded.
            mileage_outlier_mt: offers with a mileage above this are discarded.
            year_outlier_lt: offers with a year below this are discarded.
            engine_outlier_mt: offers with engine_cm3 above this are discarded.
            engine_outlier_lt: offers with engine_cm3 below this are discarded.

        Raises:
            ValueError: if `path` contains no .csv files.
        """
        # define outlier thresholds
        self.price_outlier_mt = price_outlier_mt
        self.mileage_outlier_mt = mileage_outlier_mt
        self.year_outlier_lt = year_outlier_lt
        self.engine_outlier_mt = engine_outlier_mt
        self.engine_outlier_lt = engine_outlier_lt

        # define variable data types
        self.numeric_variables = ['price', 'mileage_km', 'engine_cm3', 'year']
        self.categorical_variables = ['engine_type', 'city', 'province']

        # read all .csv files from the directory; sorted so that the row order
        # (and therefore the ID column) does not depend on the listing order
        csv_files = sorted(glob.glob(os.path.join(path, "*.csv")))
        if not csv_files:
            raise ValueError("No .csv files found in " + repr(path))
        raw = pd.concat(map(pd.read_csv, csv_files), sort=False)

        # drop the duplicates and save their number - many duplicates due to data gathering method
        deduplicated = raw.drop_duplicates()
        self.duplicates = len(raw) - len(deduplicated)

        # rename columns (positional: the files must have exactly these ten columns)
        deduplicated.columns = ['title', 'price', 'sub_title', 'mileage_km', 'year', 'engine_cm3',
                                'engine_type', 'city', 'province', 'negotiable']

        # drop rows with missing values and save the number of rows dropped;
        # .copy() gives an independent frame so the column assignments below are safe
        frame = deduplicated.dropna(subset = ['engine_type', 'city', 'engine_cm3'],
                                    axis = 'index').copy()
        self.missing = len(deduplicated) - len(frame)

        # clean up the columns
        frame['price'] = self._strip_tokens(frame['price'], (',', ' ')).astype('int')
        frame['mileage_km'] = self._strip_tokens(frame['mileage_km'], ('km', ' ')).astype('float')
        frame['engine_cm3'] = self._strip_tokens(frame['engine_cm3'], ('cm3', ' ')).astype('int')
        frame['province'] = self._strip_tokens(frame['province'], ('(', ')'))

        # add ID column (numbered before outlier removal, so IDs of kept rows may have gaps)
        frame.insert(loc = 0, column = 'ID', value = range(1, len(frame) + 1))

        # remember the size before filtering: outliers() reports the discarded share of it
        self.total_offers = len(frame)

        # discard outliers and calculate the numbers
        within_limits = ((frame['price'] <= self.price_outlier_mt) &
                         (frame['mileage_km'] <= self.mileage_outlier_mt) &
                         (frame['year'] >= self.year_outlier_lt) &
                         (frame['engine_cm3'] <= self.engine_outlier_mt) &
                         (frame['engine_cm3'] >= self.engine_outlier_lt))
        self.total_discard = len(frame) - int(within_limits.sum())

        self.price_outliers = int((frame['price'] > self.price_outlier_mt).sum())
        frame = frame[frame['price'] <= self.price_outlier_mt]

        self.mileage_outliers = int((frame['mileage_km'] > self.mileage_outlier_mt).sum())
        frame = frame[frame['mileage_km'] <= self.mileage_outlier_mt]

        self.year_outliers = int((frame['year'] < self.year_outlier_lt).sum())
        frame = frame[frame['year'] >= self.year_outlier_lt]

        self.engine_outliers = int(((frame['engine_cm3'] > self.engine_outlier_mt) |
                                    (frame['engine_cm3'] < self.engine_outlier_lt)).sum())
        frame = frame[(frame['engine_cm3'] <= self.engine_outlier_mt) &
                      (frame['engine_cm3'] >= self.engine_outlier_lt)]

        self.data = frame

    @staticmethod
    def _strip_tokens(series, tokens):
        """Return `series` cast to strings with every substring in `tokens` removed."""
        cleaned = series.astype('str')
        for token in tokens:
            # regex=False: tokens such as '(' must be treated literally
            cleaned = cleaned.str.replace(token, '', regex=False)
        return cleaned

    def describe(self):
        """Return summary statistics of the numeric columns, rounded to 2 decimals."""
        return self.data[['price', 'mileage_km', 'year', 'engine_cm3']].describe().round(2)

    def outliers(self):
        """Print the outlier thresholds and how many offers each of them removed."""
        print('Offers with price greater than '+str(self.price_outlier_mt)+' have been discarded')
        print('The number of such offers = '+str(self.price_outliers))
        print('')
        print('Offers with mileage greater than '+str(self.mileage_outlier_mt)+' have been discarded')
        print('The number of such offers = '+str(self.mileage_outliers))
        print('')
        print('Offers with year lower than '+str(self.year_outlier_lt)+' have been discarded')
        print('The number of such offers = '+str(self.year_outliers))
        print('')
        print('Offers with engine_cm3 lower than '+str(self.engine_outlier_lt)+
              ' or greater than '+str(self.engine_outlier_mt)+' have been discarded')
        print('The number of such offers = '+str(self.engine_outliers))
        print('')
        # share of the offers that existed before filtering; guarded against an empty data set
        if self.total_offers > 0:
            share = round(self.total_discard / self.total_offers * 100, 2)
        else:
            share = 0.0
        print('Total number of discarded offers = '+str(self.total_discard)
              +'('+str(share)+'%)'
              +' - may be different to the sum of above due to overlap')

    def scatter_nox(self, var = 'all', figsize_1 = 7, figsize_2 = 5):
        """Scatter-plot a column against the row position (1..n).

        Args:
            var: column to plot, or 'all' for mileage_km, price, year and engine_cm3.
            figsize_1: figure width stored in matplotlib's rcParams.
            figsize_2: figure height stored in matplotlib's rcParams.
        """
        plt.rcParams["figure.figsize"] = (figsize_1, figsize_2)
        columns = ['mileage_km', 'price', 'year', 'engine_cm3'] if var == 'all' else [var]
        positions = range(1, len(self.data) + 1)
        for column in columns:
            plt.scatter(y = self.data[column], x = positions, s=1)
            plt.title(column)
            plt.show()

    def scatter(self, var = 'all'):
        """Scatter-plot every unordered pair of numeric variables.

        Only var == 'all' is implemented; any other value plots nothing.
        """
        if var != 'all':
            # single-variable mode - TO DO
            return
        names = self.numeric_variables
        # pair each variable only with the ones after it, so every pair is drawn once
        for position, y_name in enumerate(names):
            for x_name in names[position + 1:]:
                plt.scatter(y = self.data[y_name], x = self.data[x_name], s=1)
                plt.title("Correlation between "+y_name+' and '+x_name)
                plt.ylabel(y_name)
                plt.xlabel(x_name)
                plt.show()

    def hist(self, var = 'all', bins = 50):
        """Plot a histogram of one column, or of every numeric variable.

        Args:
            var: column to plot, or 'all' for every entry of numeric_variables.
            bins: number of histogram bins.
        """
        columns = self.numeric_variables if var == 'all' else [var]
        for column in columns:
            plt.hist(x = self.data[column], bins = bins)
            plt.title(column)
            plt.show()

    def price_cat_vars(self):
        """Print price summary statistics grouped by each categorical variable."""
        for column in self.categorical_variables:
            # use this instance's data rather than a notebook-level variable
            print(self.data.groupby(column)['price'].describe())
            #the variables need further preprocessing
            
        
                    

In [49]:
# Build the cleaned data set from every .csv file in data/, using the default outlier thresholds.
x = CarData('data/')

In [50]:
# Print price summary statistics for every value of engine_type, city and province.
x.price_cat_vars()

               count          mean           std     min      25%      50%  \
engine_type                                                                  
Benzyna      11256.0  32406.949449  31329.019473   250.0  11500.0  22300.0   
Benzyna+CNG      5.0  29337.800000  19275.032794  7700.0  14999.0  30990.0   
Benzyna+LPG   1535.0  16891.985668  14049.850613   599.0   6900.0  13500.0   
Diesel       11055.0  34354.766169  32117.427547   500.0  13900.0  24900.0   
Hybryda        222.0  73328.869369  36008.947331  8500.0  43925.0  70000.0   

                 75%       max  
engine_type                     
Benzyna      41900.0  199900.0  
Benzyna+CNG  36000.0   57000.0  
Benzyna+LPG  22900.0  124600.0  
Diesel       42900.0  200000.0  
Hybryda      95900.0  196677.0  
                      count          mean           std      min      25%  \
city                                                                        
Adamów                  1.0  19500.000000           NaN  19500.0  19

In [33]:
# Summary statistics of every numeric column, split by engine type.
x.data.groupby('engine_type').describe()

Unnamed: 0_level_0,ID,ID,ID,ID,ID,ID,ID,ID,price,price,...,year,year,engine_cm3,engine_cm3,engine_cm3,engine_cm3,engine_cm3,engine_cm3,engine_cm3,engine_cm3
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
engine_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Benzyna,11256.0,12508.294332,7302.033575,1.0,6131.75,12558.0,18793.25,25197.0,11256.0,32406.949449,...,2015.0,2020.0,11256.0,1634.298507,499.55986,796.0,1332.0,1595.0,1840.0,4000.0
Benzyna+CNG,5.0,14500.6,7782.821294,3364.0,11690.0,14801.0,18395.0,24253.0,5.0,29337.8,...,2015.0,2017.0,5.0,1354.8,171.083898,1149.0,1242.0,1390.0,1395.0,1598.0
Benzyna+LPG,1535.0,12502.054072,7329.445103,24.0,6407.0,12027.0,19109.5,25199.0,1535.0,16891.985668,...,2009.0,2020.0,1535.0,1847.354397,596.091964,796.0,1400.0,1781.0,1998.0,4000.0
Diesel,11055.0,12631.52664,7272.205695,2.0,6335.5,12605.0,18988.5,25198.0,11055.0,34354.766169,...,2014.0,2020.0,11055.0,1936.532519,392.03247,799.0,1600.0,1968.0,1998.0,4000.0
Hybryda,222.0,13855.013514,6948.461502,210.0,8282.0,15691.5,19207.25,25034.0,222.0,73328.869369,...,2019.0,2020.0,222.0,1987.067568,488.60519,1200.0,1798.0,1798.0,1999.75,3500.0


In [None]:
# Same call as the earlier cell: reads the .csv files in data/ again and rebuilds `x`.
x = CarData('data/')

In [None]:
# Print the outlier thresholds and the number of offers each of them removed.
x.outliers()

In [20]:
# (rows dropped for missing values, rows dropped as duplicates)
x.missing, x.duplicates

(737, 8782)

In [None]:
# Summary statistics of price, mileage_km, year and engine_cm3, rounded to 2 decimals.
x.describe()

In [None]:
# Scatter each of mileage_km, price, year and engine_cm3 against the row position.
x.scatter_nox()

In [None]:
# Histogram of every numeric column; each figure is titled with the column it shows.
for column in ['mileage_km', 'price', 'year', 'engine_cm3']:
    plt.hist(x.data[column], bins = 100)
    plt.title(column)
    plt.show()

In [None]:
# Offers priced above 1,100,000. CarData keeps only price <= price_outlier_mt
# (200000 by default), so with the default thresholds this selection is empty.
x.data[x.data['price']>1100000]

In [None]:
# Offers with more than 1,500,000 km. CarData keeps only mileage_km <= mileage_outlier_mt
# (400000 by default), so with the default thresholds this selection is empty.
x.data[x.data['mileage_km']>1500000]

In [None]:
# Offers from before 1990. CarData keeps only year >= year_outlier_lt
# (1995 by default), so with the default thresholds this selection is empty.
x.data[x.data['year']<1990]

In [None]:
# Summary statistics of the four numeric columns, rounded to 2 decimals.
x.data[['price', 'mileage_km', 'year', 'engine_cm3']].describe().round(2)

In [None]:
# Number of missing values left in each column of the cleaned data.
x.data.isnull().sum()

In [None]:
# Read the raw export from data/, the directory every other cell loads this file from.
data = pd.read_csv('data/run_results_21-04-2020.csv')

In [None]:
# NOTE(review): ReadAndPreprocess is not defined in any visible cell, and `df` is only
# assigned in a later cell, so unless both come from elsewhere this cell fails on
# Restart & Run All - confirm whether it should be removed or replaced by CarData.
data = ReadAndPreprocess(df)

In [None]:
# Preview the first rows of the raw file run_results_21-04-2020.csv.
df = pd.read_csv('data/run_results_21-04-2020.csv')
df.head()

In [None]:
# Preview the first rows of the raw file run_results_25-04_2020.csv.
# NOTE(review): this name has '04_2020' where the other two files use '04-2020' -
# confirm it matches the file on disk.
df2 = pd.read_csv('data/run_results_25-04_2020.csv')
df2.head()

In [None]:
# Preview the first rows of the raw file run_results_26-04-2020.csv.
df3 = pd.read_csv('data/run_results_26-04-2020.csv')
df3.head()

In [None]:
# First rows of the cleaned data (CarData has no df_head method; the frame lives in x.data).
x.data.head()